AArch64ISelLowering.cpp
1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64ISelLowering.h"
15 #include "AArch64ExpandImm.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/SmallVector.h"
28 #include "llvm/ADT/Statistic.h"
29 #include "llvm/ADT/StringRef.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
35 #include "llvm/CodeGen/Analysis.h"
51 #include "llvm/IR/Attributes.h"
52 #include "llvm/IR/Constants.h"
53 #include "llvm/IR/DataLayout.h"
54 #include "llvm/IR/DebugLoc.h"
55 #include "llvm/IR/DerivedTypes.h"
56 #include "llvm/IR/Function.h"
58 #include "llvm/IR/GlobalValue.h"
59 #include "llvm/IR/IRBuilder.h"
60 #include "llvm/IR/Instruction.h"
61 #include "llvm/IR/Instructions.h"
62 #include "llvm/IR/IntrinsicInst.h"
63 #include "llvm/IR/Intrinsics.h"
64 #include "llvm/IR/IntrinsicsAArch64.h"
65 #include "llvm/IR/Module.h"
66 #include "llvm/IR/OperandTraits.h"
67 #include "llvm/IR/PatternMatch.h"
68 #include "llvm/IR/Type.h"
69 #include "llvm/IR/Use.h"
70 #include "llvm/IR/Value.h"
71 #include "llvm/MC/MCRegisterInfo.h"
72 #include "llvm/Support/Casting.h"
73 #include "llvm/Support/CodeGen.h"
75 #include "llvm/Support/Compiler.h"
76 #include "llvm/Support/Debug.h"
78 #include "llvm/Support/KnownBits.h"
84 #include <algorithm>
85 #include <bitset>
86 #include <cassert>
87 #include <cctype>
88 #include <cstdint>
89 #include <cstdlib>
90 #include <iterator>
91 #include <limits>
92 #include <tuple>
93 #include <utility>
94 #include <vector>
95 
96 using namespace llvm;
97 using namespace llvm::PatternMatch;
98 
99 #define DEBUG_TYPE "aarch64-lower"
100 
101 STATISTIC(NumTailCalls, "Number of tail calls");
102 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
103 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
104 
105 // FIXME: The necessary dtprel relocations don't seem to be supported
106 // well in the GNU bfd and gold linkers at the moment. Therefore, by
107 // default, for now, fall back to GeneralDynamic code generation.
108 static cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
109  "aarch64-elf-ldtls-generation", cl::Hidden,
110  cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
111  cl::init(false));
112 
113 static cl::opt<bool>
114 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
115  cl::desc("Enable AArch64 logical imm instruction "
116  "optimization"),
117  cl::init(true));
118 
119 // Temporary option added for the purpose of testing functionality added
120 // to DAGCombiner.cpp in D92230. It is expected that this can be removed
121 // in future when both implementations will be based off MGATHER rather
122 // than the GLD1 nodes added for the SVE gather load intrinsics.
123 static cl::opt<bool>
124 EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
125  cl::desc("Combine extends of AArch64 masked "
126  "gather intrinsics"),
127  cl::init(true));
128 
129 /// Value type used for condition codes.
130 static const MVT MVT_CC = MVT::i32;
131 
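// For example, getPackedSVEVectorVT(MVT::i32) returns MVT::nxv4i32 and
// getPackedSVEVectorVT(MVT::f16) returns MVT::nxv8f16, i.e. the scalable
// vector type that exactly fills a 128-bit SVE register granule.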
132 static inline EVT getPackedSVEVectorVT(EVT VT) {
133  switch (VT.getSimpleVT().SimpleTy) {
134  default:
135  llvm_unreachable("unexpected element type for vector");
136  case MVT::i8:
137  return MVT::nxv16i8;
138  case MVT::i16:
139  return MVT::nxv8i16;
140  case MVT::i32:
141  return MVT::nxv4i32;
142  case MVT::i64:
143  return MVT::nxv2i64;
144  case MVT::f16:
145  return MVT::nxv8f16;
146  case MVT::f32:
147  return MVT::nxv4f32;
148  case MVT::f64:
149  return MVT::nxv2f64;
150  case MVT::bf16:
151  return MVT::nxv8bf16;
152  }
153 }
154 
155 // NOTE: Currently there's only a need to return integer vector types. If this
156 // changes then just add an extra "type" parameter.
157 static inline EVT getPackedSVEVectorVT(ElementCount EC) {
158  switch (EC.getKnownMinValue()) {
159  default:
160  llvm_unreachable("unexpected element count for vector");
161  case 16:
162  return MVT::nxv16i8;
163  case 8:
164  return MVT::nxv8i16;
165  case 4:
166  return MVT::nxv4i32;
167  case 2:
168  return MVT::nxv2i64;
169  }
170 }
171 
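// For example, getPromotedVTForPredicate(MVT::nxv4i1) returns MVT::nxv4i32:
// the element count is preserved and each i1 lane is widened to the packed
// integer element of that width.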
172 static inline EVT getPromotedVTForPredicate(EVT VT) {
173  assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
174  "Expected scalable predicate vector type!");
175  switch (VT.getVectorMinNumElements()) {
176  default:
177  llvm_unreachable("unexpected element count for vector");
178  case 2:
179  return MVT::nxv2i64;
180  case 4:
181  return MVT::nxv4i32;
182  case 8:
183  return MVT::nxv8i16;
184  case 16:
185  return MVT::nxv16i8;
186  }
187 }
188 
189 /// Returns true if VT's elements occupy the lowest bit positions of its
190 /// associated register class without any intervening space.
191 ///
192 /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
193 /// same register class, but only nxv8f16 can be treated as a packed vector.
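/// For example, an nxv8f16 value fills the minimum 128-bit SVE register
/// granule (8 x 16 bits), whereas nxv2f16 and nxv4f16 leave gaps between
/// elements and are therefore unpacked.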
194 static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
195  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
196  "Expected legal vector type!");
197  return VT.isFixedLengthVector() ||
198  VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
199 }
200 
201 // Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
202 // predicate and end with a passthru value matching the result type.
203 static bool isMergePassthruOpcode(unsigned Opc) {
204  switch (Opc) {
205  default:
206  return false;
235  return true;
236  }
237 }
238 
239 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
240  const AArch64Subtarget &STI)
241  : TargetLowering(TM), Subtarget(&STI) {
242  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
243  // we have to make something up. Arbitrarily, choose ZeroOrOne.
244  setBooleanContents(ZeroOrOneBooleanContent);
245  // When comparing vectors the result sets the different elements in the
246  // vector to all-one or all-zero.
247  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
248 
249  // Set up the register classes.
250  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
251  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
252 
253  if (Subtarget->hasLS64()) {
254  addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
255  setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
256  setOperationAction(ISD::STORE, MVT::i64x8, Custom);
257  }
258 
259  if (Subtarget->hasFPARMv8()) {
260  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
261  addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
262  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
263  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
264  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
265  }
266 
267  if (Subtarget->hasNEON()) {
268  addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
269  addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
270  // Someone set us up the NEON.
271  addDRTypeForNEON(MVT::v2f32);
272  addDRTypeForNEON(MVT::v8i8);
273  addDRTypeForNEON(MVT::v4i16);
274  addDRTypeForNEON(MVT::v2i32);
275  addDRTypeForNEON(MVT::v1i64);
276  addDRTypeForNEON(MVT::v1f64);
277  addDRTypeForNEON(MVT::v4f16);
278  if (Subtarget->hasBF16())
279  addDRTypeForNEON(MVT::v4bf16);
280 
281  addQRTypeForNEON(MVT::v4f32);
282  addQRTypeForNEON(MVT::v2f64);
283  addQRTypeForNEON(MVT::v16i8);
284  addQRTypeForNEON(MVT::v8i16);
285  addQRTypeForNEON(MVT::v4i32);
286  addQRTypeForNEON(MVT::v2i64);
287  addQRTypeForNEON(MVT::v8f16);
288  if (Subtarget->hasBF16())
289  addQRTypeForNEON(MVT::v8bf16);
290  }
291 
292  if (Subtarget->hasSVE() || Subtarget->hasSME()) {
293  // Add legal sve predicate types
294  addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
295  addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
296  addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
297  addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
298 
299  // Add legal sve data types
300  addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
301  addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
302  addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
303  addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
304 
305  addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
306  addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
307  addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
308  addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
309  addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
310  addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
311 
312  if (Subtarget->hasBF16()) {
313  addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
314  addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
315  addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
316  }
317 
318  if (Subtarget->useSVEForFixedLengthVectors()) {
320  if (useSVEForFixedLengthVectorVT(VT))
321  addRegisterClass(VT, &AArch64::ZPRRegClass);
322 
324  if (useSVEForFixedLengthVectorVT(VT))
325  addRegisterClass(VT, &AArch64::ZPRRegClass);
326  }
327  }
328 
329  // Compute derived properties from the register classes
331 
332  // Provide all sorts of operation actions
366 
370 
374 
376 
377  // Custom lowering hooks are needed for XOR
378  // to fold it into CSINC/CSINV.
381 
382  // Virtually no operation on f128 is legal, but LLVM can't expand them when
383  // there's a valid register class, so we need custom operations in most cases.
407  // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
408  // aren't handled.
409 
410  // Lowering for many of the conversions is actually specified by the non-f128
411  // type. The LowerXXX function will be trivial when f128 isn't involved.
442 
447 
448  // Variable arguments.
453 
454  // Variable-sized objects.
457 
458  if (Subtarget->isTargetWindows())
460  else
462 
463  // Constant pool entries
465 
466  // BlockAddress
468 
469  // AArch64 lacks both left-rotate and popcount instructions.
472  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
475  }
476 
477  // AArch64 doesn't have i32 MULH{S|U}.
480 
481  // AArch64 doesn't have {U|S}MUL_LOHI.
484 
488 
491 
494  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
497  }
504 
505  // Custom lower Add/Sub/Mul with overflow.
518 
527 
536  if (Subtarget->hasFullFP16())
538  else
540 
541  for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
551  }
552 
553  if (!Subtarget->hasFullFP16()) {
554  for (auto Op :
570 
571  // Round-to-integer need custom lowering for fp16, as Promote doesn't work
572  // because the result type is integer.
576 
577  // promote v4f16 to v4f32 when that is known to be safe.
586 
603 
625  }
626 
627  // AArch64 has implementations of a lot of rounding-like FP operations.
628  for (auto Op :
639  for (MVT Ty : {MVT::f32, MVT::f64})
641  if (Subtarget->hasFullFP16())
643  }
644 
645  // Basic strict FP operations are legal
648  for (MVT Ty : {MVT::f32, MVT::f64})
650  if (Subtarget->hasFullFP16())
652  }
653 
654  // Strict conversion to a larger type is legal
655  for (auto VT : {MVT::f32, MVT::f64})
657 
659 
662 
668 
669  // Generate outline atomics library calls only if LSE was not specified for
670  // subtarget
671  if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
697 #define LCALLNAMES(A, B, N) \
698  setLibcallName(A##N##_RELAX, #B #N "_relax"); \
699  setLibcallName(A##N##_ACQ, #B #N "_acq"); \
700  setLibcallName(A##N##_REL, #B #N "_rel"); \
701  setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
702 #define LCALLNAME4(A, B) \
703  LCALLNAMES(A, B, 1) \
704  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
705 #define LCALLNAME5(A, B) \
706  LCALLNAMES(A, B, 1) \
707  LCALLNAMES(A, B, 2) \
708  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
709  LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
710  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
711  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
712  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
713  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
714  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
715 #undef LCALLNAMES
716 #undef LCALLNAME4
717 #undef LCALLNAME5
718  }
719 
720  // 128-bit loads and stores can be done without expanding
723 
724  // Aligned 128-bit loads and stores are single-copy atomic according to the
725  // v8.4a spec.
726  if (Subtarget->hasLSE2()) {
729  }
730 
731  // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
732  // custom lowering, as there are no un-paired non-temporal stores and
733  // legalization will break up 256 bit inputs.
741 
742  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
743  // This requires the Performance Monitors extension.
744  if (Subtarget->hasPerfMon())
746 
747  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
748  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
749  // Issue __sincos_stret if available.
752  } else {
755  }
756 
757  if (Subtarget->getTargetTriple().isOSMSVCRT()) {
758  // MSVCRT doesn't have powi; fall back to pow
759  setLibcallName(RTLIB::POWI_F32, nullptr);
760  setLibcallName(RTLIB::POWI_F64, nullptr);
761  }
762 
763  // Make floating-point constants legal for the large code model, so they don't
764  // become loads from the constant pool.
765  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
768  }
769 
770  // AArch64 does not have floating-point extending loads, i1 sign-extending
771  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
772  for (MVT VT : MVT::fp_valuetypes()) {
777  }
778  for (MVT VT : MVT::integer_valuetypes())
780 
788 
792 
793  // Indexed loads and stores are supported.
794  for (unsigned im = (unsigned)ISD::PRE_INC;
795  im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
812  }
813 
814  // Trap.
818 
819  // We combine OR nodes for bitfield operations.
821  // Try to create BICs for vector ANDs.
823 
824  // Vector add and sub nodes may conceal a high-half opportunity.
825  // Also, try to fold ADD into CSINC/CSINV..
827  ISD::UINT_TO_FP});
828 
831 
832  // Try and combine setcc with csel
834 
836 
841  if (Subtarget->supportsAddressTopByteIgnored())
843 
845 
847 
851 
853 
855 
857 
858  // In case of strict alignment, avoid an excessive number of byte wide stores.
861  Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
862 
866  Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
867 
870 
873  Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
874 
876 
878 
879  EnableExtLdPromotion = true;
880 
881  // Set required alignment.
883  // Set preferred alignments.
887 
888  // Only change the limit for entries in a jump table if specified by
889  // the sub target, but not at the command line.
890  unsigned MaxJT = STI.getMaximumJumpTableSize();
891  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
893 
894  setHasExtractBitsInsn(true);
895 
897 
898  if (Subtarget->hasNEON()) {
899  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
900  // silliness like this:
901  for (auto Op :
917 
918  for (auto Op :
924 
925  // AArch64 doesn't have a direct vector ->f32 conversion instructions for
926  // elements smaller than i32, so promote the input to i32 first.
929 
930  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
931  // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
932  // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
935  for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
937 
938  if (Subtarget->hasFullFP16()) {
947  } else {
948  // when AArch64 doesn't have fullfp16 support, promote the input
949  // to i32 first.
958  }
959 
968  for (auto VT : {MVT::v1i64, MVT::v2i64}) {
973  }
974 
975  // AArch64 doesn't have MUL.2d:
977  // Custom handling for some quad-vector types to detect MULL.
981 
982  // Saturates
983  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
989  }
990 
992  MVT::v4i32}) {
999  }
1000 
1001  // Vector reductions
1002  for (MVT VT : { MVT::v4f16, MVT::v2f32,
1004  if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1007 
1009  }
1010  }
1011  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1018  }
1020 
1023  // Likewise, narrowing and extending vector loads/stores aren't handled
1024  // directly.
1025  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1027 
1028  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1031  } else {
1034  }
1037 
1040 
1041  for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1042  setTruncStoreAction(VT, InnerVT, Expand);
1043  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1044  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1045  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1046  }
1047  }
1048 
1049  // AArch64 has implementations of a lot of rounding-like FP operations.
1050  for (auto Op :
1055  for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1056  setOperationAction(Op, Ty, Legal);
1057  if (Subtarget->hasFullFP16())
1058  for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1059  setOperationAction(Op, Ty, Legal);
1060  }
1061 
1063 
1070 
1071  // ADDP custom lowering
1072  for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1074  // FADDP custom lowering
1075  for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1077  }
1078 
1079  if (Subtarget->hasSME()) {
1081  }
1082 
1083  if (Subtarget->hasSVE()) {
1084  for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1125 
1131 
1140  }
1141 
1142  // Illegal unpacked integer vector types.
1143  for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1146  }
1147 
1148  // Legalize unpacked bitcasts to REINTERPRET_CAST.
1152 
1153  for (auto VT :
1157 
1158  for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1167 
1172 
1173  // There are no legal MVT::nxv16f## based types.
1174  if (VT != MVT::nxv16i1) {
1177  }
1178  }
1179 
1180  // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1188  }
1189 
1190  // Firstly, exclude all scalable vector extending loads/truncating stores,
1191  // include both integer and floating scalable vector.
1192  for (MVT VT : MVT::scalable_vector_valuetypes()) {
1193  for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1194  setTruncStoreAction(VT, InnerVT, Expand);
1195  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1196  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1197  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1198  }
1199  }
1200 
1201  // Then, selectively enable those which we directly support.
1208  for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1215  }
1216 
1217  // SVE supports truncating stores of 64 and 128-bit vectors
1223 
1260 
1273 
1285  }
1286 
1287  for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1294  }
1295 
1298 
1299  // NEON doesn't support integer divides, but SVE does
1300  for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1304  }
1305 
1306  // NEON doesn't support 64-bit vector integer muls, but SVE does.
1309 
1310  // NOTE: Currently this has to happen after computeRegisterProperties rather
1311  // than the preferred option of combining it with the addRegisterClass call.
1312  if (Subtarget->useSVEForFixedLengthVectors()) {
1314  if (useSVEForFixedLengthVectorVT(VT))
1315  addTypeForFixedLengthSVE(VT);
1317  if (useSVEForFixedLengthVectorVT(VT))
1318  addTypeForFixedLengthSVE(VT);
1319 
1320  // 64bit results can mean a bigger than NEON input.
1321  for (auto VT : {MVT::v8i8, MVT::v4i16})
1324 
1325  // 128bit results imply a bigger than NEON input.
1326  for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1328  for (auto VT : {MVT::v8f16, MVT::v4f32})
1330 
1331  // These operations are not supported on NEON but SVE can do them.
1352 
1353  // Int operations with no NEON support.
1354  for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1361  }
1362 
1363  // FP operations with no NEON support.
1364  for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32,
1367 
1368  // Use SVE for vectors with more than 2 elements.
1369  for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1371  }
1372 
1377 
1379  }
1380 
1381  if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1382  // Only required for llvm.aarch64.mops.memset.tag
1384  }
1385 
1386  PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1387 
1388  IsStrictFPEnabled = true;
1389 }
1390 
1391 void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1392  assert(VT.isVector() && "VT should be a vector type");
1393 
1394  if (VT.isFloatingPoint()) {
1396  setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1397  setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1398  }
1399 
1400  // Mark vector float intrinsics as expand.
1401  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1410  }
1411 
1412  // But we do support custom-lowering for FCOPYSIGN.
1413  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1414  ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
1416 
1428 
1432  for (MVT InnerVT : MVT::all_valuetypes())
1433  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1434 
1435  // CNT supports only B element sizes, then use UADDLP to widen.
1436  if (VT != MVT::v8i8 && VT != MVT::v16i8)
1438 
1444 
1445  for (unsigned Opcode :
1448  setOperationAction(Opcode, VT, Custom);
1449 
1450  if (!VT.isFloatingPoint())
1452 
1453  // [SU][MIN|MAX] are available for all NEON types apart from i64.
1454  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1455  for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1456  setOperationAction(Opcode, VT, Legal);
1457 
1458  // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1459  // NEON types.
1460  if (VT.isFloatingPoint() &&
1461  VT.getVectorElementType() != MVT::bf16 &&
1462  (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1463  for (unsigned Opcode :
1469  setOperationAction(Opcode, VT, Legal);
1470 
1471  // Strict fp extend and trunc are legal
1472  if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1474  if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1476 
1477  // FIXME: We could potentially make use of the vector comparison instructions
1478  // for STRICT_FSETCC and STRICT_FSETCCS, but there's a number of
1479  // complications:
1480  // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1481  // so we would need to expand when the condition code doesn't match the
1482  // kind of comparison.
1483  // * Some kinds of comparison require more than one FCMXY instruction so
1484  // would need to be expanded instead.
1485  // * The lowering of the non-strict versions involves target-specific ISD
1486  // nodes so we would likely need to add strict versions of all of them and
1487  // handle them appropriately.
1490 
1491  if (Subtarget->isLittleEndian()) {
1492  for (unsigned im = (unsigned)ISD::PRE_INC;
1493  im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1496  }
1497  }
1498 }
1499 
1500 bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
1501  EVT OpVT) const {
1502  // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1503  if (!Subtarget->hasSVE())
1504  return true;
1505 
1506  // We can only support legal predicate result types. We can use the SVE
1507  // whilelo instruction for generating fixed-width predicates too.
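 // For example, @llvm.get.active.lane.mask.nxv4i1.i64 is left for instruction
 // selection (it maps directly to WHILELO), whereas a call with an i16 index
 // type or a predicate result type not listed below is expanded generically.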
1508  if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1509  ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1510  ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1511  return true;
1512 
1513  // The whilelo instruction only works with i32 or i64 scalar inputs.
1514  if (OpVT != MVT::i32 && OpVT != MVT::i64)
1515  return true;
1516 
1517  return false;
1518 }
1519 
1520 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1521  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1522 
1523  // By default everything must be expanded.
1524  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1526 
1527  // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
1529 
1530  if (VT.isFloatingPoint()) {
1542  }
1543 
1544  // Mark integer truncating stores/extending loads as having custom lowering
1545  if (VT.isInteger()) {
1546  MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1547  while (InnerVT != VT) {
1548  setTruncStoreAction(VT, InnerVT, Custom);
1549  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1550  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
1551  InnerVT = InnerVT.changeVectorElementType(
1552  MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1553  }
1554  }
1555 
1556  // Mark floating-point truncating stores/extending loads as having custom
1557  // lowering
1558  if (VT.isFloatingPoint()) {
1559  MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1560  while (InnerVT != VT) {
1561  setTruncStoreAction(VT, InnerVT, Custom);
1562  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
1563  InnerVT = InnerVT.changeVectorElementType(
1565  }
1566  }
1567 
1568  // Lower fixed length vector operations to scalable equivalents.
1650 }
1651 
1652 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1653  addRegisterClass(VT, &AArch64::FPR64RegClass);
1654  addTypeForNEON(VT);
1655 }
1656 
1657 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1658  addRegisterClass(VT, &AArch64::FPR128RegClass);
1659  addTypeForNEON(VT);
1660 }
1661 
1662 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
1663  LLVMContext &C, EVT VT) const {
1664  if (!VT.isVector())
1665  return MVT::i32;
1666  if (VT.isScalableVector())
1667  return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
1668  return VT.changeVectorElementTypeToInteger();
1669 }
1670 
1671 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
1672  const APInt &Demanded,
1673  TargetLowering::TargetLoweringOpt &TLO,
1674  unsigned NewOpc) {
1675  uint64_t OldImm = Imm, NewImm, Enc;
1676  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
1677 
1678  // Return if the immediate is already all zeros, all ones, a bimm32 or a
1679  // bimm64.
1680  if (Imm == 0 || Imm == Mask ||
1681  AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
1682  return false;
1683 
1684  unsigned EltSize = Size;
1685  uint64_t DemandedBits = Demanded.getZExtValue();
1686 
1687  // Clear bits that are not demanded.
1688  Imm &= DemandedBits;
1689 
1690  while (true) {
1691  // The goal here is to set the non-demanded bits in a way that minimizes
1692  // the number of switching between 0 and 1. In order to achieve this goal,
1693  // we set the non-demanded bits to the value of the preceding demanded bits.
1694  // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
1695  // non-demanded bit), we copy bit0 (1) to the least significant 'x',
1696  // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
1697  // The final result is 0b11000011.
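 // Working through that example with an 8-bit element: DemandedBits is
 // 0b01100101 and the masked Imm is 0b01000001, so NonDemandedBits is
 // 0b10011010, InvertedImm is 0b00100100, RotatedImm is 0b00001000, Sum is
 // 0b10100010, Carry is 0 and Ones is 0b10000010, giving NewImm == 0b11000011.
 // Its complement under the element mask, 0b00111100, is a shifted mask, so
 // the search below stops at this element size.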
1698  uint64_t NonDemandedBits = ~DemandedBits;
1699  uint64_t InvertedImm = ~Imm & DemandedBits;
1700  uint64_t RotatedImm =
1701  ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
1702  NonDemandedBits;
1703  uint64_t Sum = RotatedImm + NonDemandedBits;
1704  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
1705  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
1706  NewImm = (Imm | Ones) & Mask;
1707 
1708  // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
1709  // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
1710  // we halve the element size and continue the search.
1711  if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
1712  break;
1713 
1714  // We cannot shrink the element size any further if it is 2-bits.
1715  if (EltSize == 2)
1716  return false;
1717 
1718  EltSize /= 2;
1719  Mask >>= EltSize;
1720  uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
1721 
1722  // Return if there is mismatch in any of the demanded bits of Imm and Hi.
1723  if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
1724  return false;
1725 
1726  // Merge the upper and lower halves of Imm and DemandedBits.
1727  Imm |= Hi;
1728  DemandedBits |= DemandedBitsHi;
1729  }
1730 
1731  ++NumOptimizedImms;
1732 
1733  // Replicate the element across the register width.
1734  while (EltSize < Size) {
1735  NewImm |= NewImm << EltSize;
1736  EltSize *= 2;
1737  }
1738 
1739  (void)OldImm;
1740  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
1741  "demanded bits should never be altered");
1742  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
1743 
1744  // Create the new constant immediate node.
1745  EVT VT = Op.getValueType();
1746  SDLoc DL(Op);
1747  SDValue New;
1748 
1749  // If the new constant immediate is all-zeros or all-ones, let the target
1750  // independent DAG combine optimize this node.
1751  if (NewImm == 0 || NewImm == OrigMask) {
1752  New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
1753  TLO.DAG.getConstant(NewImm, DL, VT));
1754  // Otherwise, create a machine node so that target independent DAG combine
1755  // doesn't undo this optimization.
1756  } else {
1757  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
1758  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
1759  New = SDValue(
1760  TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
1761  }
1762 
1763  return TLO.CombineTo(Op, New);
1764 }
1765 
1766 bool AArch64TargetLowering::targetShrinkDemandedConstant(
1767  SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
1768  TargetLoweringOpt &TLO) const {
1769  // Delay this optimization to as late as possible.
1770  if (!TLO.LegalOps)
1771  return false;
1772 
1773  if (!EnableOptimizeLogicalImm)
1774  return false;
1775 
1776  EVT VT = Op.getValueType();
1777  if (VT.isVector())
1778  return false;
1779 
1780  unsigned Size = VT.getSizeInBits();
1781  assert((Size == 32 || Size == 64) &&
1782  "i32 or i64 is expected after legalization.");
1783 
1784  // Exit early if we demand all bits.
1785  if (DemandedBits.countPopulation() == Size)
1786  return false;
1787 
1788  unsigned NewOpc;
1789  switch (Op.getOpcode()) {
1790  default:
1791  return false;
1792  case ISD::AND:
1793  NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
1794  break;
1795  case ISD::OR:
1796  NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
1797  break;
1798  case ISD::XOR:
1799  NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1800  break;
1801  }
1802  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1803  if (!C)
1804  return false;
1805  uint64_t Imm = C->getZExtValue();
1806  return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
1807 }
1808 
1809 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
1810 /// Mask are known to be either zero or one and return them in Known.
1811 void AArch64TargetLowering::computeKnownBitsForTargetNode(
1812  const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
1813  const SelectionDAG &DAG, unsigned Depth) const {
1814  switch (Op.getOpcode()) {
1815  default:
1816  break;
1817  case AArch64ISD::DUP: {
1818  SDValue SrcOp = Op.getOperand(0);
1819  Known = DAG.computeKnownBits(SrcOp, Depth + 1);
1820  if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
1821  assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
1822  "Expected DUP implicit truncation");
1823  Known = Known.trunc(Op.getScalarValueSizeInBits());
1824  }
1825  break;
1826  }
1827  case AArch64ISD::CSEL: {
1828  KnownBits Known2;
1829  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1830  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1831  Known = KnownBits::commonBits(Known, Known2);
1832  break;
1833  }
1834  case AArch64ISD::BICi: {
1835  // Compute the bit cleared value.
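 // For example, an immediate of 0xff with a shift of 8 clears bits [15:8],
 // so those bits of the result are known zero regardless of operand 0.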
1836  uint64_t Mask =
1837  ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
1838  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1839  Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
1840  break;
1841  }
1842  case AArch64ISD::VLSHR: {
1843  KnownBits Known2;
1844  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1845  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1846  Known = KnownBits::lshr(Known, Known2);
1847  break;
1848  }
1849  case AArch64ISD::VASHR: {
1850  KnownBits Known2;
1851  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1852  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1853  Known = KnownBits::ashr(Known, Known2);
1854  break;
1855  }
1856  case AArch64ISD::LOADgot:
1857  case AArch64ISD::ADDlow: {
1858  if (!Subtarget->isTargetILP32())
1859  break;
1860  // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
1861  Known.Zero = APInt::getHighBitsSet(64, 32);
1862  break;
1863  }
1864  case AArch64ISD::ASSERT_ZEXT_BOOL: {
1865  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1866  Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
1867  break;
1868  }
1869  case ISD::INTRINSIC_W_CHAIN: {
1870  ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1871  Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1872  switch (IntID) {
1873  default: return;
1874  case Intrinsic::aarch64_ldaxr:
1875  case Intrinsic::aarch64_ldxr: {
1876  unsigned BitWidth = Known.getBitWidth();
1877  EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1878  unsigned MemBits = VT.getScalarSizeInBits();
1879  Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1880  return;
1881  }
1882  }
1883  break;
1884  }
1885  case ISD::INTRINSIC_WO_CHAIN:
1886  case ISD::INTRINSIC_VOID: {
1887  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1888  switch (IntNo) {
1889  default:
1890  break;
1891  case Intrinsic::aarch64_neon_umaxv:
1892  case Intrinsic::aarch64_neon_uminv: {
1893  // Figure out the datatype of the vector operand. The UMINV instruction
1894  // will zero extend the result, so we can mark as known zero all the
1895  // bits larger than the element datatype. 32-bit or larger doesn't need
1896  // this as those are legal types and will be handled by isel directly.
1897  MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1898  unsigned BitWidth = Known.getBitWidth();
1899  if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1900  assert(BitWidth >= 8 && "Unexpected width!");
1901  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1902  Known.Zero |= Mask;
1903  } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1904  assert(BitWidth >= 16 && "Unexpected width!");
1905  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1906  Known.Zero |= Mask;
1907  }
1908  break;
1909  } break;
1910  }
1911  }
1912  }
1913 }
1914 
1915 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
1916  EVT) const {
1917  return MVT::i64;
1918 }
1919 
1920 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1921  EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1922  bool *Fast) const {
1923  if (Subtarget->requiresStrictAlign())
1924  return false;
1925 
1926  if (Fast) {
1927  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1928  *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1929  // See comments in performSTORECombine() for more details about
1930  // these conditions.
1931 
1932  // Code that uses clang vector extensions can mark that it
1933  // wants unaligned accesses to be treated as fast by
1934  // underspecifying alignment to be 1 or 2.
1935  Alignment <= 2 ||
1936 
1937  // Disregard v2i64. Memcpy lowering produces those and splitting
1938  // them regresses performance on micro-benchmarks and olden/bh.
1939  VT == MVT::v2i64;
1940  }
1941  return true;
1942 }
1943 
1944 // Same as above but handling LLTs instead.
1945 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1946  LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1947  bool *Fast) const {
1948  if (Subtarget->requiresStrictAlign())
1949  return false;
1950 
1951  if (Fast) {
1952  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1953  *Fast = !Subtarget->isMisaligned128StoreSlow() ||
1954  Ty.getSizeInBytes() != 16 ||
1955  // See comments in performSTORECombine() for more details about
1956  // these conditions.
1957 
1958  // Code that uses clang vector extensions can mark that it
1959  // wants unaligned accesses to be treated as fast by
1960  // underspecifying alignment to be 1 or 2.
1961  Alignment <= 2 ||
1962 
1963  // Disregard v2i64. Memcpy lowering produces those and splitting
1964  // them regresses performance on micro-benchmarks and olden/bh.
1965  Ty == LLT::fixed_vector(2, 64);
1966  }
1967  return true;
1968 }
1969 
1970 FastISel *
1971 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1972  const TargetLibraryInfo *libInfo) const {
1973  return AArch64::createFastISel(funcInfo, libInfo);
1974 }
1975 
1976 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1977 #define MAKE_CASE(V) \
1978  case V: \
1979  return #V;
1980  switch ((AArch64ISD::NodeType)Opcode) {
1981  case AArch64ISD::FIRST_NUMBER:
1982  break;
2268  }
2269 #undef MAKE_CASE
2270  return nullptr;
2271 }
2272 
2273 MachineBasicBlock *
2274 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2275  MachineBasicBlock *MBB) const {
2276  // We materialise the F128CSEL pseudo-instruction as some control flow and a
2277  // phi node:
2278 
2279  // OrigBB:
2280  // [... previous instrs leading to comparison ...]
2281  // b.ne TrueBB
2282  // b EndBB
2283  // TrueBB:
2284  // ; Fallthrough
2285  // EndBB:
2286  // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2287 
2288  MachineFunction *MF = MBB->getParent();
2289  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2290  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2291  DebugLoc DL = MI.getDebugLoc();
2292  MachineFunction::iterator It = ++MBB->getIterator();
2293 
2294  Register DestReg = MI.getOperand(0).getReg();
2295  Register IfTrueReg = MI.getOperand(1).getReg();
2296  Register IfFalseReg = MI.getOperand(2).getReg();
2297  unsigned CondCode = MI.getOperand(3).getImm();
2298  bool NZCVKilled = MI.getOperand(4).isKill();
2299 
2300  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2301  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2302  MF->insert(It, TrueBB);
2303  MF->insert(It, EndBB);
2304 
2305  // Transfer rest of current basic-block to EndBB
2306  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2307  MBB->end());
2308  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
2309 
2310  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2311  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2312  MBB->addSuccessor(TrueBB);
2313  MBB->addSuccessor(EndBB);
2314 
2315  // TrueBB falls through to the end.
2316  TrueBB->addSuccessor(EndBB);
2317 
2318  if (!NZCVKilled) {
2319  TrueBB->addLiveIn(AArch64::NZCV);
2320  EndBB->addLiveIn(AArch64::NZCV);
2321  }
2322 
2323  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2324  .addReg(IfTrueReg)
2325  .addMBB(TrueBB)
2326  .addReg(IfFalseReg)
2327  .addMBB(MBB);
2328 
2329  MI.eraseFromParent();
2330  return EndBB;
2331 }
2332 
2333 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2334  MachineInstr &MI, MachineBasicBlock *BB) const {
2335  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2336  BB->getParent()->getFunction().getPersonalityFn())) &&
2337  "SEH does not use catchret!");
2338  return BB;
2339 }
2340 
2341 MachineBasicBlock *
2342 AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2343  MachineInstr &MI,
2344  MachineBasicBlock *BB) const {
2345  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2346  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2347 
2348  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2349  MIB.add(MI.getOperand(1)); // slice index register
2350  MIB.add(MI.getOperand(2)); // slice index offset
2351  MIB.add(MI.getOperand(3)); // pg
2352  MIB.add(MI.getOperand(4)); // base
2353  MIB.add(MI.getOperand(5)); // offset
2354 
2355  MI.eraseFromParent(); // The pseudo is gone now.
2356  return BB;
2357 }
2358 
2359 MachineBasicBlock *
2360 AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
2361  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2362  MachineInstrBuilder MIB =
2363  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2364 
2365  MIB.addReg(AArch64::ZA, RegState::Define);
2366  MIB.add(MI.getOperand(0)); // Vector select register
2367  MIB.add(MI.getOperand(1)); // Vector select offset
2368  MIB.add(MI.getOperand(2)); // Base
2369  MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2370 
2371  MI.eraseFromParent(); // The pseudo is gone now.
2372  return BB;
2373 }
2374 
2375 MachineBasicBlock *
2376 AArch64TargetLowering::EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg,
2377  MachineInstr &MI,
2378  MachineBasicBlock *BB) const {
2379  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2380  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2381 
2382  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2383  MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2384  MIB.add(MI.getOperand(1)); // Slice index register
2385  MIB.add(MI.getOperand(2)); // Slice index offset
2386  MIB.add(MI.getOperand(3)); // pg
2387  MIB.add(MI.getOperand(4)); // zn
2388 
2389  MI.eraseFromParent(); // The pseudo is gone now.
2390  return BB;
2391 }
2392 
2393 MachineBasicBlock *
2394 AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
2395  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2396  MachineInstrBuilder MIB =
2397  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2398  MIB.add(MI.getOperand(0)); // Mask
2399 
2400  unsigned Mask = MI.getOperand(0).getImm();
2401  for (unsigned I = 0; I < 8; I++) {
2402  if (Mask & (1 << I))
2403  MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2404  }
2405 
2406  MI.eraseFromParent(); // The pseudo is gone now.
2407  return BB;
2408 }
2409 
2410 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2411  MachineInstr &MI, MachineBasicBlock *BB) const {
2412  switch (MI.getOpcode()) {
2413  default:
2414 #ifndef NDEBUG
2415  MI.dump();
2416 #endif
2417  llvm_unreachable("Unexpected instruction for custom inserter!");
2418 
2419  case AArch64::F128CSEL:
2420  return EmitF128CSEL(MI, BB);
2421 
2422  case TargetOpcode::STATEPOINT:
2423  // STATEPOINT is a pseudo instruction which has no implicit defs/uses
2424  // while bl call instruction (where statepoint will be lowered at the end)
2425  // has implicit def. This def is early-clobber as it will be set at
2426  // the moment of the call and earlier than any use is read.
2427  // Add this implicit dead def here as a workaround.
2428  MI.addOperand(*MI.getMF(),
2429  MachineOperand::CreateReg(
2430  AArch64::LR, /*isDef*/ true,
2431  /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2432  /*isUndef*/ false, /*isEarlyClobber*/ true));
2433  LLVM_FALLTHROUGH;
2434  case TargetOpcode::STACKMAP:
2435  case TargetOpcode::PATCHPOINT:
2436  return emitPatchPoint(MI, BB);
2437 
2438  case AArch64::CATCHRET:
2439  return EmitLoweredCatchRet(MI, BB);
2440  case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2441  return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2442  case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2443  return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2444  case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2445  return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2446  case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2447  return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2448  case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2449  return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2450  case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2451  return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2452  case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2453  return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2454  case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2455  return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2456  case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2457  return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2458  case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
2459  return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
2460  case AArch64::LDR_ZA_PSEUDO:
2461  return EmitFill(MI, BB);
2462  case AArch64::INSERT_MXIPZ_H_PSEUDO_B:
2463  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_B, AArch64::ZAB0, MI,
2464  BB);
2465  case AArch64::INSERT_MXIPZ_H_PSEUDO_H:
2466  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_H, AArch64::ZAH0, MI,
2467  BB);
2468  case AArch64::INSERT_MXIPZ_H_PSEUDO_S:
2469  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_S, AArch64::ZAS0, MI,
2470  BB);
2471  case AArch64::INSERT_MXIPZ_H_PSEUDO_D:
2472  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_D, AArch64::ZAD0, MI,
2473  BB);
2474  case AArch64::INSERT_MXIPZ_H_PSEUDO_Q:
2475  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_Q, AArch64::ZAQ0, MI,
2476  BB);
2477  case AArch64::INSERT_MXIPZ_V_PSEUDO_B:
2478  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_B, AArch64::ZAB0, MI,
2479  BB);
2480  case AArch64::INSERT_MXIPZ_V_PSEUDO_H:
2481  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_H, AArch64::ZAH0, MI,
2482  BB);
2483  case AArch64::INSERT_MXIPZ_V_PSEUDO_S:
2484  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_S, AArch64::ZAS0, MI,
2485  BB);
2486  case AArch64::INSERT_MXIPZ_V_PSEUDO_D:
2487  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_D, AArch64::ZAD0, MI,
2488  BB);
2489  case AArch64::INSERT_MXIPZ_V_PSEUDO_Q:
2490  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_Q, AArch64::ZAQ0, MI,
2491  BB);
2492  case AArch64::ZERO_M_PSEUDO:
2493  return EmitZero(MI, BB);
2494  }
2495 }
2496 
2497 //===----------------------------------------------------------------------===//
2498 // AArch64 Lowering private implementation.
2499 //===----------------------------------------------------------------------===//
2500 
2501 //===----------------------------------------------------------------------===//
2502 // Lowering Code
2503 //===----------------------------------------------------------------------===//
2504 
2505 // Forward declarations of SVE fixed length lowering helpers
2510  SelectionDAG &DAG);
2512  EVT VT);
2513 
2514 /// isZerosVector - Check whether SDNode N is a zero-filled vector.
2515 static bool isZerosVector(const SDNode *N) {
2516  // Look through a bit convert.
2517  while (N->getOpcode() == ISD::BITCAST)
2518  N = N->getOperand(0).getNode();
2519 
2520  if (ISD::isConstantSplatVectorAllZeros(N))
2521  return true;
2522 
2523  if (N->getOpcode() != AArch64ISD::DUP)
2524  return false;
2525 
2526  auto Opnd0 = N->getOperand(0);
2527  auto *CINT = dyn_cast<ConstantSDNode>(Opnd0);
2528  auto *CFP = dyn_cast<ConstantFPSDNode>(Opnd0);
2529  return (CINT && CINT->isZero()) || (CFP && CFP->isZero());
2530 }
2531 
2532 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
2533 /// CC
2535  switch (CC) {
2536  default:
2537  llvm_unreachable("Unknown condition code!");
2538  case ISD::SETNE:
2539  return AArch64CC::NE;
2540  case ISD::SETEQ:
2541  return AArch64CC::EQ;
2542  case ISD::SETGT:
2543  return AArch64CC::GT;
2544  case ISD::SETGE:
2545  return AArch64CC::GE;
2546  case ISD::SETLT:
2547  return AArch64CC::LT;
2548  case ISD::SETLE:
2549  return AArch64CC::LE;
2550  case ISD::SETUGT:
2551  return AArch64CC::HI;
2552  case ISD::SETUGE:
2553  return AArch64CC::HS;
2554  case ISD::SETULT:
2555  return AArch64CC::LO;
2556  case ISD::SETULE:
2557  return AArch64CC::LS;
2558  }
2559 }
2560 
2561 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
2562 static void changeFPCCToAArch64CC(ISD::CondCode CC,
2563  AArch64CC::CondCode &CondCode,
2564  AArch64CC::CondCode &CondCode2) {
2565  CondCode2 = AArch64CC::AL;
2566  switch (CC) {
2567  default:
2568  llvm_unreachable("Unknown FP condition!");
2569  case ISD::SETEQ:
2570  case ISD::SETOEQ:
2571  CondCode = AArch64CC::EQ;
2572  break;
2573  case ISD::SETGT:
2574  case ISD::SETOGT:
2575  CondCode = AArch64CC::GT;
2576  break;
2577  case ISD::SETGE:
2578  case ISD::SETOGE:
2579  CondCode = AArch64CC::GE;
2580  break;
2581  case ISD::SETOLT:
2582  CondCode = AArch64CC::MI;
2583  break;
2584  case ISD::SETOLE:
2585  CondCode = AArch64CC::LS;
2586  break;
2587  case ISD::SETONE:
2588  CondCode = AArch64CC::MI;
2589  CondCode2 = AArch64CC::GT;
2590  break;
2591  case ISD::SETO:
2592  CondCode = AArch64CC::VC;
2593  break;
2594  case ISD::SETUO:
2595  CondCode = AArch64CC::VS;
2596  break;
2597  case ISD::SETUEQ:
2598  CondCode = AArch64CC::EQ;
2599  CondCode2 = AArch64CC::VS;
2600  break;
2601  case ISD::SETUGT:
2602  CondCode = AArch64CC::HI;
2603  break;
2604  case ISD::SETUGE:
2605  CondCode = AArch64CC::PL;
2606  break;
2607  case ISD::SETLT:
2608  case ISD::SETULT:
2609  CondCode = AArch64CC::LT;
2610  break;
2611  case ISD::SETLE:
2612  case ISD::SETULE:
2613  CondCode = AArch64CC::LE;
2614  break;
2615  case ISD::SETNE:
2616  case ISD::SETUNE:
2617  CondCode = AArch64CC::NE;
2618  break;
2619  }
2620 }
2621 
2622 /// Convert a DAG fp condition code to an AArch64 CC.
2623 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
2624 /// should be AND'ed instead of OR'ed.
2625 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
2626  AArch64CC::CondCode &CondCode,
2627  AArch64CC::CondCode &CondCode2) {
2628  CondCode2 = AArch64CC::AL;
2629  switch (CC) {
2630  default:
2631  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2632  assert(CondCode2 == AArch64CC::AL);
2633  break;
2634  case ISD::SETONE:
2635  // (a one b)
2636  // == ((a olt b) || (a ogt b))
2637  // == ((a ord b) && (a une b))
2638  CondCode = AArch64CC::VC;
2639  CondCode2 = AArch64CC::NE;
2640  break;
2641  case ISD::SETUEQ:
2642  // (a ueq b)
2643  // == ((a uno b) || (a oeq b))
2644  // == ((a ule b) && (a uge b))
2645  CondCode = AArch64CC::PL;
2646  CondCode2 = AArch64CC::LE;
2647  break;
2648  }
2649 }
2650 
2651 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
2652 /// CC usable with the vector instructions. Fewer operations are available
2653 /// without a real NZCV register, so we have to use less efficient combinations
2654 /// to get the same effect.
2655 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
2656  AArch64CC::CondCode &CondCode,
2657  AArch64CC::CondCode &CondCode2,
2658  bool &Invert) {
2659  Invert = false;
2660  switch (CC) {
2661  default:
2662  // Mostly the scalar mappings work fine.
2663  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2664  break;
2665  case ISD::SETUO:
2666  Invert = true;
2667  LLVM_FALLTHROUGH;
2668  case ISD::SETO:
2669  CondCode = AArch64CC::MI;
2670  CondCode2 = AArch64CC::GE;
2671  break;
2672  case ISD::SETUEQ:
2673  case ISD::SETULT:
2674  case ISD::SETULE:
2675  case ISD::SETUGT:
2676  case ISD::SETUGE:
2677  // All of the compare-mask comparisons are ordered, but we can switch
2678  // between the two by a double inversion. E.g. ULE == !OGT.
2679  Invert = true;
2680  changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
2681  CondCode, CondCode2);
2682  break;
2683  }
2684 }
2685 
2686 static bool isLegalArithImmed(uint64_t C) {
2687  // Matches AArch64DAGToDAGISel::SelectArithImmed().
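 // For example, 0xfff and 0xfff000 are legal arithmetic immediates (a 12-bit
 // value, optionally shifted left by 12), whereas 0x1001 or 0x1000000000 must
 // be materialized into a register first.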
2688  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
2689  LLVM_DEBUG(dbgs() << "Is imm " << C
2690  << " legal: " << (IsLegal ? "yes\n" : "no\n"));
2691  return IsLegal;
2692 }
2693 
2694 // Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
2695 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
2696 // can be set differently by this operation. It comes down to whether
2697 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
2698 // everything is fine. If not then the optimization is wrong. Thus general
2699 // comparisons are only valid if op2 != 0.
2700 //
2701 // So, finally, the only LLVM-native comparisons that don't mention C and V
2702 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
2703 // the absence of information about op2.
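// A concrete case where the flags differ: with op2 == INT64_MIN, (sub 0, op2)
// wraps back to INT64_MIN, so SUBS op1, (sub 0, op2) and ADDS op1, op2 produce
// the same result bits (N and Z agree) but can set C and V differently, which
// is why only the Z-based SETEQ/SETNE are accepted here.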
2704 static bool isCMN(SDValue Op, ISD::CondCode CC) {
2705  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
2706  (CC == ISD::SETEQ || CC == ISD::SETNE);
2707 }
2708 
2709 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
2710  SelectionDAG &DAG, SDValue Chain,
2711  bool IsSignaling) {
2712  EVT VT = LHS.getValueType();
2713  assert(VT != MVT::f128);
2714 
2715  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
2716 
2717  if (VT == MVT::f16 && !FullFP16) {
2718  LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
2719  {Chain, LHS});
2720  RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
2721  {LHS.getValue(1), RHS});
2722  Chain = RHS.getValue(1);
2723  VT = MVT::f32;
2724  }
2725  unsigned Opcode =
2726  IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
2727  return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
2728 }
2729 
2730 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2731  const SDLoc &dl, SelectionDAG &DAG) {
2732  EVT VT = LHS.getValueType();
2733  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
2734 
2735  if (VT.isFloatingPoint()) {
2736  assert(VT != MVT::f128);
2737  if (VT == MVT::f16 && !FullFP16) {
2738  LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
2739  RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
2740  VT = MVT::f32;
2741  }
2742  return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
2743  }
2744 
2745  // The CMP instruction is just an alias for SUBS, and representing it as
2746  // SUBS means that it's possible to get CSE with subtract operations.
2747  // A later phase can perform the optimization of setting the destination
2748  // register to WZR/XZR if it ends up being unused.
2749  unsigned Opcode = AArch64ISD::SUBS;
2750 
2751  if (isCMN(RHS, CC)) {
2752  // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
2753  Opcode = AArch64ISD::ADDS;
2754  RHS = RHS.getOperand(1);
2755  } else if (isCMN(LHS, CC)) {
2756  // As we are looking for EQ/NE compares, the operands can be commuted ; can
2757  // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
2758  Opcode = AArch64ISD::ADDS;
2759  LHS = LHS.getOperand(1);
2760  } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
2761  if (LHS.getOpcode() == ISD::AND) {
2762  // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
2763  // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
2764  // of the signed comparisons.
2765  const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
2766  DAG.getVTList(VT, MVT_CC),
2767  LHS.getOperand(0),
2768  LHS.getOperand(1));
2769  // Replace all users of (and X, Y) with newly generated (ands X, Y)
2770  DAG.ReplaceAllUsesWith(LHS, ANDSNode);
2771  return ANDSNode.getValue(1);
2772  } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
2773  // Use result of ANDS
2774  return LHS.getValue(1);
2775  }
2776  }
2777 
2778  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
2779  .getValue(1);
2780 }
2781 
2782 /// \defgroup AArch64CCMP CMP;CCMP matching
2783 ///
2784 /// These functions deal with the formation of CMP;CCMP;... sequences.
2785 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
2786 /// a comparison. They set the NZCV flags to a predefined value if their
2787 /// predicate is false. This allows expressing arbitrary conjunctions, for
2788 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
2789 /// expressed as:
2790 /// cmp A
2791 /// ccmp B, inv(CB), CA
2792 /// check for CB flags
2793 ///
2794 /// This naturally lets us implement chains of AND operations with SETCC
2795 /// operands. And we can even implement some other situations by transforming
2796 /// them:
2797 /// - We can implement (NEG SETCC) i.e. negating a single comparison by
2798 /// negating the flags used in a CCMP/FCCMP operations.
2799 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
2800 /// by negating the flags we test for afterwards. i.e.
2801 /// NEG (CMP CCMP CCCMP ...) can be implemented.
2802 /// - Note that we can only ever negate all previously processed results.
2803 /// What we can not implement by flipping the flags to test is a negation
2804 /// of two sub-trees (because the negation affects all sub-trees emitted so
2805 /// far, so the 2nd sub-tree we emit would also affect the first).
2806 /// With those tools we can implement some OR operations:
2807 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
2808 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
2809 /// - After transforming OR to NEG/AND combinations we may be able to use NEG
2810 /// elimination rules from earlier to implement the whole thing as a
2811 /// CCMP/FCCMP chain.
2812 ///
2813 /// As complete example:
2814 /// or (or (setCA (cmp A)) (setCB (cmp B)))
2815 /// (and (setCC (cmp C)) (setCD (cmp D)))"
2816 /// can be reassociated to:
2817 /// or (and (setCC (cmp C)) setCD (cmp D))
2818 /// (or (setCA (cmp A)) (setCB (cmp B)))
2819 /// can be transformed to:
2820 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
2821 /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
2822 /// which can be implemented as:
2823 /// cmp C
2824 /// ccmp D, inv(CD), CC
2825 /// ccmp A, CA, inv(CD)
2826 /// ccmp B, CB, inv(CA)
2827 /// check for CB flags
2828 ///
2829 /// A counterexample is "or (and A B) (and C D)" which translates to
2830 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
2831 /// can only implement 1 of the inner (not) operations, but not both!
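///
/// As a schematic illustration (not generated output), a conjunction such as
/// "a == 0 && b > 5" on w0/w1 can be emitted as:
///   cmp  w0, #0
///   ccmp w1, #5, #4, eq   // if a == 0, compare b with 5; else force Z (nzcv=0100)
///   b.gt <taken>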
2832 /// @{
2833 
2834 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
2835 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
2836  ISD::CondCode CC, SDValue CCOp,
2837  AArch64CC::CondCode Predicate,
2838  AArch64CC::CondCode OutCC,
2839  const SDLoc &DL, SelectionDAG &DAG) {
2840  unsigned Opcode = 0;
2841  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
2842 
2843  if (LHS.getValueType().isFloatingPoint()) {
2844  assert(LHS.getValueType() != MVT::f128);
2845  if (LHS.getValueType() == MVT::f16 && !FullFP16) {
2846  LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
2847  RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
2848  }
2849  Opcode = AArch64ISD::FCCMP;
2850  } else if (RHS.getOpcode() == ISD::SUB) {
2851  SDValue SubOp0 = RHS.getOperand(0);
2852  if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
2853  // See emitComparison() on why we can only do this for SETEQ and SETNE.
2854  Opcode = AArch64ISD::CCMN;
2855  RHS = RHS.getOperand(1);
2856  }
2857  }
2858  if (Opcode == 0)
2859  Opcode = AArch64ISD::CCMP;
2860 
2861  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
2862  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
2863  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
2864  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
2865  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
2866 }
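// Illustrative note (not from the original source): the node created above maps
// to an instruction of the form "ccmp <LHS>, <RHS>, #<NZCV>, <Predicate>". The
// NZCV immediate is chosen to satisfy the inverse of OutCC, so that when the
// predicate does not hold, the flags make the subsequent OutCC test fail.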
2867 
2868 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
2869 /// expressed as a conjunction. See \ref AArch64CCMP.
2870 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
2871 /// changing the conditions on the SETCC tests.
2872 /// (this means we can call emitConjunctionRec() with
2873 /// Negate==true on this sub-tree)
2874 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
2875 /// cannot do the negation naturally. We are required to
2876 /// emit the subtree first in this case.
2877 /// \param WillNegate Is true if we are called when the result of this
2878 /// subexpression must be negated. This happens when the
2879 /// outer expression is an OR. We can use this fact to know
2880 /// that we have a double negation (or (or ...) ...) that
2881 /// can be implemented for free.
2882 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
2883  bool &MustBeFirst, bool WillNegate,
2884  unsigned Depth = 0) {
2885  if (!Val.hasOneUse())
2886  return false;
2887  unsigned Opcode = Val->getOpcode();
2888  if (Opcode == ISD::SETCC) {
2889  if (Val->getOperand(0).getValueType() == MVT::f128)
2890  return false;
2891  CanNegate = true;
2892  MustBeFirst = false;
2893  return true;
2894  }
2895  // Protect against exponential runtime and stack overflow.
2896  if (Depth > 6)
2897  return false;
2898  if (Opcode == ISD::AND || Opcode == ISD::OR) {
2899  bool IsOR = Opcode == ISD::OR;
2900  SDValue O0 = Val->getOperand(0);
2901  SDValue O1 = Val->getOperand(1);
2902  bool CanNegateL;
2903  bool MustBeFirstL;
2904  if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
2905  return false;
2906  bool CanNegateR;
2907  bool MustBeFirstR;
2908  if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
2909  return false;
2910 
2911  if (MustBeFirstL && MustBeFirstR)
2912  return false;
2913 
2914  if (IsOR) {
2915  // For an OR expression we need to be able to naturally negate at least
2916  // one side or we cannot do the transformation at all.
2917  if (!CanNegateL && !CanNegateR)
2918  return false;
2919  // If the result of the OR will be negated and we can naturally negate
2920  // the leaves, then this sub-tree as a whole negates naturally.
2921  CanNegate = WillNegate && CanNegateL && CanNegateR;
2922  // If we cannot naturally negate the whole sub-tree, then this must be
2923  // emitted first.
2924  MustBeFirst = !CanNegate;
2925  } else {
2926  assert(Opcode == ISD::AND && "Must be OR or AND");
2927  // We cannot naturally negate an AND operation.
2928  CanNegate = false;
2929  MustBeFirst = MustBeFirstL || MustBeFirstR;
2930  }
2931  return true;
2932  }
2933  return false;
2934 }
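// Illustrative examples (not from the original source), assuming all leaves are
// single-use integer SETCCs:
//   (and (setcc A) (setcc B))            -> true, CanNegate = false
//   (or  (setcc A) (setcc B))            -> true, and negatable when WillNegate
//   (or  (and (setcc A) (setcc B))
//        (and (setcc C) (setcc D)))      -> false: neither AND side can be
//                                           negated naturally (see the
//                                           counterexample above).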
2935 
2936 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
2937 /// of CCMP/FCCMP ops. See @ref AArch64CCMP.
2938 /// Tries to transform the given i1-producing node @p Val into a series of
2939 /// compare and conditional compare operations. @returns an NZCV-flags-producing
2940 /// node and sets @p OutCC to the flags that should be tested, or returns
2941 /// SDValue() if the transformation was not possible.
2942 /// \p Negate is true if we want this sub-tree to be negated just by changing
2943 /// SETCC conditions.
2944 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
2945  AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
2946  AArch64CC::CondCode Predicate) {
2947  // We're at a tree leaf, produce a conditional comparison operation.
2948  unsigned Opcode = Val->getOpcode();
2949  if (Opcode == ISD::SETCC) {
2950  SDValue LHS = Val->getOperand(0);
2951  SDValue RHS = Val->getOperand(1);
2952  ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
2953  bool isInteger = LHS.getValueType().isInteger();
2954  if (Negate)
2955  CC = getSetCCInverse(CC, LHS.getValueType());
2956  SDLoc DL(Val);
2957  // Determine OutCC and handle FP special case.
2958  if (isInteger) {
2959  OutCC = changeIntCCToAArch64CC(CC);
2960  } else {
2961  assert(LHS.getValueType().isFloatingPoint());
2962  AArch64CC::CondCode ExtraCC;
2963  changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
2964  // Some floating point conditions can't be tested with a single condition
2965  // code. Construct an additional comparison in this case.
2966  if (ExtraCC != AArch64CC::AL) {
2967  SDValue ExtraCmp;
2968  if (!CCOp.getNode())
2969  ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
2970  else
2971  ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
2972  ExtraCC, DL, DAG);
2973  CCOp = ExtraCmp;
2974  Predicate = ExtraCC;
2975  }
2976  }
2977 
2978  // Produce a normal comparison if we are first in the chain
2979  if (!CCOp)
2980  return emitComparison(LHS, RHS, CC, DL, DAG);
2981  // Otherwise produce a ccmp.
2982  return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
2983  DAG);
2984  }
2985  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
2986 
2987  bool IsOR = Opcode == ISD::OR;
2988 
2989  SDValue LHS = Val->getOperand(0);
2990  bool CanNegateL;
2991  bool MustBeFirstL;
2992  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
2993  assert(ValidL && "Valid conjunction/disjunction tree");
2994  (void)ValidL;
2995 
2996  SDValue RHS = Val->getOperand(1);
2997  bool CanNegateR;
2998  bool MustBeFirstR;
2999  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3000  assert(ValidR && "Valid conjunction/disjunction tree");
3001  (void)ValidR;
3002 
3003  // Swap sub-tree that must come first to the right side.
3004  if (MustBeFirstL) {
3005  assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3006  std::swap(LHS, RHS);
3007  std::swap(CanNegateL, CanNegateR);
3008  std::swap(MustBeFirstL, MustBeFirstR);
3009  }
3010 
3011  bool NegateR;
3012  bool NegateAfterR;
3013  bool NegateL;
3014  bool NegateAfterAll;
3015  if (Opcode == ISD::OR) {
3016  // Swap the sub-tree that we can negate naturally to the left.
3017  if (!CanNegateL) {
3018  assert(CanNegateR && "at least one side must be negatable");
3019  assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3020  assert(!Negate);
3021  std::swap(LHS, RHS);
3022  NegateR = false;
3023  NegateAfterR = true;
3024  } else {
3025  // Negate the left sub-tree if possible, otherwise negate the result.
3026  NegateR = CanNegateR;
3027  NegateAfterR = !CanNegateR;
3028  }
3029  NegateL = true;
3030  NegateAfterAll = !Negate;
3031  } else {
3032  assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3033  assert(!Negate && "Valid conjunction/disjunction tree");
3034 
3035  NegateL = false;
3036  NegateR = false;
3037  NegateAfterR = false;
3038  NegateAfterAll = false;
3039  }
3040 
3041  // Emit sub-trees.
3042  AArch64CC::CondCode RHSCC;
3043  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3044  if (NegateAfterR)
3045  RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3046  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3047  if (NegateAfterAll)
3048  OutCC = AArch64CC::getInvertedCondCode(OutCC);
3049  return CmpL;
3050 }
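// Illustrative note (not from the original source): the recursion above emits
// the right sub-tree first (it becomes the initial CMP when no CCOp has been
// chained in yet) and then chains the left sub-tree on top of it via CCMP/FCCMP,
// so for (and (setcc A) (setcc B)) the emitted sequence is roughly
// "cmp B; ccmp A, ...; check A's condition".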
3051 
3052 /// Emit the expression as a conjunction (a series of CCMP/FCCMP ops).
3053 /// In some cases this is even possible with OR operations in the expression.
3054 /// See \ref AArch64CCMP.
3055 /// \see emitConjunctionRec().
3056 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3057  AArch64CC::CondCode &OutCC) {
3058  bool DummyCanNegate;
3059  bool DummyMustBeFirst;
3060  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3061  return SDValue();
3062 
3063  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3064 }
3065 
3066 /// @}
3067 
3068 /// Returns how profitable it is to fold a comparison's operand's shift and/or
3069 /// extension operations.
3070 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3071  auto isSupportedExtend = [&](SDValue V) {
3072  if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3073  return true;
3074 
3075  if (V.getOpcode() == ISD::AND)
3076  if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3077  uint64_t Mask = MaskCst->getZExtValue();
3078  return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3079  }
3080 
3081  return false;
3082  };
3083 
3084  if (!Op.hasOneUse())
3085  return 0;
3086 
3087  if (isSupportedExtend(Op))
3088  return 1;
3089 
3090  unsigned Opc = Op.getOpcode();
3091  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3092  if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3093  uint64_t Shift = ShiftCst->getZExtValue();
3094  if (isSupportedExtend(Op.getOperand(0)))
3095  return (Shift <= 4) ? 2 : 1;
3096  EVT VT = Op.getValueType();
3097  if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3098  return 1;
3099  }
3100 
3101  return 0;
3102 }
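// Illustrative examples (not from the original source) of the profit values
// returned above, assuming single-use operands:
//   (and x, 0xFF)                        -> 1 (supported extend)
//   (shl (and x, 0xFFFF), 3)             -> 2 (extend plus shift <= 4)
//   (shl x, 20) on i32                   -> 1 (plain supported shift)
//   anything with more than one use      -> 0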
3103 
3104 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3105  SDValue &AArch64cc, SelectionDAG &DAG,
3106  const SDLoc &dl) {
3107  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3108  EVT VT = RHS.getValueType();
3109  uint64_t C = RHSC->getZExtValue();
3110  if (!isLegalArithImmed(C)) {
3111  // Constant does not fit, try adjusting it by one?
3112  switch (CC) {
3113  default:
3114  break;
3115  case ISD::SETLT:
3116  case ISD::SETGE:
3117  if ((VT == MVT::i32 && C != 0x80000000 &&
3118  isLegalArithImmed((uint32_t)(C - 1))) ||
3119  (VT == MVT::i64 && C != 0x80000000ULL &&
3120  isLegalArithImmed(C - 1ULL))) {
3121  CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3122  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3123  RHS = DAG.getConstant(C, dl, VT);
3124  }
3125  break;
3126  case ISD::SETULT:
3127  case ISD::SETUGE:
3128  if ((VT == MVT::i32 && C != 0 &&
3129  isLegalArithImmed((uint32_t)(C - 1))) ||
3130  (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3131  CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3132  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3133  RHS = DAG.getConstant(C, dl, VT);
3134  }
3135  break;
3136  case ISD::SETLE:
3137  case ISD::SETGT:
3138  if ((VT == MVT::i32 && C != INT32_MAX &&
3139  isLegalArithImmed((uint32_t)(C + 1))) ||
3140  (VT == MVT::i64 && C != INT64_MAX &&
3141  isLegalArithImmed(C + 1ULL))) {
3142  CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3143  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3144  RHS = DAG.getConstant(C, dl, VT);
3145  }
3146  break;
3147  case ISD::SETULE:
3148  case ISD::SETUGT:
3149  if ((VT == MVT::i32 && C != UINT32_MAX &&
3150  isLegalArithImmed((uint32_t)(C + 1))) ||
3151  (VT == MVT::i64 && C != UINT64_MAX &&
3152  isLegalArithImmed(C + 1ULL))) {
3153  CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3154  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3155  RHS = DAG.getConstant(C, dl, VT);
3156  }
3157  break;
3158  }
3159  }
3160  }
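// Worked example (illustrative, not from the original source): for an i32
// compare "x < 4097" (SETLT), 4097 is not a legal arithmetic immediate, but
// 4096 (= 1 << 12) is, so the block above rewrites it as "x <= 4096" (SETLE),
// which can be encoded directly in the cmp instruction.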
3161 
3162  // Comparisons are canonicalized so that the RHS operand is simpler than the
3163  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3164  // can fold some shift+extend operations on the RHS operand, so swap the
3165  // operands if that can be done.
3166  //
3167  // For example:
3168  // lsl w13, w11, #1
3169  // cmp w13, w12
3170  // can be turned into:
3171  // cmp w12, w11, lsl #1
3172  if (!isa<ConstantSDNode>(RHS) ||
3173  !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
3174  SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3175 
3176  if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
3177  std::swap(LHS, RHS);
3178  CC = ISD::getSetCCSwappedOperands(CC);
3179  }
3180  }
3181 
3182  SDValue Cmp;
3183  AArch64CC::CondCode AArch64CC;
3184  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3185  const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3186 
3187  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3188  // For the i8 operand, the largest immediate is 255, so this can be easily
3189  // encoded in the compare instruction. For the i16 operand, however, the
3190  // largest immediate cannot be encoded in the compare.
3191  // Therefore, use a sign extending load and cmn to avoid materializing the
3192  // -1 constant. For example,
3193  // movz w1, #65535
3194  // ldrh w0, [x0, #0]
3195  // cmp w0, w1
3196  // >
3197  // ldrsh w0, [x0, #0]
3198  // cmn w0, #1
3199  // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3200  // if and only if (sext LHS) == (sext RHS). The checks are in place to
3201  // ensure both the LHS and RHS are truly zero extended and to make sure the
3202  // transformation is profitable.
3203  if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3204  cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3205  cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3206  LHS.getNode()->hasNUsesOfValue(1, 0)) {
3207  int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
3208  if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3209  SDValue SExt =
3210  DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3211  DAG.getValueType(MVT::i16));
3212  Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3213  RHS.getValueType()),
3214  CC, dl, DAG);
3215  AArch64CC = changeIntCCToAArch64CC(CC);
3216  }
3217  }
3218 
3219  if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3220  if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3221  if ((CC == ISD::SETNE) ^ RHSC->isZero())
3222  AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3223  }
3224  }
3225  }
3226 
3227  if (!Cmp) {
3228  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3229  AArch64CC = changeIntCCToAArch64CC(CC);
3230  }
3231  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3232  return Cmp;
3233 }
3234 
3235 static std::pair<SDValue, SDValue>
3236 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3237  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3238  "Unsupported value type");
3239  SDValue Value, Overflow;
3240  SDLoc DL(Op);
3241  SDValue LHS = Op.getOperand(0);
3242  SDValue RHS = Op.getOperand(1);
3243  unsigned Opc = 0;
3244  switch (Op.getOpcode()) {
3245  default:
3246  llvm_unreachable("Unknown overflow instruction!");
3247  case ISD::SADDO:
3248  Opc = AArch64ISD::ADDS;
3249  CC = AArch64CC::VS;
3250  break;
3251  case ISD::UADDO:
3252  Opc = AArch64ISD::ADDS;
3253  CC = AArch64CC::HS;
3254  break;
3255  case ISD::SSUBO:
3256  Opc = AArch64ISD::SUBS;
3257  CC = AArch64CC::VS;
3258  break;
3259  case ISD::USUBO:
3260  Opc = AArch64ISD::SUBS;
3261  CC = AArch64CC::LO;
3262  break;
3263  // Multiply needs a little extra work.
3264  case ISD::SMULO:
3265  case ISD::UMULO: {
3266  CC = AArch64CC::NE;
3267  bool IsSigned = Op.getOpcode() == ISD::SMULO;
3268  if (Op.getValueType() == MVT::i32) {
3269  // Extend to 64-bits, then perform a 64-bit multiply.
3270  unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3271  LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3272  RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3273  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3274  Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3275 
3276  // Check that the result fits into a 32-bit integer.
3277  SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3278  if (IsSigned) {
3279  // cmp xreg, wreg, sxtw
3280  SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3281  Overflow =
3282  DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3283  } else {
3284  // tst xreg, #0xffffffff00000000
3285  SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3286  Overflow =
3287  DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3288  }
3289  break;
3290  }
3291  assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3292  // For the 64-bit multiply.
3293  Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3294  if (IsSigned) {
3295  SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3296  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3297  DAG.getConstant(63, DL, MVT::i64));
3298  // It is important that LowerBits is last, otherwise the arithmetic
3299  // shift will not be folded into the compare (SUBS).
3300  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3301  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3302  .getValue(1);
3303  } else {
3304  SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3305  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3306  Overflow =
3307  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3308  DAG.getConstant(0, DL, MVT::i64),
3309  UpperBits).getValue(1);
3310  }
3311  break;
3312  }
3313  } // switch (...)
3314 
3315  if (Opc) {
3316  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3317 
3318  // Emit the AArch64 operation with overflow check.
3319  Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3320  Overflow = Value.getValue(1);
3321  }
3322  return std::make_pair(Value, Overflow);
3323 }
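// Illustrative sketch (not from the original source; registers are
// hypothetical): for (uaddo a, b) on i32, the pair returned above corresponds
// to something like
//   adds w0, w0, w1      // Value = a + b, flags set
//   cset w1, hs          // caller materializes Overflow from AArch64CC::HS
// where the cset is produced by the caller from the returned Overflow value.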
3324 
3325 SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3326  if (useSVEForFixedLengthVectorVT(Op.getValueType()))
3327  return LowerToScalableOp(Op, DAG);
3328 
3329  SDValue Sel = Op.getOperand(0);
3330  SDValue Other = Op.getOperand(1);
3331  SDLoc dl(Sel);
3332 
3333  // If the operand is an overflow checking operation, invert the condition
3334  // code and kill the Not operation. I.e., transform:
3335  // (xor (overflow_op_bool, 1))
3336  // -->
3337  // (csel 1, 0, invert(cc), overflow_op_bool)
3338  // ... which later gets transformed to just a cset instruction with an
3339  // inverted condition code, rather than a cset + eor sequence.
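// Illustrative sketch (not from the original source; registers are
// hypothetical): for (xor (uaddo.overflow a, b), 1) this path produces a CSEL
// on the inverted condition, which later becomes e.g.:
//   adds w8, w0, w1
//   cset w0, lo          // inverted HS
// instead of a cset followed by an eor with 1.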
3340  if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
3341  // Only lower legal XALUO ops.
3342  if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3343  return SDValue();
3344 
3345  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3346  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3347  AArch64CC::CondCode CC;
3348  SDValue Value, Overflow;
3349  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3350  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3351  return DAG.