1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
23#include "llvm/ADT/APFloat.h"
24#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/ArrayRef.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/Statistic.h"
31#include "llvm/ADT/StringRef.h"
32#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
82#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in the future when both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// All of the XOR, OR and CMP operations use ALU ports, and the data dependency
143// becomes the bottleneck after this transform on high-end CPUs. This maximum
144// leaf-node limit therefore ensures that converting to cmp+ccmp remains profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148// By turning this on, we will not fall back to DAG ISel when encountering
149// scalable vector types for any instruction, even if SVE is not yet supported
150// for some instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157/// Value type used for condition codes.
158static const MVT MVT_CC = MVT::i32;
159
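// Argument registers used by the AAPCS64-based calling conventions: the first
// eight integer arguments are passed in X0-X7 and the first eight FP/SIMD
// arguments in Q0-Q7.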
160static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
161 AArch64::X3, AArch64::X4, AArch64::X5,
162 AArch64::X6, AArch64::X7};
163static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
164 AArch64::Q3, AArch64::Q4, AArch64::Q5,
165 AArch64::Q6, AArch64::Q7};
166
168
170
171static inline EVT getPackedSVEVectorVT(EVT VT) {
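  // For example, getPackedSVEVectorVT(MVT::f16) returns MVT::nxv8f16: the
  // packed form replicates the element type until it fills a 128-bit SVE granule.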
172 switch (VT.getSimpleVT().SimpleTy) {
173 default:
174 llvm_unreachable("unexpected element type for vector");
175 case MVT::i8:
176 return MVT::nxv16i8;
177 case MVT::i16:
178 return MVT::nxv8i16;
179 case MVT::i32:
180 return MVT::nxv4i32;
181 case MVT::i64:
182 return MVT::nxv2i64;
183 case MVT::f16:
184 return MVT::nxv8f16;
185 case MVT::f32:
186 return MVT::nxv4f32;
187 case MVT::f64:
188 return MVT::nxv2f64;
189 case MVT::bf16:
190 return MVT::nxv8bf16;
191 }
192}
193
194// NOTE: Currently there's only a need to return integer vector types. If this
195// changes then just add an extra "type" parameter.
197 switch (EC.getKnownMinValue()) {
198 default:
199 llvm_unreachable("unexpected element count for vector");
200 case 16:
201 return MVT::nxv16i8;
202 case 8:
203 return MVT::nxv8i16;
204 case 4:
205 return MVT::nxv4i32;
206 case 2:
207 return MVT::nxv2i64;
208 }
209}
210
212 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
213 "Expected scalable predicate vector type!");
214 switch (VT.getVectorMinNumElements()) {
215 default:
216 llvm_unreachable("unexpected element count for vector");
217 case 2:
218 return MVT::nxv2i64;
219 case 4:
220 return MVT::nxv4i32;
221 case 8:
222 return MVT::nxv8i16;
223 case 16:
224 return MVT::nxv16i8;
225 }
226}
227
228/// Returns true if VT's elements occupy the lowest bit positions of its
229/// associated register class without any intervening space.
230///
231/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
232/// same register class, but only nxv8f16 can be treated as a packed vector.
233static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
235 "Expected legal vector type!");
236 return VT.isFixedLengthVector() ||
238}
239
240// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
241// predicate and end with a passthru value matching the result type.
242static bool isMergePassthruOpcode(unsigned Opc) {
243 switch (Opc) {
244 default:
245 return false;
276 return true;
277 }
278}
279
280// Returns true if inactive lanes are known to be zeroed by construction.
282 switch (Op.getOpcode()) {
283 default:
284 return false;
285 // We guarantee i1 splat_vectors to zero the other lanes
289 return true;
291 switch (Op.getConstantOperandVal(0)) {
292 default:
293 return false;
294 case Intrinsic::aarch64_sve_ptrue:
295 case Intrinsic::aarch64_sve_pnext:
296 case Intrinsic::aarch64_sve_cmpeq:
297 case Intrinsic::aarch64_sve_cmpne:
298 case Intrinsic::aarch64_sve_cmpge:
299 case Intrinsic::aarch64_sve_cmpgt:
300 case Intrinsic::aarch64_sve_cmphs:
301 case Intrinsic::aarch64_sve_cmphi:
302 case Intrinsic::aarch64_sve_cmpeq_wide:
303 case Intrinsic::aarch64_sve_cmpne_wide:
304 case Intrinsic::aarch64_sve_cmpge_wide:
305 case Intrinsic::aarch64_sve_cmpgt_wide:
306 case Intrinsic::aarch64_sve_cmplt_wide:
307 case Intrinsic::aarch64_sve_cmple_wide:
308 case Intrinsic::aarch64_sve_cmphs_wide:
309 case Intrinsic::aarch64_sve_cmphi_wide:
310 case Intrinsic::aarch64_sve_cmplo_wide:
311 case Intrinsic::aarch64_sve_cmpls_wide:
312 case Intrinsic::aarch64_sve_fcmpeq:
313 case Intrinsic::aarch64_sve_fcmpne:
314 case Intrinsic::aarch64_sve_fcmpge:
315 case Intrinsic::aarch64_sve_fcmpgt:
316 case Intrinsic::aarch64_sve_fcmpuo:
317 case Intrinsic::aarch64_sve_facgt:
318 case Intrinsic::aarch64_sve_facge:
319 case Intrinsic::aarch64_sve_whilege:
320 case Intrinsic::aarch64_sve_whilegt:
321 case Intrinsic::aarch64_sve_whilehi:
322 case Intrinsic::aarch64_sve_whilehs:
323 case Intrinsic::aarch64_sve_whilele:
324 case Intrinsic::aarch64_sve_whilelo:
325 case Intrinsic::aarch64_sve_whilels:
326 case Intrinsic::aarch64_sve_whilelt:
327 case Intrinsic::aarch64_sve_match:
328 case Intrinsic::aarch64_sve_nmatch:
329 case Intrinsic::aarch64_sve_whilege_x2:
330 case Intrinsic::aarch64_sve_whilegt_x2:
331 case Intrinsic::aarch64_sve_whilehi_x2:
332 case Intrinsic::aarch64_sve_whilehs_x2:
333 case Intrinsic::aarch64_sve_whilele_x2:
334 case Intrinsic::aarch64_sve_whilelo_x2:
335 case Intrinsic::aarch64_sve_whilels_x2:
336 case Intrinsic::aarch64_sve_whilelt_x2:
337 return true;
338 }
339 }
340}
341
342static std::tuple<SDValue, SDValue>
344 SDLoc DL(Disc);
345 SDValue AddrDisc;
346 SDValue ConstDisc;
347
348 // If this is a blend, remember the constant and address discriminators.
349 // Otherwise, it's either a constant discriminator, or a non-blended
350 // address discriminator.
351 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
352 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
353 AddrDisc = Disc->getOperand(1);
354 ConstDisc = Disc->getOperand(2);
355 } else {
356 ConstDisc = Disc;
357 }
358
359 // If the constant discriminator (either the blend RHS, or the entire
360 // discriminator value) isn't a 16-bit constant, bail out, and let the
361 // discriminator be computed separately.
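  // For example: a ptrauth.blend(AddrDisc, 1234) splits into (1234, AddrDisc),
  // a bare 16-bit constant C splits into (C, NoRegister/XZR), and anything else
  // falls back to (0, Disc) below.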
362 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
363 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
364 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
365
366 // If there's no address discriminator, use NoRegister, which we'll later
367 // replace with XZR, or directly use a Z variant of the inst. when available.
368 if (!AddrDisc)
369 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
370
371 return std::make_tuple(
372 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
373 AddrDisc);
374}
375
377 const AArch64Subtarget &STI)
378 : TargetLowering(TM), Subtarget(&STI) {
379 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
380 // we have to make something up. Arbitrarily, choose ZeroOrOne.
382 // When comparing vectors, each element of the result vector is set to
383 // all-ones or all-zeros.
385
386 // Set up the register classes.
387 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
388 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
389
390 if (Subtarget->hasLS64()) {
391 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
392 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
394 }
395
396 if (Subtarget->hasFPARMv8()) {
397 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
398 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
399 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
400 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
401 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
402 }
403
404 if (Subtarget->hasNEON()) {
405 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
406 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
407
408 addDRType(MVT::v2f32);
409 addDRType(MVT::v8i8);
410 addDRType(MVT::v4i16);
411 addDRType(MVT::v2i32);
412 addDRType(MVT::v1i64);
413 addDRType(MVT::v1f64);
414 addDRType(MVT::v4f16);
415 addDRType(MVT::v4bf16);
416
417 addQRType(MVT::v4f32);
418 addQRType(MVT::v2f64);
419 addQRType(MVT::v16i8);
420 addQRType(MVT::v8i16);
421 addQRType(MVT::v4i32);
422 addQRType(MVT::v2i64);
423 addQRType(MVT::v8f16);
424 addQRType(MVT::v8bf16);
425 }
426
427 if (Subtarget->isSVEorStreamingSVEAvailable()) {
428 // Add legal SVE predicate types
429 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
430 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
431 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
432 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
433 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
434
435 // Add legal SVE data types
436 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
437 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
438 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
439 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
440
441 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
442 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
443 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
444 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
445 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
446 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
447
448 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
449 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
450 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
451
452 if (Subtarget->useSVEForFixedLengthVectors()) {
455 addRegisterClass(VT, &AArch64::ZPRRegClass);
456
459 addRegisterClass(VT, &AArch64::ZPRRegClass);
460 }
461 }
462
463 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
464 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
465 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
466 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
467
468 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
469 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
470 }
471
472 // Compute derived properties from the register classes
474
475 // Provide all sorts of operation actions
515
517
521
525
527
528 // Custom lowering hooks are needed for XOR
529 // to fold it into CSINC/CSINV.
532
533 // Virtually no operation on f128 is legal, but LLVM can't expand them when
534 // there's a valid register class, so we need custom operations in most cases.
559 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
560 // aren't handled.
561
562 // Lowering for many of the conversions is actually specified by the non-f128
563 // type. The LowerXXX function will be trivial when f128 isn't involved.
588 if (Subtarget->hasFPARMv8()) {
591 }
594 if (Subtarget->hasFPARMv8()) {
597 }
600
605
606 // Variable arguments.
611
612 // Variable-sized objects.
615
616 // Lowering Funnel Shifts to EXTR
621
623
624 // Constant pool entries
626
627 // BlockAddress
629
630 // AArch64 lacks both left-rotate and popcount instructions.
636 }
637
638 // AArch64 doesn't have i32 MULH{S|U}.
641
642 // AArch64 doesn't have {U|S}MUL_LOHI.
647
648 if (Subtarget->hasCSSC()) {
652
654
658
661
666
671 } else {
675
678
681 }
682
688 }
695
696 // Custom lower Add/Sub/Mul with overflow.
709
718
727 if (Subtarget->hasFullFP16()) {
730 } else {
733 }
734
735 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
748 setOperationAction(Op, MVT::f16, Promote);
749 setOperationAction(Op, MVT::v4f16, Expand);
750 setOperationAction(Op, MVT::v8f16, Expand);
751 setOperationAction(Op, MVT::bf16, Promote);
752 setOperationAction(Op, MVT::v4bf16, Expand);
753 setOperationAction(Op, MVT::v8bf16, Expand);
754 }
755
756 // For bf16, fpextend is custom lowered to be optionally expanded into shifts.
763
764 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
765 for (auto Op : {
769 ISD::FADD,
770 ISD::FSUB,
771 ISD::FMUL,
772 ISD::FDIV,
773 ISD::FMA,
804 })
805 setOperationAction(Op, ScalarVT, Promote);
806
807 for (auto Op : {ISD::FNEG, ISD::FABS})
808 setOperationAction(Op, ScalarVT, Legal);
809
810 // Round-to-integer operations need custom lowering for fp16, as Promote
811 // doesn't work because the result type is integer.
815 setOperationAction(Op, ScalarVT, Custom);
816
817 // promote v4f16 to v4f32 when that is known to be safe.
818 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
819 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
820 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
821 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
822 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
823 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
824 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
825 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
826 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
827 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
828 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
829 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
830 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
831
841
842 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
864 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
865 };
866
867 if (!Subtarget->hasFullFP16()) {
868 LegalizeNarrowFP(MVT::f16);
869 }
870 LegalizeNarrowFP(MVT::bf16);
873
874 // AArch64 has implementations of a lot of rounding-like FP operations.
875 // clang-format off
876 for (auto Op :
888 for (MVT Ty : {MVT::f32, MVT::f64})
890 if (Subtarget->hasFullFP16())
891 setOperationAction(Op, MVT::f16, Legal);
892 }
893 // clang-format on
894
895 // Basic strict FP operations are legal
898 for (MVT Ty : {MVT::f32, MVT::f64})
900 if (Subtarget->hasFullFP16())
901 setOperationAction(Op, MVT::f16, Legal);
902 }
903
905
911
913 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
916 } else {
919 }
922
923 // Generate outline atomics library calls only if LSE was not specified for
924 // the subtarget.
925 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
951#define LCALLNAMES(A, B, N) \
952 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
953 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
954 setLibcallName(A##N##_REL, #B #N "_rel"); \
955 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
956#define LCALLNAME4(A, B) \
957 LCALLNAMES(A, B, 1) \
958 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
959#define LCALLNAME5(A, B) \
960 LCALLNAMES(A, B, 1) \
961 LCALLNAMES(A, B, 2) \
962 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
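    // Illustrative expansion: LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
    // registers __aarch64_swp1_relax, __aarch64_swp1_acq, ...,
    // __aarch64_swp8_acq_rel for the corresponding RTLIB entries.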
963 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
964 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
965 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
966 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
967 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
968 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
969#undef LCALLNAMES
970#undef LCALLNAME4
971#undef LCALLNAME5
972 }
973
974 if (Subtarget->hasLSE128()) {
975 // Custom lowering because i128 is not legal. Must be replaced by 2x64
976 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
980 }
981
982 // 128-bit loads and stores can be done without expanding
985
986 // Aligned 128-bit loads and stores are single-copy atomic according to the
987 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
988 if (Subtarget->hasLSE2()) {
991 }
992
993 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
994 // custom lowering, as there are no un-paired non-temporal stores and
995 // legalization will break up 256 bit inputs.
997 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
998 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
999 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1000 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1001 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1002 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1003 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
1004
1005 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
1006 // custom lowering, as there are no un-paired non-temporal loads and legalization
1007 // will break up 256 bit inputs.
1008 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1009 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1010 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1011 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1012 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1013 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1014 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1015 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1016
1017 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1019
1020 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1021 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1022 // Issue __sincos_stret if available.
1025 } else {
1028 }
1029
1030 // Make floating-point constants legal for the large code model, so they don't
1031 // become loads from the constant pool.
1032 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1035 }
1036
1037 // AArch64 does not have floating-point extending loads, i1 sign-extending
1038 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
1039 for (MVT VT : MVT::fp_valuetypes()) {
1040 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1041 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1042 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1043 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1044 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1045 }
1046 for (MVT VT : MVT::integer_valuetypes())
1047 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1048
1049 for (MVT WideVT : MVT::fp_valuetypes()) {
1050 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1051 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1052 setTruncStoreAction(WideVT, NarrowVT, Expand);
1053 }
1054 }
1055 }
1056
1057 if (Subtarget->hasFPARMv8()) {
1061 }
1062
1063 // Indexed loads and stores are supported.
1064 for (unsigned im = (unsigned)ISD::PRE_INC;
1066 setIndexedLoadAction(im, MVT::i8, Legal);
1067 setIndexedLoadAction(im, MVT::i16, Legal);
1068 setIndexedLoadAction(im, MVT::i32, Legal);
1069 setIndexedLoadAction(im, MVT::i64, Legal);
1070 setIndexedLoadAction(im, MVT::f64, Legal);
1071 setIndexedLoadAction(im, MVT::f32, Legal);
1072 setIndexedLoadAction(im, MVT::f16, Legal);
1073 setIndexedLoadAction(im, MVT::bf16, Legal);
1074 setIndexedStoreAction(im, MVT::i8, Legal);
1075 setIndexedStoreAction(im, MVT::i16, Legal);
1076 setIndexedStoreAction(im, MVT::i32, Legal);
1077 setIndexedStoreAction(im, MVT::i64, Legal);
1078 setIndexedStoreAction(im, MVT::f64, Legal);
1079 setIndexedStoreAction(im, MVT::f32, Legal);
1080 setIndexedStoreAction(im, MVT::f16, Legal);
1081 setIndexedStoreAction(im, MVT::bf16, Legal);
1082 }
1083
1084 // Trap.
1085 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1088
1089 // We combine OR nodes for bitfield operations.
1091 // Try to create BICs for vector ANDs.
1093
1094 // llvm.init.trampoline and llvm.adjust.trampoline
1097
1098 // Vector add and sub nodes may conceal a high-half opportunity.
1099 // Also, try to fold ADD into CSINC/CSINV..
1102
1105
1106 // Try and combine setcc with csel
1108
1110
1117
1119
1121
1123
1127
1130
1132
1134
1136
1140
1142
1144
1145 // In case of strict alignment, avoid an excessive number of byte wide stores.
1148 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1149
1153 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1154
1157 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1158
1161 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1162
1164
1166
1167 EnableExtLdPromotion = true;
1168
1169 // Set required alignment.
1171 // Set preferred alignments.
1172
1173 // Don't align loops on Windows. The SEH unwind info generation needs to
1174 // know the exact length of functions before the alignments have been
1175 // expanded.
1176 if (!Subtarget->isTargetWindows())
1180
1181 // Only change the limit for entries in a jump table if specified by
1182 // the subtarget, but not at the command line.
1183 unsigned MaxJT = STI.getMaximumJumpTableSize();
1184 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1186
1188
1190
1192 if (Subtarget->hasSME())
1194
1195 if (Subtarget->isNeonAvailable()) {
1196 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1197 // silliness like this:
1198 // clang-format off
1199 for (auto Op :
1220 setOperationAction(Op, MVT::v1f64, Expand);
1221 // clang-format on
1222
1223 for (auto Op :
1228 setOperationAction(Op, MVT::v1i64, Expand);
1229
1230 // AArch64 doesn't have direct vector->f32 conversion instructions for
1231 // elements smaller than i32, so promote the input to i32 first.
1232 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1233 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1234
1235 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1236 // Nor is there a direct i32 -> f16 vector conversion. Set it to Custom, so the
1237 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1240 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1242
1243 if (Subtarget->hasFullFP16()) {
1246
1255 } else {
1256 // when AArch64 doesn't have fullfp16 support, promote the input
1257 // to i32 first.
1258 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1259 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1260 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1261 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1262 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1263 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1264 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1265 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1266 }
1267
1268 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1269 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1276 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1281 }
1282
1283 // Custom handling for some quad-vector types to detect MULL.
1284 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1285 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1286 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1287 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1288 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1289 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1290
1291 // Saturates
1292 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1293 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1298 }
1299
1300 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1301 MVT::v4i32}) {
1308 }
1309
1310 // Vector reductions
1311 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1312 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1313 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1318
1320 }
1321 }
1322 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1323 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1332 }
1337
1339 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1340 // Likewise, narrowing and extending vector loads/stores aren't handled
1341 // directly.
1344
1345 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1348 } else {
1351 }
1354
1357
1358 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1359 setTruncStoreAction(VT, InnerVT, Expand);
1360 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1361 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1362 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1363 }
1364 }
1365
1366 for (auto Op :
1372 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1374 if (Subtarget->hasFullFP16())
1375 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1377 }
1378
1379 // LRINT and LLRINT.
1380 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1381 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1383 if (Subtarget->hasFullFP16())
1384 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1386 }
1387
1388 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1389
1394
1398
1399 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1400 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1401 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1402 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1403 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1404 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1405
1406 // ADDP custom lowering
1407 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1409 // FADDP custom lowering
1410 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1412 } else /* !isNeonAvailable */ {
1414 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1416
1417 if (VT.is128BitVector() || VT.is64BitVector()) {
1421 Subtarget->isLittleEndian() ? Legal : Expand);
1422 }
1423 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1424 setTruncStoreAction(VT, InnerVT, Expand);
1425 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1426 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1427 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1428 }
1429 }
1430 }
1431
1432 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1436 }
1437
1438 if (Subtarget->hasSME()) {
1440 }
1441
1442 // FIXME: Move lowering for more nodes here if those are common between
1443 // SVE and SME.
1444 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1445 for (auto VT :
1446 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1451 }
1452 }
1453
1454 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1455 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1496
1502
1511
1516
1517 if (!Subtarget->isLittleEndian())
1519
1520 if (Subtarget->hasSVE2() ||
1521 (Subtarget->hasSME() && Subtarget->isStreaming()))
1522 // For SLI/SRI.
1524 }
1525
1526 // Illegal unpacked integer vector types.
1527 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1530 }
1531
1532 // Type legalize unpacked bitcasts.
1533 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1535
1536 for (auto VT :
1537 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1538 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1540
1541 for (auto VT :
1542 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1550
1554
1555 // There are no legal MVT::nxv16f## based types.
1556 if (VT != MVT::nxv16i1) {
1559 }
1560 }
1561
1562 // NEON doesn't support masked loads/stores, but SME and SVE do.
1563 for (auto VT :
1564 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1565 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1566 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1569 }
1570
1571 // Firstly, exclude all scalable vector extending loads/truncating stores,
1572 // including both integer and floating-point scalable vectors.
1574 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1575 setTruncStoreAction(VT, InnerVT, Expand);
1576 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1577 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1578 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1579 }
1580 }
1581
1582 // Then, selectively enable those which we directly support.
1583 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1584 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1585 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1586 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1587 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1588 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1589 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1590 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1591 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1592 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1593 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1594 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1595 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1596 }
1597
1598 // SVE supports truncating stores of 64 and 128-bit vectors
1599 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1600 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1601 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1602 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1603 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1604
1605 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1606 MVT::nxv4f32, MVT::nxv2f64}) {
1646
1668
1680 }
1681
1682 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1693
1694 if (Subtarget->hasSVEB16B16()) {
1703 }
1704 }
1705
1706 for (auto Opcode :
1709 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1710 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1711 setOperationAction(Opcode, MVT::nxv8bf16, Expand);
1712 }
1713
1714 if (!Subtarget->hasSVEB16B16()) {
1715 for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM,
1717 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1718 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1719 setOperationAction(Opcode, MVT::nxv8bf16, Expand);
1720 }
1721 }
1722
1725
1726 // NEON doesn't support integer divides, but SVE does
1727 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1728 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1731 }
1732
1733 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1734 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1735 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1736
1737 // NOTE: Currently this has to happen after computeRegisterProperties rather
1738 // than the preferred option of combining it with the addRegisterClass call.
1739 if (Subtarget->useSVEForFixedLengthVectors()) {
1742 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1743 addTypeForFixedLengthSVE(VT);
1744 }
1747 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1748 addTypeForFixedLengthSVE(VT);
1749 }
1750
1751 // 64-bit results can mean a bigger-than-NEON input.
1752 for (auto VT : {MVT::v8i8, MVT::v4i16})
1755
1756 // 128-bit results imply a bigger-than-NEON input.
1757 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1759 for (auto VT : {MVT::v8f16, MVT::v4f32})
1761
1762 // These operations are not supported on NEON but SVE can do them.
1764 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1765 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1766 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1767 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1768 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1769 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1770 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1771 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1772 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1773 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1774 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1775 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1776 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1777 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1778 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1783
1784 // Int operations with no NEON support.
1785 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1786 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1794 }
1795
1796 // Use SVE for vectors with more than 2 elements.
1797 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1799 }
1800
1801 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1802 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1803 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1804 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1805
1807
1808 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1810 }
1811
1812 // Handle operations that are only available in non-streaming SVE mode.
1813 if (Subtarget->isSVEAvailable()) {
1814 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1815 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1816 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1817 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1818 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1819 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1820 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1823 }
1824
1825 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1826 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1827 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1829
1830 // We can lower types that have <vscale x {2|4}> elements to compact.
1831 for (auto VT :
1832 {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
1833 MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
1835
1836 // If we have SVE, we can use SVE logic for legal (or smaller than legal)
1837 // NEON vectors in the lowest bits of the SVE register.
1838 for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
1839 MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
1841
1842 // Histcnt is SVE2 only
1843 if (Subtarget->hasSVE2()) {
1845 Custom);
1847 Custom);
1848 }
1849 }
1850
1851
1852 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1853 // Only required for llvm.aarch64.mops.memset.tag
1855 }
1856
1858
1859 if (Subtarget->hasSVE()) {
1864 }
1865
1866 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1867
1868 IsStrictFPEnabled = true;
1870
1871 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1872 // it, but it's just a wrapper around ldexp.
1873 if (Subtarget->isTargetWindows()) {
1875 if (isOperationExpand(Op, MVT::f32))
1876 setOperationAction(Op, MVT::f32, Promote);
1877 }
1878
1879 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1880 // isn't legal.
1882 if (isOperationExpand(Op, MVT::f16))
1883 setOperationAction(Op, MVT::f16, Promote);
1884
1885 if (Subtarget->isWindowsArm64EC()) {
1886 // FIXME: are there intrinsics we need to exclude from this?
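    // Arm64EC mangles native function symbols with a leading '#', so point the
    // libcalls at those entry points (e.g. "memcpy" becomes "#memcpy").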
1887 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1888 auto code = static_cast<RTLIB::Libcall>(i);
1889 auto libcallName = getLibcallName(code);
1890 if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1891 setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1892 }
1893 }
1894 }
1895}
1896
1897void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1898 assert(VT.isVector() && "VT should be a vector type");
1899
1900 if (VT.isFloatingPoint()) {
1902 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1903 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1904 }
1905
1906 // Mark vector float intrinsics as expand.
1907 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1925 }
1926
1927 // But we do support custom-lowering for FCOPYSIGN.
1928 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1929 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1930 VT == MVT::v8f16) &&
1931 Subtarget->hasFullFP16()))
1933
1946
1950 for (MVT InnerVT : MVT::all_valuetypes())
1951 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1952
1953 // CNT supports only B element sizes, then use UADDLP to widen.
1954 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1956
1962
1963 for (unsigned Opcode :
1966 setOperationAction(Opcode, VT, Custom);
1967
1968 if (!VT.isFloatingPoint())
1970
1971 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1972 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1973 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1974 setOperationAction(Opcode, VT, Legal);
1975
1976 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1977 // NEON types.
1978 if (VT.isFloatingPoint() &&
1979 VT.getVectorElementType() != MVT::bf16 &&
1980 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1981 for (unsigned Opcode :
1987 setOperationAction(Opcode, VT, Legal);
1988
1989 // Strict fp extend and trunc are legal
1990 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1992 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1994
1995 // FIXME: We could potentially make use of the vector comparison instructions
1996 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1997 // complications:
1998 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1999 // so we would need to expand when the condition code doesn't match the
2000 // kind of comparison.
2001 // * Some kinds of comparison require more than one FCMXY instruction so
2002 // would need to be expanded instead.
2003 // * The lowering of the non-strict versions involves target-specific ISD
2004 // nodes so we would likely need to add strict versions of all of them and
2005 // handle them appropriately.
2008
2009 if (Subtarget->isLittleEndian()) {
2010 for (unsigned im = (unsigned)ISD::PRE_INC;
2014 }
2015 }
2016
2017 if (Subtarget->hasD128()) {
2020 }
2021}
2022
2024 EVT OpVT) const {
2025 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2026 if (!Subtarget->hasSVE())
2027 return true;
2028
2029 // We can only support legal predicate result types. We can use the SVE
2030 // whilelo instruction for generating fixed-width predicates too.
2031 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
2032 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
2033 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
2034 return true;
2035
2036 // The whilelo instruction only works with i32 or i64 scalar inputs.
2037 if (OpVT != MVT::i32 && OpVT != MVT::i64)
2038 return true;
2039
2040 return false;
2041}
2042
2044 const IntrinsicInst *I) const {
2045 if (I->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add)
2046 return true;
2047
2048 EVT VT = EVT::getEVT(I->getType());
2049 auto Op1 = I->getOperand(1);
2050 EVT Op1VT = EVT::getEVT(Op1->getType());
2051 if (Op1VT.getVectorElementType() == VT.getVectorElementType() &&
2052 (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount() ||
2053 VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount()))
2054 return false;
2055 return true;
2056}
2057
2059 if (!Subtarget->isSVEorStreamingSVEAvailable())
2060 return true;
2061
2062 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2063 // also support fixed-width predicates.
2064 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2065 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2066 VT != MVT::v4i1 && VT != MVT::v2i1;
2067}
2068
2070 unsigned SearchSize) const {
2071 // MATCH is SVE2 and only available in non-streaming mode.
2072 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2073 return true;
2074 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2075 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2076 return SearchSize != 8;
2077 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2078 return SearchSize != 8 && SearchSize != 16;
2079 return true;
2080}
2081
2082void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2083 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2084
2085 // By default everything must be expanded.
2086 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2088
2089 if (VT.isFloatingPoint()) {
2099 }
2100
2102 VT == MVT::v1f64 ? Expand : Custom;
2103
2104 // Mark integer truncating stores/extending loads as having custom lowering
2105 if (VT.isInteger()) {
2106 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2107 while (InnerVT != VT) {
2108 setTruncStoreAction(VT, InnerVT, Default);
2109 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2110 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2111 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2112 InnerVT = InnerVT.changeVectorElementType(
2113 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2114 }
2115 }
2116
2117 // Mark floating-point truncating stores/extending loads as having custom
2118 // lowering
2119 if (VT.isFloatingPoint()) {
2120 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2121 while (InnerVT != VT) {
2122 setTruncStoreAction(VT, InnerVT, Custom);
2123 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2124 InnerVT = InnerVT.changeVectorElementType(
2126 }
2127 }
2128
2129 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2130 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2131
2132 // Lower fixed length vector operations to scalable equivalents.
2139 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2176 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2177 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2179 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2198 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2224}
2225
2226void AArch64TargetLowering::addDRType(MVT VT) {
2227 addRegisterClass(VT, &AArch64::FPR64RegClass);
2228 if (Subtarget->isNeonAvailable())
2229 addTypeForNEON(VT);
2230}
2231
2232void AArch64TargetLowering::addQRType(MVT VT) {
2233 addRegisterClass(VT, &AArch64::FPR128RegClass);
2234 if (Subtarget->isNeonAvailable())
2235 addTypeForNEON(VT);
2236}
2237
2239 LLVMContext &C, EVT VT) const {
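  // Scalar comparisons produce an i32 result; scalable-vector comparisons
  // produce an i1 vector with the same element count (e.g. nxv4i32 -> nxv4i1).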
2240 if (!VT.isVector())
2241 return MVT::i32;
2242 if (VT.isScalableVector())
2243 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2245}
2246
2247// isIntImmediate - This method tests to see if the node is a constant
2248// operand. If so Imm will receive the value.
2249static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2250 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2251 Imm = C->getZExtValue();
2252 return true;
2253 }
2254 return false;
2255}
2256
2257// isOpcWithIntImmediate - This method tests to see if the node is a specific
2258// opcode and that it has an immediate integer right operand.
2259// If so, Imm will receive the value.
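// For example, isOpcWithIntImmediate(N, ISD::SRL, Imm) matches (srl x, c) and
// stores c in Imm.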
2260static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2261 uint64_t &Imm) {
2262 return N->getOpcode() == Opc &&
2263 isIntImmediate(N->getOperand(1).getNode(), Imm);
2264}
2265
2266static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2267 const APInt &Demanded,
2269 unsigned NewOpc) {
2270 uint64_t OldImm = Imm, NewImm, Enc;
2271 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2272
2273 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2274 // bimm64.
2275 if (Imm == 0 || Imm == Mask ||
2277 return false;
2278
2279 unsigned EltSize = Size;
2280 uint64_t DemandedBits = Demanded.getZExtValue();
2281
2282 // Clear bits that are not demanded.
2283 Imm &= DemandedBits;
2284
2285 while (true) {
2286 // The goal here is to set the non-demanded bits in a way that minimizes
2287 // the number of switching between 0 and 1. In order to achieve this goal,
2288 // we set the non-demanded bits to the value of the preceding demanded bits.
2289 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2290 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2291 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2292 // The final result is 0b11000011.
2293 uint64_t NonDemandedBits = ~DemandedBits;
2294 uint64_t InvertedImm = ~Imm & DemandedBits;
2295 uint64_t RotatedImm =
2296 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2297 NonDemandedBits;
2298 uint64_t Sum = RotatedImm + NonDemandedBits;
2299 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2300 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2301 NewImm = (Imm | Ones) & Mask;
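    // Ones now fills each run of non-demanded bits with copies of the demanded
    // bit just below it: the addition above carries through (and so clears) the
    // runs that follow a demanded zero.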
2302
2303 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2304 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2305 // we halve the element size and continue the search.
2306 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2307 break;
2308
2309 // We cannot shrink the element size any further if it is 2-bits.
2310 if (EltSize == 2)
2311 return false;
2312
2313 EltSize /= 2;
2314 Mask >>= EltSize;
2315 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2316
2317 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2318 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2319 return false;
2320
2321 // Merge the upper and lower halves of Imm and DemandedBits.
2322 Imm |= Hi;
2323 DemandedBits |= DemandedBitsHi;
2324 }
2325
2326 ++NumOptimizedImms;
2327
2328 // Replicate the element across the register width.
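  // For example, with Size == 32 an 8-bit NewImm of 0xC3 is replicated to
  // 0xC3C3C3C3.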
2329 while (EltSize < Size) {
2330 NewImm |= NewImm << EltSize;
2331 EltSize *= 2;
2332 }
2333
2334 (void)OldImm;
2335 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2336 "demanded bits should never be altered");
2337 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2338
2339 // Create the new constant immediate node.
2340 EVT VT = Op.getValueType();
2341 SDLoc DL(Op);
2342 SDValue New;
2343
2344 // If the new constant immediate is all-zeros or all-ones, let the target
2345 // independent DAG combine optimize this node.
2346 if (NewImm == 0 || NewImm == OrigMask) {
2347 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2348 TLO.DAG.getConstant(NewImm, DL, VT));
2349 // Otherwise, create a machine node so that target independent DAG combine
2350 // doesn't undo this optimization.
2351 } else {
2353 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2354 New = SDValue(
2355 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2356 }
2357
2358 return TLO.CombineTo(Op, New);
2359}
2360
2362 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2363 TargetLoweringOpt &TLO) const {
2364 // Delay this optimization to as late as possible.
2365 if (!TLO.LegalOps)
2366 return false;
2367
2369 return false;
2370
2371 EVT VT = Op.getValueType();
2372 if (VT.isVector())
2373 return false;
2374
2375 unsigned Size = VT.getSizeInBits();
2376 assert((Size == 32 || Size == 64) &&
2377 "i32 or i64 is expected after legalization.");
2378
2379 // Exit early if we demand all bits.
2380 if (DemandedBits.popcount() == Size)
2381 return false;
2382
2383 unsigned NewOpc;
2384 switch (Op.getOpcode()) {
2385 default:
2386 return false;
2387 case ISD::AND:
2388 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2389 break;
2390 case ISD::OR:
2391 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2392 break;
2393 case ISD::XOR:
2394 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2395 break;
2396 }
2397 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2398 if (!C)
2399 return false;
2400 uint64_t Imm = C->getZExtValue();
2401 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2402}
2403
2404/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2405/// Mask are known to be either zero or one and return them in Known.
2407 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2408 const SelectionDAG &DAG, unsigned Depth) const {
2409 switch (Op.getOpcode()) {
2410 default:
2411 break;
2412 case AArch64ISD::DUP: {
2413 SDValue SrcOp = Op.getOperand(0);
2414 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2415 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2416 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2417 "Expected DUP implicit truncation");
2418 Known = Known.trunc(Op.getScalarValueSizeInBits());
2419 }
2420 break;
2421 }
2422 case AArch64ISD::CSEL: {
2423 KnownBits Known2;
2424 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2425 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2426 Known = Known.intersectWith(Known2);
2427 break;
2428 }
2429 case AArch64ISD::BICi: {
2430 // Compute the bit cleared value.
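    // e.g. for an immediate of 0xff shifted left by 8, the mask clears bits
    // 15:8, so those bits of the result become known zero.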
2431 APInt Mask =
2432 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2433 .trunc(Known.getBitWidth());
2434 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2435 Known &= KnownBits::makeConstant(Mask);
2436 break;
2437 }
2438 case AArch64ISD::VLSHR: {
2439 KnownBits Known2;
2440 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2441 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2442 Known = KnownBits::lshr(Known, Known2);
2443 break;
2444 }
2445 case AArch64ISD::VASHR: {
2446 KnownBits Known2;
2447 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2448 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2449 Known = KnownBits::ashr(Known, Known2);
2450 break;
2451 }
2452 case AArch64ISD::VSHL: {
2453 KnownBits Known2;
2454 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2455 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2456 Known = KnownBits::shl(Known, Known2);
2457 break;
2458 }
2459 case AArch64ISD::MOVI: {
2461 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2462 break;
2463 }
2465 case AArch64ISD::ADDlow: {
2466 if (!Subtarget->isTargetILP32())
2467 break;
2468 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2469 Known.Zero = APInt::getHighBitsSet(64, 32);
2470 break;
2471 }
2473 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2474 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2475 break;
2476 }
2478 Intrinsic::ID IntID =
2479 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2480 switch (IntID) {
2481 default: return;
2482 case Intrinsic::aarch64_ldaxr:
2483 case Intrinsic::aarch64_ldxr: {
2484 unsigned BitWidth = Known.getBitWidth();
2485 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2486 unsigned MemBits = VT.getScalarSizeInBits();
2487 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2488 return;
2489 }
2490 }
2491 break;
2492 }
2494 case ISD::INTRINSIC_VOID: {
2495 unsigned IntNo = Op.getConstantOperandVal(0);
2496 switch (IntNo) {
2497 default:
2498 break;
2499 case Intrinsic::aarch64_neon_uaddlv: {
2500 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2501 unsigned BitWidth = Known.getBitWidth();
2502 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2503 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2504 assert(BitWidth >= Bound && "Unexpected width!");
2506 Known.Zero |= Mask;
2507 }
2508 break;
2509 }
2510 case Intrinsic::aarch64_neon_umaxv:
2511 case Intrinsic::aarch64_neon_uminv: {
2512 // Figure out the datatype of the vector operand. The UMINV instruction
2513 // will zero extend the result, so we can mark as known zero all the
2514 // bits larger than the element datatype. 32-bit or larger doesn't need
2515 // this as those are legal types and will be handled by isel directly.
2516 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2517 unsigned BitWidth = Known.getBitWidth();
2518 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2519 assert(BitWidth >= 8 && "Unexpected width!");
2521 Known.Zero |= Mask;
2522 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2523 assert(BitWidth >= 16 && "Unexpected width!");
2525 Known.Zero |= Mask;
2526 }
2527 break;
2528 } break;
2529 }
2530 }
2531 }
2532}
2533
2535 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2536 unsigned Depth) const {
2537 EVT VT = Op.getValueType();
2538 unsigned VTBits = VT.getScalarSizeInBits();
2539 unsigned Opcode = Op.getOpcode();
2540 switch (Opcode) {
2541 case AArch64ISD::CMEQ:
2542 case AArch64ISD::CMGE:
2543 case AArch64ISD::CMGT:
2544 case AArch64ISD::CMHI:
2545 case AArch64ISD::CMHS:
2546 case AArch64ISD::FCMEQ:
2547 case AArch64ISD::FCMGE:
2548 case AArch64ISD::FCMGT:
2549 case AArch64ISD::CMEQz:
2550 case AArch64ISD::CMGEz:
2551 case AArch64ISD::CMGTz:
2552 case AArch64ISD::CMLEz:
2553 case AArch64ISD::CMLTz:
2554 case AArch64ISD::FCMEQz:
2555 case AArch64ISD::FCMGEz:
2556 case AArch64ISD::FCMGTz:
2557 case AArch64ISD::FCMLEz:
2558 case AArch64ISD::FCMLTz:
2559 // Compares return either 0 or all-ones
2560 return VTBits;
2561 case AArch64ISD::VASHR: {
2562 unsigned Tmp =
2563 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2564 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2565 }
2566 }
2567
2568 return 1;
2569}
2570
2572 EVT) const {
2573 return MVT::i64;
2574}
2575
2577 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2578 unsigned *Fast) const {
2579
2580 // Allow SVE loads/stores where the alignment >= the size of the element type,
2581 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2582 // for stores that come from IR, only require element-size alignment (even if
2583 // unaligned accesses are disabled). Without this, these will be forced to
2584 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2585 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2586 if (VT.isScalableVector()) {
2587 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2588 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2589 return true;
2590 }
2591
2592 if (Subtarget->requiresStrictAlign())
2593 return false;
2594
2595 if (Fast) {
2596 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2597 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2598 // See comments in performSTORECombine() for more details about
2599 // these conditions.
2600
2601 // Code that uses clang vector extensions can mark that it
2602 // wants unaligned accesses to be treated as fast by
2603 // underspecifying alignment to be 1 or 2.
2604 Alignment <= 2 ||
2605
2606 // Disregard v2i64. Memcpy lowering produces those and splitting
2607 // them regresses performance on micro-benchmarks and olden/bh.
2608 VT == MVT::v2i64;
2609 }
2610 return true;
2611}
2612
2613// Same as above but handling LLTs instead.
2615 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2616 unsigned *Fast) const {
2617 if (Subtarget->requiresStrictAlign())
2618 return false;
2619
2620 if (Fast) {
2621 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2622 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2623 Ty.getSizeInBytes() != 16 ||
2624 // See comments in performSTORECombine() for more details about
2625 // these conditions.
2626
2627 // Code that uses clang vector extensions can mark that it
2628 // wants unaligned accesses to be treated as fast by
2629 // underspecifying alignment to be 1 or 2.
2630 Alignment <= 2 ||
2631
2632 // Disregard v2i64. Memcpy lowering produces those and splitting
2633 // them regresses performance on micro-benchmarks and olden/bh.
2634 Ty == LLT::fixed_vector(2, 64);
2635 }
2636 return true;
2637}
2638
2639FastISel *
2641 const TargetLibraryInfo *libInfo) const {
2642 return AArch64::createFastISel(funcInfo, libInfo);
2643}
2644
2645const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2646#define MAKE_CASE(V) \
2647 case V: \
2648 return #V;
2649 switch ((AArch64ISD::NodeType)Opcode) {
2651 break;
2980 }
2981#undef MAKE_CASE
2982 return nullptr;
2983}
2984
2987 MachineBasicBlock *MBB) const {
2988 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2989 // phi node:
2990
2991 // OrigBB:
2992 // [... previous instrs leading to comparison ...]
2993 // b.ne TrueBB
2994 // b EndBB
2995 // TrueBB:
2996 // ; Fallthrough
2997 // EndBB:
2998 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2999
3000 MachineFunction *MF = MBB->getParent();
3001 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3002 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
3003 DebugLoc DL = MI.getDebugLoc();
3005
3006 Register DestReg = MI.getOperand(0).getReg();
3007 Register IfTrueReg = MI.getOperand(1).getReg();
3008 Register IfFalseReg = MI.getOperand(2).getReg();
3009 unsigned CondCode = MI.getOperand(3).getImm();
3010 bool NZCVKilled = MI.getOperand(4).isKill();
3011
3012 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
3013 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
3014 MF->insert(It, TrueBB);
3015 MF->insert(It, EndBB);
3016
3017 // Transfer rest of current basic-block to EndBB
3018 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
3019 MBB->end());
3021
3022 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
3023 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
3024 MBB->addSuccessor(TrueBB);
3025 MBB->addSuccessor(EndBB);
3026
3027 // TrueBB falls through to the end.
3028 TrueBB->addSuccessor(EndBB);
3029
3030 if (!NZCVKilled) {
3031 TrueBB->addLiveIn(AArch64::NZCV);
3032 EndBB->addLiveIn(AArch64::NZCV);
3033 }
3034
3035 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
3036 .addReg(IfTrueReg)
3037 .addMBB(TrueBB)
3038 .addReg(IfFalseReg)
3039 .addMBB(MBB);
3040
3041 MI.eraseFromParent();
3042 return EndBB;
3043}
3044
3046 MachineInstr &MI, MachineBasicBlock *BB) const {
3047  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
3048             BB->getParent()->getFunction().getPersonalityFn())) &&
3049 "SEH does not use catchret!");
3050 return BB;
3051}
3052
3055 MachineBasicBlock *MBB) const {
3056 MachineFunction &MF = *MBB->getParent();
3057 MachineBasicBlock::iterator MBBI = MI.getIterator();
3059 const AArch64InstrInfo &TII =
3060 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
3061 Register TargetReg = MI.getOperand(0).getReg();
3062  MachineBasicBlock::iterator NextInst =
3063      TII.probedStackAlloc(MBBI, TargetReg, false);
3064
3065 MI.eraseFromParent();
3066 return NextInst->getParent();
3067}
3068
3070AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
3072 MachineBasicBlock *BB) const {
3073 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3074 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3075
3076 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
3077 MIB.add(MI.getOperand(1)); // slice index register
3078 MIB.add(MI.getOperand(2)); // slice index offset
3079 MIB.add(MI.getOperand(3)); // pg
3080 MIB.add(MI.getOperand(4)); // base
3081 MIB.add(MI.getOperand(5)); // offset
3082
3083 MI.eraseFromParent(); // The pseudo is gone now.
3084 return BB;
3085}
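// For illustration: EmitTileLoad above maps a pseudo such as
// LD1_MXIPXX_H_PSEUDO_B with tile immediate 0 onto LD1_MXIPXX_H_B, where the
// defined tile register becomes BaseReg + 0, i.e. ZAB0 (see the dispatch in
// EmitInstrWithCustomInserter further below).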
3086
3089 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3090  MachineInstrBuilder MIB =
3091      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
3092
3093 MIB.addReg(AArch64::ZA, RegState::Define);
3094 MIB.add(MI.getOperand(0)); // Vector select register
3095 MIB.add(MI.getOperand(1)); // Vector select offset
3096 MIB.add(MI.getOperand(2)); // Base
3097 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
3098
3099 MI.eraseFromParent(); // The pseudo is gone now.
3100 return BB;
3101}
3102
3105 unsigned Opcode,
3106 bool Op0IsDef) const {
3107 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3109  MachineInstrBuilder MIB;
3110 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
3111 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
3112 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
3113 MIB.add(MI.getOperand(I));
3114
3115 MI.eraseFromParent(); // The pseudo is gone now.
3116 return BB;
3117}
3118
3120AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
3122 MachineBasicBlock *BB) const {
3123 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3124 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3125 unsigned StartIdx = 0;
3126
3127 bool HasTile = BaseReg != AArch64::ZA;
3128 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3129 if (HasZPROut) {
3130 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3131 ++StartIdx;
3132 }
3133 if (HasTile) {
3134 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3135 RegState::Define); // Output ZA Tile
3136 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
3137 StartIdx++;
3138 } else {
3139      // Skip adding an output ZPR for instructions whose mnemonic uses the
3139      // za.<sz>[Reg, Imm] operand form.
3140 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3141 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3142 ++StartIdx;
3143 }
3144 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3145 }
3146 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3147 MIB.add(MI.getOperand(I));
3148
3149 MI.eraseFromParent(); // The pseudo is gone now.
3150 return BB;
3151}
3152
3155 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3156  MachineInstrBuilder MIB =
3157      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3158 MIB.add(MI.getOperand(0)); // Mask
3159
3160 unsigned Mask = MI.getOperand(0).getImm();
3161 for (unsigned I = 0; I < 8; I++) {
3162 if (Mask & (1 << I))
3163 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3164 }
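  // For example, a mask immediate of 0b00000101 marks ZAD0 and ZAD2 as
  // implicitly defined by the ZERO_M instruction built above.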
3165
3166 MI.eraseFromParent(); // The pseudo is gone now.
3167 return BB;
3168}
3169
3172 MachineBasicBlock *BB) const {
3173 MachineFunction *MF = BB->getParent();
3174 MachineFrameInfo &MFI = MF->getFrameInfo();
3175  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3176  TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3177 if (TPIDR2.Uses > 0) {
3178 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3179 // Store the buffer pointer to the TPIDR2 stack object.
3180 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
3181 .addReg(MI.getOperand(0).getReg())
3182 .addFrameIndex(TPIDR2.FrameIndex)
3183 .addImm(0);
3184 // Set the reserved bytes (10-15) to zero
3185 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
3186 .addReg(AArch64::WZR)
3187 .addFrameIndex(TPIDR2.FrameIndex)
3188 .addImm(5);
3189 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
3190 .addReg(AArch64::WZR)
3191 .addFrameIndex(TPIDR2.FrameIndex)
3192 .addImm(3);
3193 } else
3194 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3195
3196 BB->remove_instr(&MI);
3197 return BB;
3198}
3199
3202 MachineBasicBlock *BB) const {
3203 MachineFunction *MF = BB->getParent();
3204  MachineFrameInfo &MFI = MF->getFrameInfo();
3205  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3206 // TODO This function grows the stack with a subtraction, which doesn't work
3207 // on Windows. Some refactoring to share the functionality in
3208 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3209 // supports SME
3211 "Lazy ZA save is not yet supported on Windows");
3212
3213 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3214
3215 if (TPIDR2.Uses > 0) {
3216 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3217    MachineRegisterInfo &MRI = MF->getRegInfo();
3218
3219 // The SUBXrs below won't always be emitted in a form that accepts SP
3220 // directly
3221 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3222 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3223 .addReg(AArch64::SP);
3224
3225 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3226 auto Size = MI.getOperand(1).getReg();
3227 auto Dest = MI.getOperand(0).getReg();
3228 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3229 .addReg(Size)
3230 .addReg(Size)
3231 .addReg(SP);
3232 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3233 AArch64::SP)
3234 .addReg(Dest);
3235
3236 // We have just allocated a variable sized object, tell this to PEI.
3237 MFI.CreateVariableSizedObject(Align(16), nullptr);
3238 }
3239
3240 BB->remove_instr(&MI);
3241 return BB;
3242}
3243
3244// TODO: Find a way to merge this with EmitAllocateZABuffer.
3247 MachineBasicBlock *BB) const {
3248 MachineFunction *MF = BB->getParent();
3249  MachineFrameInfo &MFI = MF->getFrameInfo();
3250  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3251  assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() &&
3252 "Lazy ZA save is not yet supported on Windows");
3253
3254 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3255 if (FuncInfo->isSMESaveBufferUsed()) {
3256 // Allocate a buffer object of the size given by MI.getOperand(1).
3257 auto Size = MI.getOperand(1).getReg();
3258 auto Dest = MI.getOperand(0).getReg();
3259 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
3260 .addReg(AArch64::SP)
3261        .addReg(Size)
3262        .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0));
3263 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
3264 .addReg(AArch64::SP);
3265
3266 // We have just allocated a variable sized object, tell this to PEI.
3267 MFI.CreateVariableSizedObject(Align(16), nullptr);
3268 } else
3269 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
3270 MI.getOperand(0).getReg());
3271
3272 BB->remove_instr(&MI);
3273 return BB;
3274}
3275
3278 MachineBasicBlock *BB) const {
3279 // If the buffer is used, emit a call to __arm_sme_state_size()
3280  MachineFunction *MF = BB->getParent();
3281  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3282 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3283 if (FuncInfo->isSMESaveBufferUsed()) {
3284 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3285 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
3286 .addExternalSymbol("__arm_sme_state_size")
3287 .addReg(AArch64::X0, RegState::ImplicitDefine)
3288 .addRegMask(TRI->getCallPreservedMask(
3289 *MF, CallingConv::
3291 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3292 MI.getOperand(0).getReg())
3293 .addReg(AArch64::X0);
3294 } else
3295 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3296 MI.getOperand(0).getReg())
3297 .addReg(AArch64::XZR);
3298 BB->remove_instr(&MI);
3299 return BB;
3300}
3301
3303 MachineInstr &MI, MachineBasicBlock *BB) const {
3304
3305 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3306 if (SMEOrigInstr != -1) {
3307 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3308 uint64_t SMEMatrixType =
3309 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3310    switch (SMEMatrixType) {
3311    case AArch64::SMEMatrixArray:
3312      return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3313    case AArch64::SMEMatrixTileB:
3314      return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3315    case AArch64::SMEMatrixTileH:
3316      return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3317    case AArch64::SMEMatrixTileS:
3318      return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3319    case AArch64::SMEMatrixTileD:
3320      return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3321    case AArch64::SMEMatrixTileQ:
3322      return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3323 }
3324 }
3325
3326 switch (MI.getOpcode()) {
3327 default:
3328#ifndef NDEBUG
3329 MI.dump();
3330#endif
3331 llvm_unreachable("Unexpected instruction for custom inserter!");
3332 case AArch64::InitTPIDR2Obj:
3333 return EmitInitTPIDR2Object(MI, BB);
3334 case AArch64::AllocateZABuffer:
3335 return EmitAllocateZABuffer(MI, BB);
3336 case AArch64::AllocateSMESaveBuffer:
3337 return EmitAllocateSMESaveBuffer(MI, BB);
3338 case AArch64::GetSMESaveSize:
3339 return EmitGetSMESaveSize(MI, BB);
3340 case AArch64::F128CSEL:
3341 return EmitF128CSEL(MI, BB);
3342 case TargetOpcode::STATEPOINT:
3343 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
3344 // while bl call instruction (where statepoint will be lowered at the end)
3345 // has implicit def. This def is early-clobber as it will be set at
3346 // the moment of the call and earlier than any use is read.
3347 // Add this implicit dead def here as a workaround.
3348    MI.addOperand(*MI.getMF(),
3349                  MachineOperand::CreateReg(
3350 AArch64::LR, /*isDef*/ true,
3351 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3352 /*isUndef*/ false, /*isEarlyClobber*/ true));
3353 [[fallthrough]];
3354 case TargetOpcode::STACKMAP:
3355 case TargetOpcode::PATCHPOINT:
3356 return emitPatchPoint(MI, BB);
3357
3358 case TargetOpcode::PATCHABLE_EVENT_CALL:
3359 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3360 return BB;
3361
3362 case AArch64::CATCHRET:
3363 return EmitLoweredCatchRet(MI, BB);
3364
3365 case AArch64::PROBED_STACKALLOC_DYN:
3366 return EmitDynamicProbedAlloc(MI, BB);
3367
3368 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3369 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3370 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3371 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3372 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3373 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3374 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3375 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3376 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3377 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3378 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3379 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3380 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3381 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3382 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3383 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3384 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3385 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3386 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3387 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3388 case AArch64::LDR_ZA_PSEUDO:
3389 return EmitFill(MI, BB);
3390 case AArch64::LDR_TX_PSEUDO:
3391 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3392 case AArch64::STR_TX_PSEUDO:
3393 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3394 case AArch64::ZERO_M_PSEUDO:
3395 return EmitZero(MI, BB);
3396 case AArch64::ZERO_T_PSEUDO:
3397 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3398 case AArch64::MOVT_TIZ_PSEUDO:
3399 return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3400 }
3401}
3402
3403//===----------------------------------------------------------------------===//
3404// AArch64 Lowering private implementation.
3405//===----------------------------------------------------------------------===//
3406
3407//===----------------------------------------------------------------------===//
3408// Lowering Code
3409//===----------------------------------------------------------------------===//
3410
3411// Forward declarations of SVE fixed length lowering helpers
3416 SelectionDAG &DAG);
3419 EVT VT);
3420
3421/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3422static bool isZerosVector(const SDNode *N) {
3423 // Look through a bit convert.
3424 while (N->getOpcode() == ISD::BITCAST)
3425 N = N->getOperand(0).getNode();
3426
3427  if (ISD::isConstantSplatVectorAllZeros(N))
3428    return true;
3429
3430 if (N->getOpcode() != AArch64ISD::DUP)
3431 return false;
3432
3433 auto Opnd0 = N->getOperand(0);
3434 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3435}
3436
3437/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3438/// CC
3440 switch (CC) {
3441 default:
3442 llvm_unreachable("Unknown condition code!");
3443 case ISD::SETNE:
3444 return AArch64CC::NE;
3445 case ISD::SETEQ:
3446 return AArch64CC::EQ;
3447 case ISD::SETGT:
3448 return AArch64CC::GT;
3449 case ISD::SETGE:
3450 return AArch64CC::GE;
3451 case ISD::SETLT:
3452 return AArch64CC::LT;
3453 case ISD::SETLE:
3454 return AArch64CC::LE;
3455 case ISD::SETUGT:
3456 return AArch64CC::HI;
3457 case ISD::SETUGE:
3458 return AArch64CC::HS;
3459 case ISD::SETULT:
3460 return AArch64CC::LO;
3461 case ISD::SETULE:
3462 return AArch64CC::LS;
3463 }
3464}
3465
3466/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3468 AArch64CC::CondCode &CondCode,
3469 AArch64CC::CondCode &CondCode2) {
3470 CondCode2 = AArch64CC::AL;
3471 switch (CC) {
3472 default:
3473 llvm_unreachable("Unknown FP condition!");
3474 case ISD::SETEQ:
3475 case ISD::SETOEQ:
3476 CondCode = AArch64CC::EQ;
3477 break;
3478 case ISD::SETGT:
3479 case ISD::SETOGT:
3480 CondCode = AArch64CC::GT;
3481 break;
3482 case ISD::SETGE:
3483 case ISD::SETOGE:
3484 CondCode = AArch64CC::GE;
3485 break;
3486 case ISD::SETOLT:
3487 CondCode = AArch64CC::MI;
3488 break;
3489 case ISD::SETOLE:
3490 CondCode = AArch64CC::LS;
3491 break;
3492 case ISD::SETONE:
3493 CondCode = AArch64CC::MI;
3494 CondCode2 = AArch64CC::GT;
3495 break;
3496 case ISD::SETO:
3497 CondCode = AArch64CC::VC;
3498 break;
3499 case ISD::SETUO:
3500 CondCode = AArch64CC::VS;
3501 break;
3502 case ISD::SETUEQ:
3503 CondCode = AArch64CC::EQ;
3504 CondCode2 = AArch64CC::VS;
3505 break;
3506 case ISD::SETUGT:
3507 CondCode = AArch64CC::HI;
3508 break;
3509 case ISD::SETUGE:
3510 CondCode = AArch64CC::PL;
3511 break;
3512 case ISD::SETLT:
3513 case ISD::SETULT:
3514 CondCode = AArch64CC::LT;
3515 break;
3516 case ISD::SETLE:
3517 case ISD::SETULE:
3518 CondCode = AArch64CC::LE;
3519 break;
3520 case ISD::SETNE:
3521 case ISD::SETUNE:
3522 CondCode = AArch64CC::NE;
3523 break;
3524 }
3525}
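// As an example of the two-condition mappings above: SETUEQ (unordered or
// equal) cannot be tested with a single AArch64 condition, so it is expressed
// as EQ plus a second VS (unordered) check, i.e. "a == b || isnan(a) || isnan(b)".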
3526
3527/// Convert a DAG fp condition code to an AArch64 CC.
3528/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3529/// should be AND'ed instead of OR'ed.
3531 AArch64CC::CondCode &CondCode,
3532 AArch64CC::CondCode &CondCode2) {
3533 CondCode2 = AArch64CC::AL;
3534 switch (CC) {
3535 default:
3536 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3537 assert(CondCode2 == AArch64CC::AL);
3538 break;
3539 case ISD::SETONE:
3540 // (a one b)
3541 // == ((a olt b) || (a ogt b))
3542 // == ((a ord b) && (a une b))
3543 CondCode = AArch64CC::VC;
3544 CondCode2 = AArch64CC::NE;
3545 break;
3546 case ISD::SETUEQ:
3547 // (a ueq b)
3548 // == ((a uno b) || (a oeq b))
3549 // == ((a ule b) && (a uge b))
3550 CondCode = AArch64CC::PL;
3551 CondCode2 = AArch64CC::LE;
3552 break;
3553 }
3554}
3555
3556/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3557/// CC usable with the vector instructions. Fewer operations are available
3558/// without a real NZCV register, so we have to use less efficient combinations
3559/// to get the same effect.
3561 AArch64CC::CondCode &CondCode,
3562 AArch64CC::CondCode &CondCode2,
3563 bool &Invert) {
3564 Invert = false;
3565 switch (CC) {
3566 default:
3567 // Mostly the scalar mappings work fine.
3568 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3569 break;
3570 case ISD::SETUO:
3571 Invert = true;
3572 [[fallthrough]];
3573 case ISD::SETO:
3574 CondCode = AArch64CC::MI;
3575 CondCode2 = AArch64CC::GE;
3576 break;
3577 case ISD::SETUEQ:
3578 case ISD::SETULT:
3579 case ISD::SETULE:
3580 case ISD::SETUGT:
3581 case ISD::SETUGE:
3582 // All of the compare-mask comparisons are ordered, but we can switch
3583 // between the two by a double inversion. E.g. ULE == !OGT.
3584 Invert = true;
3585 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3586 CondCode, CondCode2);
3587 break;
3588 }
3589}
3590
3592 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3593 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3594 LLVM_DEBUG(dbgs() << "Is imm " << C
3595 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3596 return IsLegal;
3597}
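// Worked examples for the check above: 0x123 is legal (fits in 12 bits),
// 0x123000 is legal (a 12-bit value shifted left by 12), but 0x123456 is not
// representable as an arithmetic immediate and would have to be materialised.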
3598
3599static bool cannotBeIntMin(SDValue CheckedVal, SelectionDAG &DAG) {
3600 KnownBits KnownSrc = DAG.computeKnownBits(CheckedVal);
3601 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3602}
3603
3604// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3605// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3606// can be set differently by this operation. It comes down to whether
3607// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3608// everything is fine. If not then the optimization is wrong. Thus general
3609// comparisons are only valid if op2 != 0.
3610//
3611// So, finally, the only LLVM-native comparisons that don't mention C or V
3612// are the ones that aren't unsigned comparisons. They're the only ones we can
3613// safely use CMN for in the absence of information about op2.
3615 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3616 (isIntEqualitySetCC(CC) ||
3617 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3618 (isSignedIntSetCC(CC) && cannotBeIntMin(Op.getOperand(1), DAG)));
3619}
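// For example, (setcc eq X, (sub 0, Y)) can always be emitted as CMN X, Y,
// whereas (setcc ult X, (sub 0, Y)) may only use CMN when Y is known non-zero:
// for Y == 0, SUBS X, #0 sets the carry flag while ADDS X, #0 does not, so the
// unsigned condition would otherwise be evaluated differently.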
3620
3622 SelectionDAG &DAG, SDValue Chain,
3623 bool IsSignaling) {
3624 EVT VT = LHS.getValueType();
3625 assert(VT != MVT::f128);
3626
3627 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3628
3629 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3630 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3631 {Chain, LHS});
3632 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3633 {LHS.getValue(1), RHS});
3634 Chain = RHS.getValue(1);
3635 }
3636  unsigned Opcode =
3637      IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3638 return DAG.getNode(Opcode, dl, {MVT::i32, MVT::Other}, {Chain, LHS, RHS});
3639}
3640
3642 const SDLoc &dl, SelectionDAG &DAG) {
3643 EVT VT = LHS.getValueType();
3644 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3645
3646 if (VT.isFloatingPoint()) {
3647 assert(VT != MVT::f128);
3648 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3649 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3650 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3651 }
3652 return DAG.getNode(AArch64ISD::FCMP, dl, MVT::i32, LHS, RHS);
3653 }
3654
3655 // The CMP instruction is just an alias for SUBS, and representing it as
3656 // SUBS means that it's possible to get CSE with subtract operations.
3657 // A later phase can perform the optimization of setting the destination
3658 // register to WZR/XZR if it ends up being unused.
3659 unsigned Opcode = AArch64ISD::SUBS;
3660
3661 if (isCMN(RHS, CC, DAG)) {
3662    // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3663 Opcode = AArch64ISD::ADDS;
3664 RHS = RHS.getOperand(1);
3665 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3666 isIntEqualitySetCC(CC)) {
3667    // As we are looking for EQ/NE compares, the operands can be commuted; can
3668    // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3669 Opcode = AArch64ISD::ADDS;
3670 LHS = LHS.getOperand(1);
3671 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3672 if (LHS.getOpcode() == ISD::AND) {
3673 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3674 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3675 // of the signed comparisons.
3676 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3677 DAG.getVTList(VT, MVT_CC),
3678 LHS.getOperand(0),
3679 LHS.getOperand(1));
3680 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3681 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3682 return ANDSNode.getValue(1);
3683 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3684 // Use result of ANDS
3685 return LHS.getValue(1);
3686 }
3687 }
3688
3689 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3690 .getValue(1);
3691}
3692
3693/// \defgroup AArch64CCMP CMP;CCMP matching
3694///
3695/// These functions deal with the formation of CMP;CCMP;... sequences.
3696/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3697/// a comparison. They set the NZCV flags to a predefined value if their
3698/// predicate is false. This allows us to express arbitrary conjunctions, for
3699/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3700/// expressed as:
3701/// cmp A
3702/// ccmp B, inv(CB), CA
3703/// check for CB flags
3704///
3705/// This naturally lets us implement chains of AND operations with SETCC
3706/// operands. And we can even implement some other situations by transforming
3707/// them:
3708/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3709/// negating the flags used in a CCMP/FCCMP operations.
3710/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3711/// by negating the flags we test for afterwards. i.e.
3712/// NEG (CMP CCMP CCCMP ...) can be implemented.
3713/// - Note that we can only ever negate all previously processed results.
3714/// What we can not implement by flipping the flags to test is a negation
3715/// of two sub-trees (because the negation affects all sub-trees emitted so
3716/// far, so the 2nd sub-tree we emit would also affect the first).
3717/// With those tools we can implement some OR operations:
3718/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3719/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3720/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3721/// elimination rules from earlier to implement the whole thing as a
3722/// CCMP/FCCMP chain.
3723///
3724/// As complete example:
3725/// or (or (setCA (cmp A)) (setCB (cmp B)))
3726/// (and (setCC (cmp C)) (setCD (cmp D)))"
3727/// can be reassociated to:
3728/// or (and (setCC (cmp C)) setCD (cmp D))
3729/// (or (setCA (cmp A)) (setCB (cmp B)))
3730/// can be transformed to:
3731/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3732/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3733/// which can be implemented as:
3734/// cmp C
3735/// ccmp D, inv(CD), CC
3736/// ccmp A, CA, inv(CD)
3737/// ccmp B, CB, inv(CA)
3738/// check for CB flags
3739///
3740/// A counterexample is "or (and A B) (and C D)", which translates to
3741/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
3742/// can implement only one of the inner (not) operations, not both.
3743/// @{
3744
3745/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3747 ISD::CondCode CC, SDValue CCOp,
3748 AArch64CC::CondCode Predicate,
3749 AArch64CC::CondCode OutCC,
3750 const SDLoc &DL, SelectionDAG &DAG) {
3751 unsigned Opcode = 0;
3752 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3753
3754 if (LHS.getValueType().isFloatingPoint()) {
3755 assert(LHS.getValueType() != MVT::f128);
3756 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3757 LHS.getValueType() == MVT::bf16) {
3758 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3759 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3760 }
3761 Opcode = AArch64ISD::FCCMP;
3762 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3763 APInt Imm = Const->getAPIntValue();
3764 if (Imm.isNegative() && Imm.sgt(-32)) {
3765 Opcode = AArch64ISD::CCMN;
3766 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3767 }
3768 } else if (isCMN(RHS, CC, DAG)) {
3769 Opcode = AArch64ISD::CCMN;
3770 RHS = RHS.getOperand(1);
3771 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3772 isIntEqualitySetCC(CC)) {
3773    // As we are looking for EQ/NE compares, the operands can be commuted; can
3774    // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction?
3775 Opcode = AArch64ISD::CCMN;
3776 LHS = LHS.getOperand(1);
3777 }
3778 if (Opcode == 0)
3779 Opcode = AArch64ISD::CCMP;
3780
3781  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3782  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3783 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3784 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3785 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3786}
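// For instance, a conditional compare against the constant -5 is emitted as
// CCMN with the immediate 5 (see the Imm.isNegative() handling above), since
// CCMN x, #5 sets the same flags as CCMP x, #-5 would.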
3787
3788/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3789/// expressed as a conjunction. See \ref AArch64CCMP.
3790/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3791/// changing the conditions on the SETCC tests.
3792/// (this means we can call emitConjunctionRec() with
3793/// Negate==true on this sub-tree)
3794/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3795/// cannot do the negation naturally. We are required to
3796/// emit the subtree first in this case.
3797/// \param WillNegate Is true if we are called when the result of this
3798/// subexpression must be negated. This happens when the
3799/// outer expression is an OR. We can use this fact to know
3800/// that we have a double negation (or (or ...) ...) that
3801/// can be implemented for free.
3802static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3803 bool &MustBeFirst, bool WillNegate,
3804 unsigned Depth = 0) {
3805 if (!Val.hasOneUse())
3806 return false;
3807 unsigned Opcode = Val->getOpcode();
3808 if (Opcode == ISD::SETCC) {
3809 if (Val->getOperand(0).getValueType() == MVT::f128)
3810 return false;
3811 CanNegate = true;
3812 MustBeFirst = false;
3813 return true;
3814 }
3815 // Protect against exponential runtime and stack overflow.
3816 if (Depth > 6)
3817 return false;
3818 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3819 bool IsOR = Opcode == ISD::OR;
3820 SDValue O0 = Val->getOperand(0);
3821 SDValue O1 = Val->getOperand(1);
3822 bool CanNegateL;
3823 bool MustBeFirstL;
3824 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3825 return false;
3826 bool CanNegateR;
3827 bool MustBeFirstR;
3828 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3829 return false;
3830
3831 if (MustBeFirstL && MustBeFirstR)
3832 return false;
3833
3834 if (IsOR) {
3835 // For an OR expression we need to be able to naturally negate at least
3836 // one side or we cannot do the transformation at all.
3837 if (!CanNegateL && !CanNegateR)
3838 return false;
3839      // If the result of the OR will be negated and we can naturally negate
3840      // the leaves, then this sub-tree as a whole negates naturally.
3841 CanNegate = WillNegate && CanNegateL && CanNegateR;
3842 // If we cannot naturally negate the whole sub-tree, then this must be
3843 // emitted first.
3844 MustBeFirst = !CanNegate;
3845 } else {
3846 assert(Opcode == ISD::AND && "Must be OR or AND");
3847 // We cannot naturally negate an AND operation.
3848 CanNegate = false;
3849 MustBeFirst = MustBeFirstL || MustBeFirstR;
3850 }
3851 return true;
3852 }
3853 return false;
3854}
3855
3856/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3857/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
3858/// Tries to transform the given i1 producing node @p Val to a series of compare
3859/// and conditional compare operations. @returns an NZCV flags producing node
3860/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
3861/// transformation was not possible.
3862/// \p Negate is true if we want this sub-tree to be negated just by changing
3863/// SETCC conditions.
3865 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3866 AArch64CC::CondCode Predicate) {
3867 // We're at a tree leaf, produce a conditional comparison operation.
3868 unsigned Opcode = Val->getOpcode();
3869 if (Opcode == ISD::SETCC) {
3870 SDValue LHS = Val->getOperand(0);
3871 SDValue RHS = Val->getOperand(1);
3872 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3873 bool isInteger = LHS.getValueType().isInteger();
3874 if (Negate)
3875 CC = getSetCCInverse(CC, LHS.getValueType());
3876 SDLoc DL(Val);
3877 // Determine OutCC and handle FP special case.
3878 if (isInteger) {
3879 OutCC = changeIntCCToAArch64CC(CC);
3880 } else {
3881 assert(LHS.getValueType().isFloatingPoint());
3882 AArch64CC::CondCode ExtraCC;
3883 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3884 // Some floating point conditions can't be tested with a single condition
3885 // code. Construct an additional comparison in this case.
3886 if (ExtraCC != AArch64CC::AL) {
3887 SDValue ExtraCmp;
3888 if (!CCOp.getNode())
3889 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3890 else
3891 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3892 ExtraCC, DL, DAG);
3893 CCOp = ExtraCmp;
3894 Predicate = ExtraCC;
3895 }
3896 }
3897
3898 // Produce a normal comparison if we are first in the chain
3899 if (!CCOp)
3900 return emitComparison(LHS, RHS, CC, DL, DAG);
3901 // Otherwise produce a ccmp.
3902 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3903 DAG);
3904 }
3905 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3906
3907 bool IsOR = Opcode == ISD::OR;
3908
3909 SDValue LHS = Val->getOperand(0);
3910 bool CanNegateL;
3911 bool MustBeFirstL;
3912 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3913 assert(ValidL && "Valid conjunction/disjunction tree");
3914 (void)ValidL;
3915
3916 SDValue RHS = Val->getOperand(1);
3917 bool CanNegateR;
3918 bool MustBeFirstR;
3919 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3920 assert(ValidR && "Valid conjunction/disjunction tree");
3921 (void)ValidR;
3922
3923 // Swap sub-tree that must come first to the right side.
3924 if (MustBeFirstL) {
3925 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3926 std::swap(LHS, RHS);
3927 std::swap(CanNegateL, CanNegateR);
3928 std::swap(MustBeFirstL, MustBeFirstR);
3929 }
3930
3931 bool NegateR;
3932 bool NegateAfterR;
3933 bool NegateL;
3934 bool NegateAfterAll;
3935 if (Opcode == ISD::OR) {
3936 // Swap the sub-tree that we can negate naturally to the left.
3937 if (!CanNegateL) {
3938 assert(CanNegateR && "at least one side must be negatable");
3939 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3940 assert(!Negate);
3941 std::swap(LHS, RHS);
3942 NegateR = false;
3943 NegateAfterR = true;
3944 } else {
3945 // Negate the left sub-tree if possible, otherwise negate the result.
3946 NegateR = CanNegateR;
3947 NegateAfterR = !CanNegateR;
3948 }
3949 NegateL = true;
3950 NegateAfterAll = !Negate;
3951 } else {
3952 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3953 assert(!Negate && "Valid conjunction/disjunction tree");
3954
3955 NegateL = false;
3956 NegateR = false;
3957 NegateAfterR = false;
3958 NegateAfterAll = false;
3959 }
3960
3961 // Emit sub-trees.
3962 AArch64CC::CondCode RHSCC;
3963 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3964 if (NegateAfterR)
3965 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3966 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3967 if (NegateAfterAll)
3968 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3969 return CmpL;
3970}
3971
3972/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
3973/// In some cases this is even possible with OR operations in the expression.
3974/// See \ref AArch64CCMP.
3975/// \see emitConjunctionRec().
3977 AArch64CC::CondCode &OutCC) {
3978 bool DummyCanNegate;
3979 bool DummyMustBeFirst;
3980 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3981 return SDValue();
3982
3983 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3984}
3985
3986/// @}
3987
3988/// Returns how profitable it is to fold a comparison's operand's shift and/or
3989/// extension operations.
3991 auto isSupportedExtend = [&](SDValue V) {
3992 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3993 return true;
3994
3995 if (V.getOpcode() == ISD::AND)
3996 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3997 uint64_t Mask = MaskCst->getZExtValue();
3998 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3999 }
4000
4001 return false;
4002 };
4003
4004 if (!Op.hasOneUse())
4005 return 0;
4006
4007 if (isSupportedExtend(Op))
4008 return 1;
4009
4010 unsigned Opc = Op.getOpcode();
4011 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
4012 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4013 uint64_t Shift = ShiftCst->getZExtValue();
4014 if (isSupportedExtend(Op.getOperand(0)))
4015 return (Shift <= 4) ? 2 : 1;
4016 EVT VT = Op.getValueType();
4017 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
4018 return 1;
4019 }
4020
4021 return 0;
4022}
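// Worked examples of the profitability score above (assuming each node has a
// single use): (and x, 0xff) on its own scores 1 (a supported extend),
// (shl (and x, 0xff), 3) scores 2 (extend plus a small shift), a plain
// in-range (shl x, 5) scores 1, and anything else scores 0.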
4023
4025 SDValue &AArch64cc, SelectionDAG &DAG,
4026 const SDLoc &dl) {
4027 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4028 EVT VT = RHS.getValueType();
4029 uint64_t C = RHSC->getZExtValue();
4030 if (!isLegalArithImmed(C)) {
4031 // Constant does not fit, try adjusting it by one?
4032 switch (CC) {
4033 default:
4034 break;
4035 case ISD::SETLT:
4036 case ISD::SETGE:
4037 if ((VT == MVT::i32 && C != 0x80000000 &&
4038 isLegalArithImmed((uint32_t)(C - 1))) ||
4039 (VT == MVT::i64 && C != 0x80000000ULL &&
4040           isLegalArithImmed(C - 1ULL))) {
4041        CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4042 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
4043 RHS = DAG.getConstant(C, dl, VT);
4044 }
4045 break;
4046 case ISD::SETULT:
4047 case ISD::SETUGE:
4048 if ((VT == MVT::i32 && C != 0 &&
4049 isLegalArithImmed((uint32_t)(C - 1))) ||
4050          (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
4051        CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4052 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
4053 RHS = DAG.getConstant(C, dl, VT);
4054 }
4055 break;
4056 case ISD::SETLE:
4057 case ISD::SETGT:
4058 if ((VT == MVT::i32 && C != INT32_MAX &&
4059 isLegalArithImmed((uint32_t)(C + 1))) ||
4060 (VT == MVT::i64 && C != INT64_MAX &&
4061           isLegalArithImmed(C + 1ULL))) {
4062        CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4063 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
4064 RHS = DAG.getConstant(C, dl, VT);
4065 }
4066 break;
4067 case ISD::SETULE:
4068 case ISD::SETUGT:
4069 if ((VT == MVT::i32 && C != UINT32_MAX &&
4070 isLegalArithImmed((uint32_t)(C + 1))) ||
4071 (VT == MVT::i64 && C != UINT64_MAX &&
4072           isLegalArithImmed(C + 1ULL))) {
4073        CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4074 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
4075 RHS = DAG.getConstant(C, dl, VT);
4076 }
4077 break;
4078 }
4079 }
4080 }
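  // As a concrete instance of the adjustment above: "x < 0x1001" (SETLT) uses
  // an illegal immediate, but subtracting one gives "x <= 0x1000" (SETLE),
  // and 0x1000 is encodable as a shifted 12-bit arithmetic immediate.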
4081
4082 // Comparisons are canonicalized so that the RHS operand is simpler than the
4083 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
4084 // can fold some shift+extend operations on the RHS operand, so swap the
4085 // operands if that can be done.
4086 //
4087 // For example:
4088 // lsl w13, w11, #1
4089 // cmp w13, w12
4090 // can be turned into:
4091 // cmp w12, w11, lsl #1
4092 if (!isa<ConstantSDNode>(RHS) ||
4093 !isLegalArithImmed(RHS->getAsAPIntVal().abs().getZExtValue())) {
4094 bool LHSIsCMN = isCMN(LHS, CC, DAG);
4095 bool RHSIsCMN = isCMN(RHS, CC, DAG);
4096 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
4097 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
4098
4099 if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
4100 getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
4101      std::swap(LHS, RHS);
4102      CC = ISD::getSetCCSwappedOperands(CC);
4103 }
4104 }
4105
4106 SDValue Cmp;
4107 AArch64CC::CondCode AArch64CC;
4108 if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
4109 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
4110
4111 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
4112 // For the i8 operand, the largest immediate is 255, so this can be easily
4113 // encoded in the compare instruction. For the i16 operand, however, the
4114 // largest immediate cannot be encoded in the compare.
4115 // Therefore, use a sign extending load and cmn to avoid materializing the
4116 // -1 constant. For example,
4117 // movz w1, #65535
4118 // ldrh w0, [x0, #0]
4119 // cmp w0, w1
4120 // >
4121 // ldrsh w0, [x0, #0]
4122 // cmn w0, #1
4123    // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
4124 // if and only if (sext LHS) == (sext RHS). The checks are in place to
4125 // ensure both the LHS and RHS are truly zero extended and to make sure the
4126 // transformation is profitable.
4127 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
4128 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
4129 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
4130 LHS.getNode()->hasNUsesOfValue(1, 0)) {
4131 int16_t ValueofRHS = RHS->getAsZExtVal();
4132 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
4133 SDValue SExt =
4134 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
4135 DAG.getValueType(MVT::i16));
4136 Cmp = emitComparison(
4137 SExt, DAG.getSignedConstant(ValueofRHS, dl, RHS.getValueType()), CC,
4138 dl, DAG);
4139 AArch64CC = changeIntCCToAArch64CC(CC);
4140 }
4141 }
4142
4143 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
4144 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
4145 if ((CC == ISD::SETNE) ^ RHSC->isZero())
4146 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
4147 }
4148 }
4149 }
4150
4151 if (!Cmp) {
4152 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
4153 AArch64CC = changeIntCCToAArch64CC(CC);
4154 }
4155 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
4156 return Cmp;
4157}
4158
4159static std::pair<SDValue, SDValue>
4161 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
4162 "Unsupported value type");
4163 SDValue Value, Overflow;
4164 SDLoc DL(Op);
4165 SDValue LHS = Op.getOperand(0);
4166 SDValue RHS = Op.getOperand(1);
4167 unsigned Opc = 0;
4168 switch (Op.getOpcode()) {
4169 default:
4170 llvm_unreachable("Unknown overflow instruction!");
4171 case ISD::SADDO:
4172 Opc = AArch64ISD::ADDS;
4173 CC = AArch64CC::VS;
4174 break;
4175 case ISD::UADDO:
4176 Opc = AArch64ISD::ADDS;
4177 CC = AArch64CC::HS;
4178 break;
4179 case ISD::SSUBO:
4180 Opc = AArch64ISD::SUBS;
4181 CC = AArch64CC::VS;
4182 break;
4183 case ISD::USUBO:
4184 Opc = AArch64ISD::SUBS;
4185 CC = AArch64CC::LO;
4186 break;
4187  // Multiply needs a little bit of extra work.
4188 case ISD::SMULO:
4189 case ISD::UMULO: {
4190 CC = AArch64CC::NE;
4191 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4192 if (Op.getValueType() == MVT::i32) {
4193 // Extend to 64-bits, then perform a 64-bit multiply.
4194 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4195 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4196 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4197 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4198 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4199
4200 // Check that the result fits into a 32-bit integer.
4201 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
4202 if (IsSigned) {
4203 // cmp xreg, wreg, sxtw
4204 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4205 Overflow =
4206 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4207 } else {
4208 // tst xreg, #0xffffffff00000000
4209 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4210 Overflow =
4211 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4212 }
4213 break;
4214 }
4215 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4216 // For the 64 bit multiply
4217 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4218 if (IsSigned) {
4219 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4220 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4221 DAG.getConstant(63, DL, MVT::i64));
4222 // It is important that LowerBits is last, otherwise the arithmetic
4223 // shift will not be folded into the compare (SUBS).
4224 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
4225 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4226 .getValue(1);
4227 } else {
4228 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4229 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
4230 Overflow =
4231 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4232 DAG.getConstant(0, DL, MVT::i64),
4233 UpperBits).getValue(1);
4234 }
4235 break;
4236 }
4237 } // switch (...)
4238
4239 if (Opc) {
4240 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
4241
4242 // Emit the AArch64 operation with overflow check.
4243 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4244 Overflow = Value.getValue(1);
4245 }
4246 return std::make_pair(Value, Overflow);
4247}
4248
4249SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4250 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4251 !Subtarget->isNeonAvailable()))
4252 return LowerToScalableOp(Op, DAG);
4253
4254 SDValue Sel = Op.getOperand(0);
4255 SDValue Other = Op.getOperand(1);
4256 SDLoc dl(Sel);
4257
4258 // If the operand is an overflow checking operation, invert the condition
4259 // code and kill the Not operation. I.e., transform:
4260 // (xor (overflow_op_bool, 1))
4261 // -->
4262 // (csel 1, 0, invert(cc), overflow_op_bool)
4263 // ... which later gets transformed to just a cset instruction with an
4264 // inverted condition code, rather than a cset + eor sequence.
4265  if (isOverflowIntrOpRes(Sel)) {
4266    // Only lower legal XALUO ops.
4267    if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
4268      return SDValue();
4269
4270 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4271 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4272    AArch64CC::CondCode CC;
4273    SDValue Value, Overflow;
4274 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4275 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4276 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
4277 CCVal, Overflow);
4278 }
4279 // If neither operand is a SELECT_CC, give up.
4280 if (Sel.getOpcode() != ISD::SELECT_CC)
4281 std::swap(Sel, Other);
4282 if (Sel.getOpcode() != ISD::SELECT_CC)
4283 return Op;
4284
4285 // The folding we want to perform is:
4286 // (xor x, (select_cc a, b, cc, 0, -1) )
4287 // -->
4288 // (csel x, (xor x, -1), cc ...)
4289 //
4290 // The latter will get matched to a CSINV instruction.
4291
4292 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4293 SDValue LHS = Sel.getOperand(0);
4294 SDValue RHS = Sel.getOperand(1);
4295 SDValue TVal = Sel.getOperand(2);
4296 SDValue FVal = Sel.getOperand(3);
4297
4298 // FIXME: This could be generalized to non-integer comparisons.
4299 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4300 return Op;
4301
4302 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4303 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4304
4305 // The values aren't constants, this isn't the pattern we're looking for.
4306 if (!CFVal || !CTVal)
4307 return Op;
4308
4309 // We can commute the SELECT_CC by inverting the condition. This
4310 // might be needed to make this fit into a CSINV pattern.
4311 if (CTVal->isAllOnes() && CFVal->isZero()) {
4312 std::swap(TVal, FVal);
4313 std::swap(CTVal, CFVal);
4314 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4315 }
4316
4317 // If the constants line up, perform the transform!
4318 if (CTVal->isZero() && CFVal->isAllOnes()) {
4319 SDValue CCVal;
4320 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
4321
4322 FVal = Other;
4323 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
4324 DAG.getAllOnesConstant(dl, Other.getValueType()));
4325
4326 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
4327 CCVal, Cmp);
4328 }
4329
4330 return Op;
4331}
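// Roughly, for the DAG pattern (xor x, (select_cc a, b, cc, 0, -1)) the
// transform above produces
//   cmp   a, b
//   csinv x', x, x, cc
// so the "xor with all-ones" disappears into CSINV's inverting operand.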
4332
4333// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4334// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4335// sets 'C' bit to 0.
4337 SDLoc DL(Value);
4338 EVT VT = Value.getValueType();
4339 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4340 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4341 SDValue Cmp =
4342 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
4343 return Cmp.getValue(1);
4344}
4345
4346// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4347// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4349 bool Invert) {
4350 assert(Glue.getResNo() == 1);
4351 SDLoc DL(Glue);
4352 SDValue Zero = DAG.getConstant(0, DL, VT);
4353 SDValue One = DAG.getConstant(1, DL, VT);
4354 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4355 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
4356 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4357}
4358
4359// Value is 1 if 'V' bit of NZCV is 1, else 0
4361 assert(Glue.getResNo() == 1);
4362 SDLoc DL(Glue);
4363 SDValue Zero = DAG.getConstant(0, DL, VT);
4364 SDValue One = DAG.getConstant(1, DL, VT);
4365 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
4366 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4367}
4368
4369// This lowering is inefficient, but it will get cleaned up by
4370// `foldOverflowCheck`
4372 unsigned Opcode, bool IsSigned) {
4373 EVT VT0 = Op.getValue(0).getValueType();
4374 EVT VT1 = Op.getValue(1).getValueType();
4375
4376 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4377 return SDValue();
4378
4379 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4380 SDValue OpLHS = Op.getOperand(0);
4381 SDValue OpRHS = Op.getOperand(1);
4382 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4383
4384 SDLoc DL(Op);
4385 SDVTList VTs = DAG.getVTList(VT0, VT1);
4386
4387 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
4388 OpRHS, OpCarryIn);
4389
4390 SDValue OutFlag =
4391 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4392 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4393
4394 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
4395}
4396
4398 // Let legalize expand this if it isn't a legal type yet.
4399 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4400 return SDValue();
4401
4402  SDLoc dl(Op);
4403  AArch64CC::CondCode CC;
4404 // The actual operation that sets the overflow or carry flag.
4405 SDValue Value, Overflow;
4406 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4407
4408 // We use 0 and 1 as false and true values.
4409 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4410 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4411
4412 // We use an inverted condition, because the conditional select is inverted
4413 // too. This will allow it to be selected to a single instruction:
4414 // CSINC Wd, WZR, WZR, invert(cond).
4415 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4416 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
4417 CCVal, Overflow);
4418
4419 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4420 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4421}
4422
4423// Prefetch operands are:
4424// 1: Address to prefetch
4425// 2: bool isWrite
4426// 3: int locality (0 = no locality ... 3 = extreme locality)
4427// 4: bool isDataCache
4429 SDLoc DL(Op);
4430 unsigned IsWrite = Op.getConstantOperandVal(2);
4431 unsigned Locality = Op.getConstantOperandVal(3);
4432 unsigned IsData = Op.getConstantOperandVal(4);
4433
4434 bool IsStream = !Locality;
4435 // When the locality number is set
4436 if (Locality) {
4437 // The front-end should have filtered out the out-of-range values
4438 assert(Locality <= 3 && "Prefetch locality out-of-range");
4439 // The locality degree is the opposite of the cache speed.
4440 // Put the number the other way around.
4441 // The encoding starts at 0 for level 1
4442 Locality = 3 - Locality;
4443 }
4444
4445  // Build the mask value encoding the expected behavior.
4446 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4447 (!IsData << 3) | // IsDataCache bit
4448 (Locality << 1) | // Cache level bits
4449 (unsigned)IsStream; // Stream bit
4450 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4451 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4452 Op.getOperand(1));
4453}
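// Worked encoding example: a read prefetch of data with maximal locality
// (e.g. __builtin_prefetch(p, /*rw=*/0, /*locality=*/3)) gives IsWrite=0,
// IsData=1 and Locality=3-3=0, so PrfOp == 0b00000, i.e. PLDL1KEEP; dropping
// the locality to 0 sets the stream bit instead, giving PLDL1STRM (0b00001).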
4454
4455// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z is
4456// a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of SUBS
4457// (AND X Y) Z, which produces a better result with emitComparison.
4459 SelectionDAG &DAG, const SDLoc dl) {
4460 if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4461 ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
4462 ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
4463 if (LHSConstOp && RHSConst) {
4464 uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4465 uint64_t RHSConstant = RHSConst->getZExtValue();
4466 if (isPowerOf2_64(RHSConstant)) {
4467 uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4468 LHS =
4469 DAG.getNode(ISD::AND, dl, LHS.getValueType(), LHS.getOperand(0),
4470 DAG.getConstant(NewMaskValue, dl, LHS.getValueType()));
4471 RHS = DAG.getConstant(0, dl, RHS.getValueType());
4472 CC = ISD::SETEQ;
4473 }
4474 }
4475 }
4476}
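// For example, with X & 0xFF on the left and 8 on the right, "(X & 0xFF) u< 8"
// becomes "(X & 0xF8) == 0", since 0xFF & ~(8 - 1) == 0xF8; the equality form
// then maps onto ANDS/TST rather than SUBS.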
4477
4478SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4479 SelectionDAG &DAG) const {
4480 EVT VT = Op.getValueType();
4481 if (VT.isScalableVector()) {
4482 SDValue SrcVal = Op.getOperand(0);
4483
4484 if (SrcVal.getValueType().getScalarType() == MVT::bf16) {
4485 // bf16 and f32 share the same exponent range so the conversion requires
4486 // them to be aligned with the new mantissa bits zero'd. This is just a
4487 // left shift that is best to isel directly.
4488 if (VT == MVT::nxv2f32 || VT == MVT::nxv4f32)
4489 return Op;
4490
4491 if (VT != MVT::nxv2f64)
4492 return SDValue();
4493
4494 // Break other conversions in two with the first part converting to f32
4495 // and the second using native f32->VT instructions.
4496 SDLoc DL(Op);
4497 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4498 DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4499 }
4500
4501 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4502 }
4503
4504 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4505 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4506
4507 bool IsStrict = Op->isStrictFPOpcode();
4508 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4509 EVT Op0VT = Op0.getValueType();
4510 if (VT == MVT::f64) {
4511    // f32->f64 and f16->f64 extends are legal.
4512 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4513 return Op;
4514 // Split bf16->f64 extends into two fpextends.
4515 if (Op0VT == MVT::bf16 && IsStrict) {
4516 SDValue Ext1 =
4517 DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4518 {Op0, Op.getOperand(0)});
4519 return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4520 {Ext1, Ext1.getValue(1)});
4521 }
4522 if (Op0VT == MVT::bf16)
4523 return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4524 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4525 return SDValue();
4526 }
4527
4528 if (VT.getScalarType() == MVT::f32) {
4529    // f16->f32 extends are legal for f32 and v4f32.
4530 if (Op0VT.getScalarType() == MVT::f16)
4531 return Op;
4532 if (Op0VT.getScalarType() == MVT::bf16) {
4533 SDLoc DL(Op);
4534 EVT IVT = VT.changeTypeToInteger();
4535 if (!Op0VT.isVector()) {
4536 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4bf16, Op0);
4537 IVT = MVT::v4i32;
4538 }
4539
4540 EVT Op0IVT = Op0.getValueType().changeTypeToInteger();
4541 SDValue Ext =
4542 DAG.getNode(ISD::ANY_EXTEND, DL, IVT, DAG.getBitcast(Op0IVT, Op0));
4543 SDValue Shift =
4544 DAG.getNode(ISD::SHL, DL, IVT, Ext, DAG.getConstant(16, DL, IVT));
4545 if (!Op0VT.isVector())
4546 Shift = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Shift,
4547 DAG.getConstant(0, DL, MVT::i64));
4548 Shift = DAG.getBitcast(VT, Shift);
4549 return IsStrict ? DAG.getMergeValues({Shift, Op.getOperand(0)}, DL)
4550 : Shift;
4551 }
4552 return SDValue();
4553 }
4554
4555 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4556 return SDValue();
4557}
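
// A small standalone sketch of why the bf16 -> f32 extension above is only a
// 16-bit left shift; the helper name bf16BitsToFloat is illustrative. bf16 is
// the top half of an IEEE-754 binary32 value, so widening just zeroes the low
// mantissa bits.
#include <cstdint>
#include <cstring>

static float bf16BitsToFloat(uint16_t Bits) {
  uint32_t Widened = static_cast<uint32_t>(Bits) << 16; // new mantissa bits are zero
  float F;
  std::memcpy(&F, &Widened, sizeof(F));
  return F;
}
// Example: 0x3F80 (bf16 1.0) widens to 0x3F800000, which is 1.0f.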
4558
4559SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4560 SelectionDAG &DAG) const {
4561 EVT VT = Op.getValueType();
4562 bool IsStrict = Op->isStrictFPOpcode();
4563 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4564 EVT SrcVT = SrcVal.getValueType();
4565 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4566
4567 if (VT.isScalableVector()) {
4568 if (VT.getScalarType() != MVT::bf16)
4569 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4570
4571 SDLoc DL(Op);
4572 constexpr EVT I32 = MVT::nxv4i32;
4573 auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };
4574
4575 SDValue NaN;
4576 SDValue Narrow;
4577
4578 if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4579 if (Subtarget->hasBF16())
4580 return LowerToPredicatedOp(Op, DAG,
4581                                   AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4582
4583 Narrow = getSVESafeBitCast(I32, SrcVal, DAG);
4584
4585 // Set the quiet bit.
4586 if (!DAG.isKnownNeverSNaN(SrcVal))
4587 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
4588 } else if (SrcVT == MVT::nxv2f64 &&
4589 (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4590 // Round to float without introducing rounding errors and try again.
4591 SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
4592 Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
4593 Pg, SrcVal, DAG.getUNDEF(MVT::nxv2f32));
4594
4595      SmallVector<SDValue, 3> NewOps;
4596      if (IsStrict)
4597 NewOps.push_back(Op.getOperand(0));
4598 NewOps.push_back(Narrow);
4599 NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
4600 return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
4601 } else
4602 return SDValue();
4603
4604 if (!Trunc) {
4605 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4606 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
4607 SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
4608 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4609 }
4610
4611 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4612 // 0x80000000.
4613 if (NaN) {
4614 EVT I1 = I32.changeElementType(MVT::i1);
4615 EVT CondVT = VT.changeElementType(MVT::i1);
4616 SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
4617 IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
4618 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4619 }
4620
4621 // Now that we have rounded, shift the bits into position.
4622 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4623 return getSVESafeBitCast(VT, Narrow, DAG);
4624 }
4625
4626 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4627 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4628
4629 // Expand cases where the result type is BF16 but we don't have hardware
4630 // instructions to lower it.
4631 if (VT.getScalarType() == MVT::bf16 &&
4632 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4633 Subtarget->hasBF16())) {
4634 SDLoc dl(Op);
4635 SDValue Narrow = SrcVal;
4636 SDValue NaN;
4637 EVT I32 = SrcVT.changeElementType(MVT::i32);
4638 EVT F32 = SrcVT.changeElementType(MVT::f32);
4639 if (SrcVT.getScalarType() == MVT::f32) {
4640 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4641 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4642 if (!NeverSNaN) {
4643 // Set the quiet bit.
4644 NaN = DAG.getNode(ISD::OR, dl, I32, Narrow,
4645 DAG.getConstant(0x400000, dl, I32));
4646 }
4647 } else if (SrcVT.getScalarType() == MVT::f64) {
4648 Narrow = DAG.getNode(AArch64ISD::FCVTXN, dl, F32, Narrow);
4649 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4650 } else {
4651 return SDValue();
4652 }
4653 if (!Trunc) {
4654 SDValue One = DAG.getConstant(1, dl, I32);
4655 SDValue Lsb = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4656 DAG.getShiftAmountConstant(16, I32, dl));
4657 Lsb = DAG.getNode(ISD::AND, dl, I32, Lsb, One);
4658 SDValue RoundingBias =
4659 DAG.getNode(ISD::ADD, dl, I32, DAG.getConstant(0x7fff, dl, I32), Lsb);
4660 Narrow = DAG.getNode(ISD::ADD, dl, I32, Narrow, RoundingBias);
4661 }
4662
4663 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4664 // 0x80000000.
4665 if (NaN) {
4666 SDValue IsNaN = DAG.getSetCC(
4667 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4668 SrcVal, SrcVal, ISD::SETUO);
4669 Narrow = DAG.getSelect(dl, I32, IsNaN, NaN, Narrow);
4670 }
4671
4672 // Now that we have rounded, shift the bits into position.
4673 Narrow = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4674 DAG.getShiftAmountConstant(16, I32, dl));
4675 if (VT.isVector()) {
4676 EVT I16 = I32.changeVectorElementType(MVT::i16);
4677 Narrow = DAG.getNode(ISD::TRUNCATE, dl, I16, Narrow);
4678 return DAG.getNode(ISD::BITCAST, dl, VT, Narrow);
4679 }
4680 Narrow = DAG.getNode(ISD::BITCAST, dl, F32, Narrow);
4681 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
4682 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, dl)
4683 : Result;
4684 }
4685
4686 if (SrcVT != MVT::f128) {
4687 // Expand cases where the input is a vector bigger than NEON.
4688    if (useSVEForFixedLengthVectorVT(SrcVT))
4689      return SDValue();
4690
4691 // It's legal except when f128 is involved
4692 return Op;
4693 }
4694
4695 return SDValue();
4696}
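
// A standalone sketch of the f32 -> bf16 rounding performed above; the helper
// name floatToBF16Bits is illustrative. It adds the 0x7fff bias plus the lsb
// of the kept half (round to nearest, ties to even) and sets the 0x400000
// quiet bit for NaNs so that rounding cannot corrupt a NaN.
#include <cmath>
#include <cstdint>
#include <cstring>

static uint16_t floatToBF16Bits(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  if (std::isnan(F))
    return static_cast<uint16_t>((Bits | 0x400000u) >> 16); // quiet the NaN
  uint32_t Lsb = (Bits >> 16) & 1u;
  return static_cast<uint16_t>((Bits + 0x7fffu + Lsb) >> 16);
}
// Example: 1.00390625f (0x3F808000, an exact tie) rounds down to 0x3F80 (1.0).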
4697
4698SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4699 SelectionDAG &DAG) const {
4700 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4701 // Any additional optimization in this function should be recorded
4702 // in the cost tables.
4703 bool IsStrict = Op->isStrictFPOpcode();
4704 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4705 EVT VT = Op.getValueType();
4706
4707 if (VT.isScalableVector()) {
4708 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4709                          ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4710                          : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4711    return LowerToPredicatedOp(Op, DAG, Opcode);
4712 }
4713
4714 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4715 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4716 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4717
4718 unsigned NumElts = InVT.getVectorNumElements();
4719
4720 // f16 conversions are promoted to f32 when full fp16 is not supported.
4721 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4722 InVT.getVectorElementType() == MVT::bf16) {
4723 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4724 SDLoc dl(Op);
4725 if (IsStrict) {
4726 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4727 {Op.getOperand(0), Op.getOperand(1)});
4728 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4729 {Ext.getValue(1), Ext.getValue(0)});
4730 }
4731 return DAG.getNode(
4732 Op.getOpcode(), dl, Op.getValueType(),
4733 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
4734 }
4735
4736 uint64_t VTSize = VT.getFixedSizeInBits();
4737 uint64_t InVTSize = InVT.getFixedSizeInBits();
4738 if (VTSize < InVTSize) {
4739 SDLoc dl(Op);
4740 if (IsStrict) {
4741      InVT = InVT.changeVectorElementTypeToInteger();
4742      SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4743 {Op.getOperand(0), Op.getOperand(1)});
4744 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4745 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
4746 }
4747 SDValue Cv =
4748 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
4749 Op.getOperand(0));
4750 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4751 }
4752
4753 if (VTSize > InVTSize) {
4754 SDLoc dl(Op);
4755 MVT ExtVT =
4758 if (IsStrict) {
4759 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4760 {Op.getOperand(0), Op.getOperand(1)});
4761 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4762 {Ext.getValue(1), Ext.getValue(0)});
4763 }
4764 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
4765 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
4766 }
4767
4768 // Use a scalar operation for conversions between single-element vectors of
4769 // the same size.
4770 if (NumElts == 1) {
4771 SDLoc dl(Op);
4772 SDValue Extract = DAG.getNode(
4773        ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4774        Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4775 EVT ScalarVT = VT.getScalarType();
4776 if (IsStrict)
4777 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4778 {Op.getOperand(0), Extract});
4779 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4780 }
4781
4782  // The remaining conversions preserve the element count and size, so they
4783  // are legal as-is.
4783 return Op;
4784}
4785
4786SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4787 SelectionDAG &DAG) const {
4788 bool IsStrict = Op->isStrictFPOpcode();
4789 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4790
4791 if (SrcVal.getValueType().isVector())
4792 return LowerVectorFP_TO_INT(Op, DAG);
4793
4794 // f16 conversions are promoted to f32 when full fp16 is not supported.
4795 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4796 SrcVal.getValueType() == MVT::bf16) {
4797 SDLoc dl(Op);
4798 if (IsStrict) {
4799 SDValue Ext =
4800 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4801 {Op.getOperand(0), SrcVal});
4802 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4803 {Ext.getValue(1), Ext.getValue(0)});
4804 }
4805 return DAG.getNode(
4806 Op.getOpcode(), dl, Op.getValueType(),
4807 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4808 }
4809
4810 if (SrcVal.getValueType() != MVT::f128) {
4811 // It's legal except when f128 is involved
4812 return Op;
4813 }
4814
4815 return SDValue();
4816}
4817
4818SDValue
4819AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4820 SelectionDAG &DAG) const {
4821 // AArch64 FP-to-int conversions saturate to the destination element size, so
4822 // we can lower common saturating conversions to simple instructions.
4823 SDValue SrcVal = Op.getOperand(0);
4824 EVT SrcVT = SrcVal.getValueType();
4825 EVT DstVT = Op.getValueType();
4826 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4827
4828 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4829 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4830 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4831 assert(SatWidth <= DstElementWidth &&
4832 "Saturation width cannot exceed result width");
4833
4834 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4835 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4836 // types, so this is hard to reach.
4837 if (DstVT.isScalableVector())
4838 return SDValue();
4839
4840 EVT SrcElementVT = SrcVT.getVectorElementType();
4841
4842 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4843 SDLoc DL(Op);
4844 SDValue SrcVal2;
4845 if ((SrcElementVT == MVT::f16 &&
4846 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4847 SrcElementVT == MVT::bf16) {
4848 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4849 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
4850 // If we are extending to a v8f32, split into two v4f32 to produce legal
4851 // types.
4852 if (F32VT.getSizeInBits() > 128) {
4853 std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
4854 F32VT = F32VT.getHalfNumVectorElementsVT();
4855 }
4856 SrcVT = F32VT;
4857 SrcElementVT = MVT::f32;
4858 SrcElementWidth = 32;
4859 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4860 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4861 return SDValue();
4862
4863 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
4864 // width and produce a fcvtzu.
4865 if (SatWidth == 64 && SrcElementWidth < 64) {
4866 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
4867 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
4868 SrcVT = F64VT;
4869 SrcElementVT = MVT::f64;
4870 SrcElementWidth = 64;
4871 }
4872 // Cases that we can emit directly.
4873 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
4874 SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4875 DAG.getValueType(DstVT.getScalarType()));
4876 if (SrcVal2) {
4877 SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
4878 DAG.getValueType(DstVT.getScalarType()));
4879 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
4880 }
4881 return Res;
4882 }
4883
4884 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4885 // result. This is only valid if the legal cvt is larger than the saturate
4886 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4887 // (at least until sqxtn is selected).
4888 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4889 return SDValue();
4890
4891 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4892 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4893 DAG.getValueType(IntVT.getScalarType()));
4894 SDValue NativeCvt2 =
4895 SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
4896 DAG.getValueType(IntVT.getScalarType()))
4897 : SDValue();
4898 SDValue Sat, Sat2;
4899 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4900 SDValue MinC = DAG.getConstant(
4901 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4902 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4903 SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4904 SDValue MaxC = DAG.getConstant(
4905 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4906 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4907 Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
4908 } else {
4909 SDValue MinC = DAG.getConstant(
4910 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4911 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4912 Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4913 }
4914
4915 if (SrcVal2)
4916 Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
4917                      IntVT.getDoubleNumVectorElementsVT(*DAG.getContext()),
4918                      Sat, Sat2);
4919
4920 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4921}
4922
4923SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4924 SelectionDAG &DAG) const {
4925 // AArch64 FP-to-int conversions saturate to the destination register size, so
4926 // we can lower common saturating conversions to simple instructions.
4927 SDValue SrcVal = Op.getOperand(0);
4928 EVT SrcVT = SrcVal.getValueType();
4929
4930 if (SrcVT.isVector())
4931 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4932
4933 EVT DstVT = Op.getValueType();
4934 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4935 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4936 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4937 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4938
4939 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4940 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4941 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4942 SrcVT = MVT::f32;
4943 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4944 SrcVT != MVT::bf16)
4945 return SDValue();
4946
4947 SDLoc DL(Op);
4948 // Cases that we can emit directly.
4949 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4950 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4951 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4952 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4953 DAG.getValueType(DstVT));
4954
4955 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4956 // result. This is only valid if the legal cvt is larger than the saturate
4957 // width.
4958 if (DstWidth < SatWidth)
4959 return SDValue();
4960
4961 SDValue NativeCvt =
4962 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4963 SDValue Sat;
4964 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4965 SDValue MinC = DAG.getConstant(
4966 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4967 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4968 SDValue MaxC = DAG.getConstant(
4969 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4970 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4971 } else {
4972 SDValue MinC = DAG.getConstant(
4973 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4974 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4975 }
4976
4977 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4978}
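
// A standalone sketch of the scalar saturating semantics being lowered here;
// the helper name fpToSIntSat is illustrative and only models widths up to 32
// bits. Out-of-range values clamp to the extremes and NaN produces 0,
// mirroring the SMIN/SMAX clamp emitted above around the wider native convert.
#include <cassert>
#include <cstdint>

static int32_t fpToSIntSat(double X, unsigned SatWidth) {
  assert(SatWidth >= 1 && SatWidth <= 32 && "sketch only models narrow widths");
  if (X != X) // NaN saturates to 0 for llvm.fptosi.sat.
    return 0;
  const int64_t Max = (int64_t{1} << (SatWidth - 1)) - 1;
  const int64_t Min = -(int64_t{1} << (SatWidth - 1));
  if (X >= static_cast<double>(Max))
    return static_cast<int32_t>(Max);
  if (X <= static_cast<double>(Min))
    return static_cast<int32_t>(Min);
  return static_cast<int32_t>(X); // in range: plain truncation toward zero
}
// Example: fpToSIntSat(1e9, 16) == 32767 and fpToSIntSat(-1e9, 16) == -32768.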
4979
4980SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
4981 SelectionDAG &DAG) const {
4982 EVT VT = Op.getValueType();
4983 SDValue Src = Op.getOperand(0);
4984 SDLoc DL(Op);
4985
4986 assert(VT.isVector() && "Expected vector type");
4987
4988 EVT CastVT =
4989 VT.changeVectorElementType(Src.getValueType().getVectorElementType());
4990
4991 // Round the floating-point value into a floating-point register with the
4992 // current rounding mode.
4993 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
4994
4995 // Truncate the rounded floating point to an integer.
4996 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
4997                     DAG.getValueType(VT.getVectorElementType()));
4998}
4999
5000SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
5001 SelectionDAG &DAG) const {
5002 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
5003 // Any additional optimization in this function should be recorded
5004 // in the cost tables.
5005 bool IsStrict = Op->isStrictFPOpcode();
5006 EVT VT = Op.getValueType();
5007 SDLoc dl(Op);
5008 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
5009 EVT InVT = In.getValueType();
5010 unsigned Opc = Op.getOpcode();
5011 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
5012
5013 if (VT.isScalableVector()) {
5014 if (InVT.getVectorElementType() == MVT::i1) {
5015 // We can't directly extend an SVE predicate; extend it first.
5016 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5017 EVT CastVT = getPromotedVTForPredicate(InVT);
5018 In = DAG.getNode(CastOpc, dl, CastVT, In);
5019 return DAG.getNode(Opc, dl, VT, In);
5020 }
5021
5022 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
5023                               : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
5024    return LowerToPredicatedOp(Op, DAG, Opcode);
5025 }
5026
5027 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
5028 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
5029 return LowerFixedLengthIntToFPToSVE(Op, DAG);
5030
5031 // Promote bf16 conversions to f32.
5032 if (VT.getVectorElementType() == MVT::bf16) {
5033 EVT F32 = VT.changeElementType(MVT::f32);
5034 if (IsStrict) {
5035 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {F32, MVT::Other},
5036 {Op.getOperand(0), In});
5037 return DAG.getNode(ISD::STRICT_FP_ROUND, dl,
5038 {Op.getValueType(), MVT::Other},
5039 {Val.getValue(1), Val.getValue(0),
5040 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
5041 }
5042 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
5043 DAG.getNode(Op.getOpcode(), dl, F32, In),
5044 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
5045 }
5046
5047 uint64_t VTSize = VT.getFixedSizeInBits();
5048 uint64_t InVTSize = InVT.getFixedSizeInBits();
5049 if (VTSize < InVTSize) {
5050 MVT CastVT =
5052 InVT.getVectorNumElements());
5053 if (IsStrict) {
5054 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
5055 {Op.getOperand(0), In});
5056 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
5057 {In.getValue(1), In.getValue(0),
5058 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
5059 }
5060 In = DAG.getNode(Opc, dl, CastVT, In);
5061 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
5062 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
5063 }
5064
5065 if (VTSize > InVTSize) {
5066 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5067    EVT CastVT = VT.changeVectorElementTypeToInteger();
5068    In = DAG.getNode(CastOpc, dl, CastVT, In);
5069 if (IsStrict)
5070 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
5071 return DAG.getNode(Opc, dl, VT, In);
5072 }
5073
5074 // Use a scalar operation for conversions between single-element vectors of
5075 // the same size.
5076 if (VT.getVectorNumElements() == 1) {
5077 SDValue Extract = DAG.getNode(
5078        ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
5079        In, DAG.getConstant(0, dl, MVT::i64));
5080 EVT ScalarVT = VT.getScalarType();
5081 if (IsStrict)
5082 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
5083 {Op.getOperand(0), Extract});
5084 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
5085 }
5086
5087 return Op;
5088}
5089
5090SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
5091 SelectionDAG &DAG) const {
5092 if (Op.getValueType().isVector())
5093 return LowerVectorINT_TO_FP(Op, DAG);
5094
5095 bool IsStrict = Op->isStrictFPOpcode();
5096 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5097
5098 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
5099 Op->getOpcode() == ISD::SINT_TO_FP;
5100
5101 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
5102 SDLoc dl(Op);
5103 if (IsStrict) {
5104 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
5105 {Op.getOperand(0), SrcVal});
5106 return DAG.getNode(ISD::STRICT_FP_ROUND, dl,
5107 {Op.getValueType(), MVT::Other},
5108 {Val.getValue(1), Val.getValue(0),
5109 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
5110 }
5111 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
5112 DAG.getNode(Op.getOpcode(), dl, PromoteVT, SrcVal),
5113 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
5114 };
5115
5116 if (Op.getValueType() == MVT::bf16) {
5117 unsigned MaxWidth = IsSigned
5118 ? DAG.ComputeMaxSignificantBits(SrcVal)
5119 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
5120 // bf16 conversions are promoted to f32 when converting from i16.
5121 if (MaxWidth <= 24) {
5122 return IntToFpViaPromotion(MVT::f32);
5123 }
5124
5125 // bf16 conversions are promoted to f64 when converting from i32.
5126 if (MaxWidth <= 53) {
5127 return IntToFpViaPromotion(MVT::f64);
5128 }
5129
5130 // We need to be careful about i64 -> bf16.
5131    // Consider the value 22216703.
5132    // This number cannot be represented exactly as an f32, so an itofp will
5133    // turn it into 22216704.0, and an fptrunc to bf16 will then produce
5134    // 22282240.0. However, the correct bf16 result is 22151168.0.
5135 // We need to use sticky rounding to get this correct.
5136 if (SrcVal.getValueType() == MVT::i64) {
5137 SDLoc DL(Op);
5138 // This algorithm is equivalent to the following:
5139 // uint64_t SrcHi = SrcVal & ~0xfffull;
5140 // uint64_t SrcLo = SrcVal & 0xfffull;
5141 // uint64_t Highest = SrcVal >> 53;
5142 // bool HasHighest = Highest != 0;
5143 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
5144 // double Rounded = static_cast<double>(ToRound);
5145 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
5146 // uint64_t HasLo = SrcLo != 0;
5147 // bool NeedsAdjustment = HasHighest & HasLo;
5148 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
5149 // double Adjusted = std::bit_cast<double>(AdjustedBits);
5150 // return static_cast<__bf16>(Adjusted);
5151 //
5152 // Essentially, what happens is that SrcVal either fits perfectly in a
5153 // double-precision value or it is too big. If it is sufficiently small,
5154 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
5155 // ensure that u64 -> double has no rounding error by only using the 52
5156 // MSB of the input. The low order bits will get merged into a sticky bit
5157 // which will avoid issues incurred by double rounding.
5158
5159 // Signed conversion is more or less like so:
5160 // copysign((__bf16)abs(SrcVal), SrcVal)
5161 SDValue SignBit;
5162 if (IsSigned) {
5163 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5164 DAG.getConstant(1ull << 63, DL, MVT::i64));
5165 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
5166 }
5167 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5168 DAG.getConstant(~0xfffull, DL, MVT::i64));
5169 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5170 DAG.getConstant(0xfffull, DL, MVT::i64));
5171      SDValue Highest =
5172          DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
5173 DAG.getShiftAmountConstant(53, MVT::i64, DL));
5174 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
5175 SDValue ToRound =
5176 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
5177 SDValue Rounded =
5178 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
5179 {Op.getOperand(0), ToRound})
5180 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
5181
5182 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
5183 if (SignBit) {
5184 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
5185 }
5186
5187 SDValue HasHighest = DAG.getSetCC(
5188 DL,
5189 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5190 Highest, Zero64, ISD::SETNE);
5191
5192 SDValue HasLo = DAG.getSetCC(
5193 DL,
5194 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5195 SrcLo, Zero64, ISD::SETNE);
5196
5197 SDValue NeedsAdjustment =
5198 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
5199 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
5200
5201 SDValue AdjustedBits =
5202 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
5203 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
5204 return IsStrict
5205 ? DAG.getNode(
5206                       ISD::STRICT_FP_ROUND, DL,
5207                       {Op.getValueType(), MVT::Other},
5208 {Rounded.getValue(1), Adjusted,
5209 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
5210 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
5211 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5212 }
5213 }
5214
5215 // f16 conversions are promoted to f32 when full fp16 is not supported.
5216 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5217 return IntToFpViaPromotion(MVT::f32);
5218 }
5219
5220 // i128 conversions are libcalls.
5221 if (SrcVal.getValueType() == MVT::i128)
5222 return SDValue();
5223
5224 // Other conversions are legal, unless it's to the completely software-based
5225 // fp128.
5226 if (Op.getValueType() != MVT::f128)
5227 return Op;
5228 return SDValue();
5229}
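
// A standalone sketch of the sticky-rounding trick described in the i64 ->
// bf16 comment above; the helper name u64ToDoubleSticky is illustrative. The
// dropped low 12 bits are ORed into the sticky (least significant) bit of the
// intermediate double so that the final rounding to bf16 cannot double-round.
#include <cstdint>
#include <cstring>

static double u64ToDoubleSticky(uint64_t SrcVal) {
  uint64_t SrcHi = SrcVal & ~0xfffull;
  uint64_t SrcLo = SrcVal & 0xfffull;
  bool HasHighest = (SrcVal >> 53) != 0;
  // For large inputs only the 52 most significant bits are converted, which a
  // double can always represent exactly.
  uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
  double Rounded = static_cast<double>(ToRound);
  uint64_t RoundedBits;
  std::memcpy(&RoundedBits, &Rounded, sizeof(RoundedBits));
  // Fold the dropped bits into the sticky bit of the double.
  RoundedBits |= uint64_t{HasHighest && SrcLo != 0};
  double Adjusted;
  std::memcpy(&Adjusted, &RoundedBits, sizeof(Adjusted));
  return Adjusted; // Rounding this to bf16 now sees the correct sticky bit.
}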
5230
5231SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
5232 SelectionDAG &DAG) const {
5233 // For iOS, we want to call an alternative entry point: __sincos_stret,
5234 // which returns the values in two S / D registers.
5235 SDLoc dl(Op);
5236 SDValue Arg = Op.getOperand(0);
5237 EVT ArgVT = Arg.getValueType();
5238 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
5239
5240  ArgListTy Args;
5241  ArgListEntry Entry;
5242
5243 Entry.Node = Arg;
5244 Entry.Ty = ArgTy;
5245 Entry.IsSExt = false;
5246 Entry.IsZExt = false;
5247 Args.push_back(Entry);
5248
5249 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
5250 : RTLIB::SINCOS_STRET_F32;
5251 const char *LibcallName = getLibcallName(LC);
5252 SDValue Callee =
5253 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
5254
5255 StructType *RetTy = StructType::get(ArgTy, ArgTy);
5256  TargetLowering::CallLoweringInfo CLI(DAG);
5257  CLI.setDebugLoc(dl)
5258 .setChain(DAG.getEntryNode())
5259 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
5260
5261 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5262 return CallResult.first;
5263}
5264
5265static MVT getSVEContainerType(EVT ContentTy);
5266
5267SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5268 SelectionDAG &DAG) const {
5269 EVT OpVT = Op.getValueType();
5270 EVT ArgVT = Op.getOperand(0).getValueType();
5271
5273 return LowerFixedLengthBitcastToSVE(Op, DAG);
5274
5275 if (OpVT.isScalableVector()) {
5276 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5277
5278 // Handle type legalisation first.
5279 if (!isTypeLegal(ArgVT)) {
5280 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5281 "Expected int->fp bitcast!");
5282
5283 // Bitcasting between unpacked vector types of different element counts is
5284 // not a NOP because the live elements are laid out differently.
5285 // 01234567
5286 // e.g. nxv2i32 = XX??XX??
5287 // nxv4f16 = X?X?X?X?
5288 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5289 return SDValue();
5290
5291 SDValue ExtResult =
5293 Op.getOperand(0));
5294 return getSVESafeBitCast(OpVT, ExtResult, DAG);
5295 }
5296
5297 // Bitcasts between legal types with the same element count are legal.
5298 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5299 return Op;
5300
5301 // getSVESafeBitCast does not support casting between unpacked types.
5302 if (!isPackedVectorType(OpVT, DAG))
5303 return SDValue();
5304
5305 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
5306 }
5307
5308 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5309 return SDValue();
5310
5311 // Bitcasts between f16 and bf16 are legal.
5312 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5313 return Op;
5314
5315 assert(ArgVT == MVT::i16);
5316 SDLoc DL(Op);
5317
5318 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
5319 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
5320 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
5321}
5322
5323// Returns lane if Op extracts from a two-element vector and lane is constant
5324// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5325static std::optional<uint64_t>
5326getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
5327  SDNode *OpNode = Op.getNode();
5328 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5329 return std::nullopt;
5330
5331 EVT VT = OpNode->getOperand(0).getValueType();
5332 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
5333 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5334 return std::nullopt;
5335
5336 return C->getZExtValue();
5337}
5338
5339static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
5340                                   bool isSigned) {
5341 EVT VT = N.getValueType();
5342
5343 if (N.getOpcode() != ISD::BUILD_VECTOR)
5344 return false;
5345
5346 for (const SDValue &Elt : N->op_values()) {
5347 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
5348 unsigned EltSize = VT.getScalarSizeInBits();
5349 unsigned HalfSize = EltSize / 2;
5350 if (isSigned) {
5351 if (!isIntN(HalfSize, C->getSExtValue()))
5352 return false;
5353 } else {
5354 if (!isUIntN(HalfSize, C->getZExtValue()))
5355 return false;
5356 }
5357 continue;
5358 }
5359 return false;
5360 }
5361
5362 return true;
5363}
5364
5365static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
5366  EVT VT = N.getValueType();
5367 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5368 EVT HalfVT = EVT::getVectorVT(
5369 *DAG.getContext(),
5372 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
5373}
5374
5375static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
5376  return N.getOpcode() == ISD::SIGN_EXTEND ||
5377 N.getOpcode() == ISD::ANY_EXTEND ||
5378 isExtendedBUILD_VECTOR(N, DAG, true);
5379}
5380
5381static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
5382  return N.getOpcode() == ISD::ZERO_EXTEND ||
5383 N.getOpcode() == ISD::ANY_EXTEND ||
5384 isExtendedBUILD_VECTOR(N, DAG, false);
5385}
5386
5387static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
5388  unsigned Opcode = N.getOpcode();
5389 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5390 SDValue N0 = N.getOperand(0);
5391 SDValue N1 = N.getOperand(1);
5392 return N0->hasOneUse() && N1->hasOneUse() &&
5393 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5394 }
5395 return false;
5396}
5397
5398static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
5399  unsigned Opcode = N.getOpcode();
5400 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5401 SDValue N0 = N.getOperand(0);
5402 SDValue N1 = N.getOperand(1);
5403 return N0->hasOneUse() && N1->hasOneUse() &&
5404 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5405 }
5406 return false;
5407}
5408
5409SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5410 SelectionDAG &DAG) const {
5411  // The rounding mode is in bits 23:22 of the FPCR.
5412  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
5413  // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
5414  // so that the shift + and get folded into a bitfield extract.
5415 SDLoc dl(Op);
5416
5417 SDValue Chain = Op.getOperand(0);
5418 SDValue FPCR_64 = DAG.getNode(
5419 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
5420 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
5421 Chain = FPCR_64.getValue(1);
5422 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
5423 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
5424 DAG.getConstant(1U << 22, dl, MVT::i32));
5425 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
5426 DAG.getConstant(22, dl, MVT::i32));
5427 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
5428 DAG.getConstant(3, dl, MVT::i32));
5429 return DAG.getMergeValues({AND, Chain}, dl);
5430}
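
// A compile-time sketch of the FLT_ROUNDS formula used above; the helper name
// fltRoundsFromFPCR is illustrative. ((FPCR + (1 << 22)) >> 22) & 3 maps the
// RMode field values 0,1,2,3 to the FLT_ROUNDS values 1,2,3,0 without a table.
#include <cstdint>

constexpr unsigned fltRoundsFromFPCR(uint32_t FPCR) {
  return ((FPCR + (1u << 22)) >> 22) & 3u;
}
static_assert(fltRoundsFromFPCR(0u << 22) == 1, "RN (nearest) -> 1");
static_assert(fltRoundsFromFPCR(1u << 22) == 2, "RP (+inf)    -> 2");
static_assert(fltRoundsFromFPCR(2u << 22) == 3, "RM (-inf)    -> 3");
static_assert(fltRoundsFromFPCR(3u << 22) == 0, "RZ (zero)    -> 0");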
5431
5432SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5433 SelectionDAG &DAG) const {
5434 SDLoc DL(Op);
5435 SDValue Chain = Op->getOperand(0);
5436 SDValue RMValue = Op->getOperand(1);
5437
5438 // The rounding mode is in bits 23:22 of the FPCR.
5439 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
5440 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5441  // ((arg - 1) & 3) << 22.
5442 //
5443 // The argument of llvm.set.rounding must be within the segment [0, 3], so
5444  // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5445  // code that generates llvm.set.rounding to ensure this condition.
5446
5447 // Calculate new value of FPCR[23:22].
5448 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5449 DAG.getConstant(1, DL, MVT::i32));
5450 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5451 DAG.getConstant(0x3, DL, MVT::i32));
5452 RMValue =
5453 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5454 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5455 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5456
5457 // Get current value of FPCR.
5458 SDValue Ops[] = {
5459 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5460 SDValue FPCR =
5461 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5462 Chain = FPCR.getValue(1);
5463 FPCR = FPCR.getValue(0);
5464
5465  // Put the new rounding mode into FPCR[23:22].
5466 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5467 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5468 DAG.getConstant(RMMask, DL, MVT::i64));
5469 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5470 SDValue Ops2[] = {
5471 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5472 FPCR};
5473 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5474}
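
// A compile-time sketch of the inverse mapping used by llvm.set.rounding
// above; the helper name fpcrRModeFromArg is illustrative. ((Arg - 1) & 3)
// turns the argument values 0,1,2,3 into the RMode field values 3,0,1,2,
// which is then shifted into FPCR bits 23:22.
#include <cstdint>

constexpr uint32_t fpcrRModeFromArg(uint32_t Arg) {
  return ((Arg - 1u) & 3u) << 22;
}
static_assert(fpcrRModeFromArg(0) == (3u << 22), "toward zero -> RZ");
static_assert(fpcrRModeFromArg(1) == (0u << 22), "to nearest  -> RN");
static_assert(fpcrRModeFromArg(2) == (1u << 22), "toward +inf -> RP");
static_assert(fpcrRModeFromArg(3) == (2u << 22), "toward -inf -> RM");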
5475
5476SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5477 SelectionDAG &DAG) const {
5478 SDLoc DL(Op);
5479 SDValue Chain = Op->getOperand(0);
5480
5481 // Get current value of FPCR.
5482 SDValue Ops[] = {
5483 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5484 SDValue FPCR =
5485 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5486 Chain = FPCR.getValue(1);
5487 FPCR = FPCR.getValue(0);
5488
5489 // Truncate FPCR to 32 bits.
5490 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5491
5492 return DAG.getMergeValues({Result, Chain}, DL);
5493}
5494
5495SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5496 SelectionDAG &DAG) const {
5497 SDLoc DL(Op);
5498 SDValue Chain = Op->getOperand(0);
5499 SDValue Mode = Op->getOperand(1);
5500
5501 // Extend the specified value to 64 bits.
5502 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5503
5504 // Set new value of FPCR.
5505 SDValue Ops2[] = {
5506 Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
5507 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5508}
5509
5510SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5511 SelectionDAG &DAG) const {
5512 SDLoc DL(Op);
5513 SDValue Chain = Op->getOperand(0);
5514
5515 // Get current value of FPCR.
5516 SDValue Ops[] = {
5517 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5518 SDValue FPCR =
5519 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5520 Chain = FPCR.getValue(1);
5521 FPCR = FPCR.getValue(0);
5522
5523 // Clear bits that are not reserved.
5524 SDValue FPSCRMasked = DAG.getNode(
5525 ISD::AND, DL, MVT::i64, FPCR,
5527
5528 // Set new value of FPCR.
5529 SDValue Ops2[] = {Chain,
5530 DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5531 FPSCRMasked};
5532 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5533}
5534
5535static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5536 SDLoc DL, bool &IsMLA) {
5537 bool IsN0SExt = isSignExtended(N0, DAG);
5538 bool IsN1SExt = isSignExtended(N1, DAG);
5539 if (IsN0SExt && IsN1SExt)
5540 return AArch64ISD::SMULL;
5541
5542 bool IsN0ZExt = isZeroExtended(N0, DAG);
5543 bool IsN1ZExt = isZeroExtended(N1, DAG);
5544
5545 if (IsN0ZExt && IsN1ZExt)
5546 return AArch64ISD::UMULL;
5547
5548 // Select UMULL if we can replace the other operand with an extend.
5549 EVT VT = N0.getValueType();
5550 unsigned EltSize = VT.getScalarSizeInBits();
5551 APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
5552 if (IsN0ZExt || IsN1ZExt) {
5553 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5554 return AArch64ISD::UMULL;
5555 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
5556 DAG.MaskedValueIsZero(N1, Mask)) {
5557    // For v2i64 we look more aggressively at both operands being zero, to avoid
5558 // scalarization.
5559 return AArch64ISD::UMULL;
5560 }
5561
5562 if (IsN0SExt || IsN1SExt) {
5563 if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
5564 return AArch64ISD::SMULL;
5565 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
5566 DAG.ComputeNumSignBits(N1) > EltSize / 2) {
5567 return AArch64ISD::SMULL;
5568 }
5569
5570 if (!IsN1SExt && !IsN1ZExt)
5571 return 0;
5572
5573 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5574 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5575 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5576 IsMLA = true;
5577 return AArch64ISD::SMULL;
5578 }
5579 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5580 IsMLA = true;
5581 return AArch64ISD::UMULL;
5582 }
5583 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5584 std::swap(N0, N1);
5585 IsMLA = true;
5586 return AArch64ISD::UMULL;
5587 }
5588 return 0;
5589}
5590
5591SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5592 EVT VT = Op.getValueType();
5593
5594 bool OverrideNEON = !Subtarget->isNeonAvailable();
5595 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5596 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5597
5598 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5599 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5600 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5601 "unexpected type for custom-lowering ISD::MUL");
5602 SDValue N0 = Op.getOperand(0);
5603 SDValue N1 = Op.getOperand(1);
5604 bool isMLA = false;
5605 EVT OVT = VT;
5606 if (VT.is64BitVector()) {
5607 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5608 isNullConstant(N0.getOperand(1)) &&
5609        N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5610        isNullConstant(N1.getOperand(1))) {
5611 N0 = N0.getOperand(0);
5612 N1 = N1.getOperand(0);
5613 VT = N0.getValueType();
5614 } else {
5615 if (VT == MVT::v1i64) {
5616 if (Subtarget->hasSVE())
5617 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5618 // Fall through to expand this. It is not legal.
5619 return SDValue();
5620 } else
5621 // Other vector multiplications are legal.
5622 return Op;
5623 }
5624 }
5625
5626 SDLoc DL(Op);
5627 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5628
5629 if (!NewOpc) {
5630 if (VT.getVectorElementType() == MVT::i64) {
5631 // If SVE is available then i64 vector multiplications can also be made
5632 // legal.
5633 if (Subtarget->hasSVE())
5634 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5635 // Fall through to expand this. It is not legal.
5636 return SDValue();
5637 } else
5638 // Other vector multiplications are legal.
5639 return Op;
5640 }
5641
5642 // Legalize to a S/UMULL instruction
5643 SDValue Op0;
5644 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5645 if (!isMLA) {
5646 Op0 = skipExtensionForVectorMULL(N0, DAG);
5647    assert(Op0.getValueType().is64BitVector() &&
5648           Op1.getValueType().is64BitVector() &&
5649 "unexpected types for extended operands to VMULL");
5650 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5651 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5652 DAG.getConstant(0, DL, MVT::i64));
5653 }
5654 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5655 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5656 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
5657  SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5658  SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5659  EVT Op1VT = Op1.getValueType();
5660 return DAG.getNode(
5661      ISD::EXTRACT_SUBVECTOR, DL, OVT,
5662      DAG.getNode(N0.getOpcode(), DL, VT,
5663 DAG.getNode(NewOpc, DL, VT,
5664 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5665 DAG.getNode(NewOpc, DL, VT,
5666 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5667 DAG.getConstant(0, DL, MVT::i64));
5668}
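
// A scalar sketch of what one UMULL/SMULL lane computes; the helper names are
// illustrative. The operands are half as wide as the product, so widening
// before the multiply means the result can never wrap.
#include <cstdint>

static int32_t smullLane(int16_t A, int16_t B) {
  return static_cast<int32_t>(A) * static_cast<int32_t>(B); // sign-extend, then multiply
}
static uint32_t umullLane(uint16_t A, uint16_t B) {
  return static_cast<uint32_t>(A) * static_cast<uint32_t>(B); // zero-extend, then multiply
}
// Example: umullLane(0xFFFF, 0xFFFF) == 0xFFFE0001, which would wrap in 16 bits.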
5669
5670static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5671 int Pattern) {
5672 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5673 return DAG.getConstant(1, DL, MVT::nxv1i1);
5674 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5675 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5676}
5677
5678static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG,
5679                                         bool IsSigned, bool IsEqual) {
5680 if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
5681 !isa<ConstantSDNode>(Op.getOperand(2)))
5682 return SDValue();
5683
5684 SDLoc dl(Op);
5685 APInt X = Op.getConstantOperandAPInt(1);
5686 APInt Y = Op.getConstantOperandAPInt(2);
5687
5688 // When the second operand is the maximum value, comparisons that include
5689 // equality can never fail and thus we can return an all active predicate.
5690 if (IsEqual)
5691 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
5692 return DAG.getConstant(1, dl, Op.getValueType());
5693
5694 bool Overflow;
5695 APInt NumActiveElems =
5696 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5697
5698 if (Overflow)
5699 return SDValue();
5700
5701 if (IsEqual) {
5702 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5703 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5704 : NumActiveElems.uadd_ov(One, Overflow);
5705 if (Overflow)
5706 return SDValue();
5707 }
5708
5709 std::optional<unsigned> PredPattern =
5710      getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5711  unsigned MinSVEVectorSize = std::max(
5713 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5714 if (PredPattern != std::nullopt &&
5715 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5716 return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
5717
5718 return SDValue();
5719}
5720
5721// Returns a safe bitcast between two scalable vector predicates, where
5722// any newly created lanes from a widening bitcast are defined as zero.
5723static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5724  SDLoc DL(Op);
5725 EVT InVT = Op.getValueType();
5726
5727 assert(InVT.getVectorElementType() == MVT::i1 &&
5728 VT.getVectorElementType() == MVT::i1 &&
5729 "Expected a predicate-to-predicate bitcast");
5730  assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5731         InVT.isScalableVector() &&
5732 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5733 "Only expect to cast between legal scalable predicate types!");
5734
5735 // Return the operand if the cast isn't changing type,
5736 if (InVT == VT)
5737 return Op;
5738
5739 // Look through casts to <vscale x 16 x i1> when their input has more lanes
5740 // than VT. This will increase the chances of removing casts that introduce
5741 // new lanes, which have to be explicitly zero'd.
5742 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
5743 Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
5744 Op.getOperand(1).getValueType().bitsGT(VT))
5745 Op = Op.getOperand(1);
5746
5747 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5748
5749 // We only have to zero the lanes if new lanes are being defined, e.g. when
5750 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5751 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5752 // we can return here.
5753 if (InVT.bitsGT(VT))
5754 return Reinterpret;
5755
5756 // Check if the other lanes are already known to be zeroed by
5757 // construction.
5758  if (isZeroingInactiveLanes(Op))
5759    return Reinterpret;
5760
5761 // Zero the newly introduced lanes.
5762 SDValue Mask = DAG.getConstant(1, DL, InVT);
5763 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5764 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5765}
5766
5767SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5768 SDValue Chain, SDLoc DL,
5769 EVT VT) const {
5770 SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
5771                                         getPointerTy(DAG.getDataLayout()));
5772  Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5773 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5776 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5778 RetTy, Callee, std::move(Args));
5779 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5780 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5781 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5782 Mask);
5783}
5784
5785// Lower an SME LDR/STR ZA intrinsic
5786// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5787// folded into the instruction
5788// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5789// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5790// and tile slice registers
5791// ldr(%tileslice, %ptr, %vecnum)
5792// ->
5793// %svl = rdsvl
5794// %ptr2 = %ptr + %svl * %vecnum
5795// %tileslice2 = %tileslice + %vecnum
5796// ldr [%tileslice2, 0], [%ptr2, 0]
5797// Case 3: If the vecnum is an immediate out of range, then the same is done as
5798// case 2, but the base and slice registers are modified by the largest multiple
5799// of 16 that is not larger than the vecnum, and the remainder is folded into
5800// the instruction. This means that successive loads and stores that are offset
5801// from each other can share the same base and slice register updates.
5802// ldr(%tileslice, %ptr, 22)
5803// ldr(%tileslice, %ptr, 23)
5804// ->
5805// %svl = rdsvl
5806// %ptr2 = %ptr + %svl * 16
5807// %tileslice2 = %tileslice + 16
5808// ldr [%tileslice2, 6], [%ptr2, 6]
5809// ldr [%tileslice2, 7], [%ptr2, 7]
5810// Case 4: If the vecnum is an add of an immediate, then the immediate is folded
5811// into the instruction and the non-immediate operand is handled as in case 2.
5812// ldr(%tileslice, %ptr, %vecnum + 7)
5813// ldr(%tileslice, %ptr, %vecnum + 8)
5814// ->
5815// %svl = rdsvl
5816// %ptr2 = %ptr + %svl * %vecnum
5817// %tileslice2 = %tileslice + %vecnum
5818// ldr [%tileslice2, 7], [%ptr2, 7]
5819// ldr [%tileslice2, 8], [%ptr2, 8]
5820// Case 5: The vecnum being an add of an immediate out of range is also handled,
5821// in which case the same remainder logic as case 3 is used.
5822static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5823  SDLoc DL(N);
5824
5825 SDValue TileSlice = N->getOperand(2);
5826 SDValue Base = N->getOperand(3);
5827 SDValue VecNum = N->getOperand(4);
5828 int32_t ConstAddend = 0;
5829 SDValue VarAddend = VecNum;
5830
5831 // If the vnum is an add of an immediate, we can fold it into the instruction
5832 if (VecNum.getOpcode() == ISD::ADD &&
5833 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5834 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5835 VarAddend = VecNum.getOperand(0);
5836 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5837 ConstAddend = ImmNode->getSExtValue();
5838 VarAddend = SDValue();
5839 }
5840
5841 int32_t ImmAddend = ConstAddend % 16;
5842 if (int32_t C = (ConstAddend - ImmAddend)) {
5843 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5844 VarAddend = VarAddend
5845 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5846 : CVal;
5847 }
5848
5849 if (VarAddend) {
5850 // Get the vector length that will be multiplied by vnum
5851 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5852 DAG.getConstant(1, DL, MVT::i32));
5853
5854 // Multiply SVL and vnum then add it to the base
5855 SDValue Mul = DAG.getNode(
5856 ISD::MUL, DL, MVT::i64,
5857 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5858 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5859 // Just add vnum to the tileslice
5860 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5861 }
5862
5863  return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5864                     DL, MVT::Other,
5865 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5866 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5867}
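
// A standalone sketch of how a constant vecnum is split by the lowering above;
// the names SMEAddrSplit and splitConstVecnum are illustrative and a
// non-negative vecnum is assumed. Only a multiple of 16 is folded into the
// base/slice update; the remainder becomes the instruction's immediate.
#include <cstdint>

struct SMEAddrSplit {
  int32_t BaseAddend; // scaled by SVL and added to the base and tile slice
  int32_t ImmOffset;  // folded directly into the LDR/STR instruction
};

static SMEAddrSplit splitConstVecnum(int32_t ConstAddend) {
  int32_t Imm = ConstAddend % 16; // mirrors ImmAddend in the lowering above
  return {ConstAddend - Imm, Imm};
}
// Example (case 3 above): splitConstVecnum(22) == {16, 6} and
// splitConstVecnum(23) == {16, 7}, so both accesses share one base/slice update.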
5868
5870 SDLoc dl(Op);
5871 SDValue ID =
5872 DAG.getTargetConstant(Intrinsic::aarch64_sve_match, dl, MVT::i64);
5873
5874 auto Op1 = Op.getOperand(1);
5875 auto Op2 = Op.getOperand(2);
5876 auto Mask = Op.getOperand(3);
5877
5878 EVT Op1VT = Op1.getValueType();
5879 EVT Op2VT = Op2.getValueType();
5880 EVT ResVT = Op.getValueType();
5881
5882 assert((Op1VT.getVectorElementType() == MVT::i8 ||
5883 Op1VT.getVectorElementType() == MVT::i16) &&
5884 "Expected 8-bit or 16-bit characters.");
5885
5886 // Scalable vector type used to wrap operands.
5887 // A single container is enough for both operands because ultimately the
5888 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
5889 EVT OpContainerVT = Op1VT.isScalableVector()
5890 ? Op1VT
5892
5893 if (Op2VT.is128BitVector()) {
5894 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
5895 Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
5896 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
5897 if (ResVT.isScalableVector())
5898 Op2 = DAG.getNode(AArch64ISD::DUPLANE128, dl, OpContainerVT, Op2,
5899 DAG.getTargetConstant(0, dl, MVT::i64));
5900 } else {
5901 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
5902 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
5903 MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
5904 EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
5905 Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
5906 Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op2IntVT, Op2,
5907 DAG.getConstant(0, dl, MVT::i64));
5908 Op2 = DAG.getSplatVector(Op2PromotedVT, dl, Op2);
5909 Op2 = DAG.getBitcast(OpContainerVT, Op2);
5910 }
5911
5912 // If the result is scalable, we just need to carry out the MATCH.
5913 if (ResVT.isScalableVector())
5914 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResVT, ID, Mask, Op1, Op2);
5915
5916 // If the result is fixed, we can still use MATCH but we need to wrap the
5917 // first operand and the mask in scalable vectors before doing so.
5918
5919 // Wrap the operands.
5920 Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
5921 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, Op1VT, Mask);
5922 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5923
5924 // Carry out the match.
5925 SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Mask.getValueType(),
5926 ID, Mask, Op1, Op2);
5927
5928 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
5929 // (v16i8/v8i8).
5930 Match = DAG.getNode(ISD::SIGN_EXTEND, dl, OpContainerVT, Match);
5931 Match = convertFromScalableVector(DAG, Op1VT, Match);
5932 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, Match);
5933}
5934
5935SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5936 SelectionDAG &DAG) const {
5937 unsigned IntNo = Op.getConstantOperandVal(1);
5938 SDLoc DL(Op);
5939 switch (IntNo) {
5940 default:
5941 return SDValue(); // Don't custom lower most intrinsics.
5942 case Intrinsic::aarch64_prefetch: {
5943 SDValue Chain = Op.getOperand(0);
5944 SDValue Addr = Op.getOperand(2);
5945
5946 unsigned IsWrite = Op.getConstantOperandVal(3);
5947 unsigned Locality = Op.getConstantOperandVal(4);
5948 unsigned IsStream = Op.getConstantOperandVal(5);
5949 unsigned IsData = Op.getConstantOperandVal(6);
5950 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5951 (!IsData << 3) | // IsDataCache bit
5952 (Locality << 1) | // Cache level bits
5953 (unsigned)IsStream; // Stream bit
5954
5955 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5956 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5957 }
5958 case Intrinsic::aarch64_sme_str:
5959 case Intrinsic::aarch64_sme_ldr: {
5960 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5961 }
5962 case Intrinsic::aarch64_sme_za_enable:
5963 return DAG.getNode(
5964 AArch64ISD::SMSTART, DL, MVT::Other,
5965 Op->getOperand(0), // Chain
5966 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5967 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5968 case Intrinsic::aarch64_sme_za_disable:
5969 return DAG.getNode(
5970 AArch64ISD::SMSTOP, DL, MVT::Other,
5971 Op->getOperand(0), // Chain
5972 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5973 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5974 }
5975}
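
// A compile-time sketch of the aarch64_prefetch operand packing above; the
// helper name encodePrfOp is illustrative. Bit 4 selects a store vs. load
// prefetch, bit 3 selects the instruction vs. data cache, bits 2:1 select the
// target cache level and bit 0 selects the streaming policy.
constexpr unsigned encodePrfOp(unsigned IsWrite, unsigned Locality,
                               unsigned IsStream, unsigned IsData) {
  return (IsWrite << 4) | ((IsData ? 0u : 1u) << 3) | (Locality << 1) | IsStream;
}
static_assert(encodePrfOp(0, 0, 0, 1) == 0b00000, "data load, L1, keep");
static_assert(encodePrfOp(1, 2, 1, 1) == 0b10101, "data store, L3, streaming");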
5976
5977SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5978 SelectionDAG &DAG) const {
5979 unsigned IntNo = Op.getConstantOperandVal(1);
5980 SDLoc DL(Op);
5981 switch (IntNo) {
5982 default:
5983 return SDValue(); // Don't custom lower most intrinsics.
5984 case Intrinsic::aarch64_mops_memset_tag: {
5985 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
5986 SDValue Chain = Node->getChain();
5987 SDValue Dst = Op.getOperand(2);
5988 SDValue Val = Op.getOperand(3);
5989 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5990 SDValue Size = Op.getOperand(4);
5991 auto Alignment = Node->getMemOperand()->getAlign();
5992 bool IsVol = Node->isVolatile();
5993 auto DstPtrInfo = Node->getPointerInfo();
5994
5995 const auto &SDI =
5996 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5997 SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
5998 Chain, Dst, Val, Size, Alignment, IsVol,
5999 DstPtrInfo, MachinePointerInfo{});
6000
6001 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6002 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6003 // LowerOperationWrapper will complain that the number of results has
6004 // changed.
6005 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6006 }
6007 }
6008}
6009
6010SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6011 SelectionDAG &DAG) const {
6012 unsigned IntNo = Op.getConstantOperandVal(0);
6013 SDLoc dl(Op);
6014 switch (IntNo) {
6015 default: return SDValue(); // Don't custom lower most intrinsics.
6016 case Intrinsic::thread_pointer: {
6017 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6018 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
6019 }
6020 case Intrinsic::aarch64_neon_abs: {
6021 EVT Ty = Op.getValueType();
6022 if (Ty == MVT::i64) {
6023 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
6024 Op.getOperand(1));
6025 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
6026 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
6027 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
6028 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
6029 } else {
6030      report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
6031 }
6032 }
6033 case Intrinsic::aarch64_neon_pmull64: {
6034 SDValue LHS = Op.getOperand(1);
6035 SDValue RHS = Op.getOperand(2);
6036
6037 std::optional<uint64_t> LHSLane =
6038        getConstantLaneNumOfExtractHalfOperand(LHS);
6039    std::optional<uint64_t> RHSLane =
6040        getConstantLaneNumOfExtractHalfOperand(RHS);
6041
6042 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
6043 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
6044
6045    // 'aarch64_neon_pmull64' takes i64 parameters, while pmull/pmull2
6046 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
6047 // which ISel recognizes better. For example, generate a ldr into d*
6048 // registers as opposed to a GPR load followed by a fmov.
6049 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
6050 std::optional<uint64_t> OtherLane,
6051 const SDLoc &dl,
6052 SelectionDAG &DAG) -> SDValue {
6053      // If the operand is a higher half itself, rewrite it to
6054 // extract_high_v2i64; this way aarch64_neon_pmull64 could
6055 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
6056 if (NLane && *NLane == 1)
6057 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
6058 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
6059
6060 // Operand N is not a higher half but the other operand is.
6061 if (OtherLane && *OtherLane == 1) {
6062 // If this operand is a lower half, rewrite it to
6063 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
6064 // align lanes of two operands. A roundtrip sequence (to move from lane
6065 // 1 to lane 0) is like this:
6066 // mov x8, v0.d[1]
6067 // fmov d0, x8
6068 if (NLane && *NLane == 0)
6069 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
6070 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
6071 N.getOperand(0),
6072 DAG.getConstant(0, dl, MVT::i64)),
6073 DAG.getConstant(1, dl, MVT::i64));
6074
6075 // Otherwise just dup from main to all lanes.
6076 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
6077 }
6078
6079 // Neither operand is an extract of higher half, so codegen may just use
6080 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
6081 assert(N.getValueType() == MVT::i64 &&
6082 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
6083 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
6084 };
6085
6086 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
6087 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
6088
6089 return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
6090 }
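// Illustrative example (assumed IR shape): for
//   %r = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b)
// the canonicalisation above keeps both inputs in SIMD registers so ISel can
// emit "pmull v0.1q, v0.1d, v1.1d" directly, or "pmull2 v0.1q, v0.2d, v1.2d"
// when both operands are extracts of lane 1 of a v2i64.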
6091 case Intrinsic::aarch64_neon_smax:
6092 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
6093 Op.getOperand(1), Op.getOperand(2));
6094 case Intrinsic::aarch64_neon_umax:
6095 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
6096 Op.getOperand(1), Op.getOperand(2));
6097 case Intrinsic::aarch64_neon_smin:
6098 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
6099 Op.getOperand(1), Op.getOperand(2));
6100 case Intrinsic::aarch64_neon_umin:
6101 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
6102 Op.getOperand(1), Op.getOperand(2));
6103 case Intrinsic::aarch64_neon_scalar_sqxtn:
6104 case Intrinsic::aarch64_neon_scalar_sqxtun:
6105 case Intrinsic::aarch64_neon_scalar_uqxtn: {
6106 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
6107 if (Op.getValueType() == MVT::i32)
6108 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
6109 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
6110 Op.getOperand(0),
6111 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
6112 Op.getOperand(1))));
6113 return SDValue();
6114 }
6115 case Intrinsic::aarch64_neon_sqxtn:
6116 return DAG.getNode(ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(),
6117 Op.getOperand(1));
6118 case Intrinsic::aarch64_neon_sqxtun:
6119 return DAG.getNode(ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(),
6120 Op.getOperand(1));
6121 case Intrinsic::aarch64_neon_uqxtn:
6122 return DAG.getNode(ISD::TRUNCATE_USAT_U, dl, Op.getValueType(),
6123 Op.getOperand(1));
6124 case Intrinsic::aarch64_neon_sqshrn:
6125 if (Op.getValueType().isVector())
6126 return DAG.getNode(ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(),
6127 DAG.getNode(AArch64ISD::VASHR, dl,
6128 Op.getOperand(1).getValueType(),
6129 Op.getOperand(1), Op.getOperand(2)));
6130 return SDValue();
6131 case Intrinsic::aarch64_neon_sqshrun:
6132 if (Op.getValueType().isVector())
6133 return DAG.getNode(ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(),
6134 DAG.getNode(AArch64ISD::VASHR, dl,
6135 Op.getOperand(1).getValueType(),
6136 Op.getOperand(1), Op.getOperand(2)));
6137 return SDValue();
6138 case Intrinsic::aarch64_neon_uqshrn:
6139 if (Op.getValueType().isVector())
6140 return DAG.getNode(ISD::TRUNCATE_USAT_U, dl, Op.getValueType(),
6141 DAG.getNode(AArch64ISD::VLSHR, dl,
6142 Op.getOperand(1).getValueType(),
6143 Op.getOperand(1), Op.getOperand(2)));
6144 return SDValue();
6145 case Intrinsic::aarch64_neon_sqrshrn:
6146 if (Op.getValueType().isVector())
6147 return DAG.getNode(
6148 ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(),
6149 DAG.getNode(
6150 AArch64ISD::SRSHR_I, dl, Op.getOperand(1).getValueType(),
6151 Op.getOperand(1), Op.getOperand(2)));
6152 return SDValue();
6153 case Intrinsic::aarch64_neon_sqrshrun:
6154 if (Op.getValueType().isVector())
6155 return DAG.getNode(
6156 ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(),
6157 DAG.getNode(
6158 AArch64ISD::SRSHR_I, dl, Op.getOperand(1).getValueType(),
6159 Op.getOperand(1), Op.getOperand(2)));
6160 return SDValue();
6161 case Intrinsic::aarch64_neon_uqrshrn:
6162 if (Op.getValueType().isVector())
6163 return DAG.getNode(
6164 ISD::TRUNCATE_USAT_U, dl, Op.getValueType(),
6165 DAG.getNode(
6166 AArch64ISD::URSHR_I, dl, Op.getOperand(1).getValueType(), Op.getOperand(1), Op.getOperand(2)));
6167 return SDValue();
6168 case Intrinsic::aarch64_sve_whilelo:
6169 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
6170 /*IsEqual=*/false);
6171 case Intrinsic::aarch64_sve_whilelt:
6172 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
6173 /*IsEqual=*/false);
6174 case Intrinsic::aarch64_sve_whilels:
6175 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
6176 /*IsEqual=*/true);
6177 case Intrinsic::aarch64_sve_whilele:
6178 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
6179 /*IsEqual=*/true);
6180 case Intrinsic::aarch64_sve_sunpkhi:
6181 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
6182 Op.getOperand(1));
6183 case Intrinsic::aarch64_sve_sunpklo:
6184 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
6185 Op.getOperand(1));
6186 case Intrinsic::aarch64_sve_uunpkhi:
6187 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
6188 Op.getOperand(1));
6189 case Intrinsic::aarch64_sve_uunpklo:
6190 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
6191 Op.getOperand(1));
6192 case Intrinsic::aarch64_sve_clasta_n:
6193 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
6194 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6195 case Intrinsic::aarch64_sve_clastb_n:
6196 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
6197 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6198 case Intrinsic::aarch64_sve_lasta:
6199 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
6200 Op.getOperand(1), Op.getOperand(2));
6201 case Intrinsic::aarch64_sve_lastb:
6202 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
6203 Op.getOperand(1), Op.getOperand(2));
6204 case Intrinsic::aarch64_sve_rev:
6205 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
6206 Op.getOperand(1));
6207 case Intrinsic::aarch64_sve_tbl:
6208 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
6209 Op.getOperand(1), Op.getOperand(2));
6210 case Intrinsic::aarch64_sve_trn1:
6211 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
6212 Op.getOperand(1), Op.getOperand(2));
6213 case Intrinsic::aarch64_sve_trn2:
6214 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
6215 Op.getOperand(1), Op.getOperand(2));
6216 case Intrinsic::aarch64_sve_uzp1:
6217 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
6218 Op.getOperand(1), Op.getOperand(2));
6219 case Intrinsic::aarch64_sve_uzp2:
6220 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
6221 Op.getOperand(1), Op.getOperand(2));
6222 case Intrinsic::aarch64_sve_zip1:
6223 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
6224 Op.getOperand(1), Op.getOperand(2));
6225 case Intrinsic::aarch64_sve_zip2:
6226 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
6227 Op.getOperand(1), Op.getOperand(2));
6228 case Intrinsic::aarch64_sve_splice:
6229 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
6230 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6231 case Intrinsic::aarch64_sve_ptrue:
6232 return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
6233 case Intrinsic::aarch64_sve_clz:
6234 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
6235 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6236 case Intrinsic::aarch64_sme_cntsb:
6237 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
6238 DAG.getConstant(1, dl, MVT::i32));
6239 case Intrinsic::aarch64_sme_cntsh: {
6240 SDValue One = DAG.getConstant(1, dl, MVT::i32);
6241 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
6242 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
6243 }
6244 case Intrinsic::aarch64_sme_cntsw: {
6245 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
6246 DAG.getConstant(1, dl, MVT::i32));
6247 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
6248 DAG.getConstant(2, dl, MVT::i32));
6249 }
6250 case Intrinsic::aarch64_sme_cntsd: {
6251 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
6252 DAG.getConstant(1, dl, MVT::i32));
6253 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
6254 DAG.getConstant(3, dl, MVT::i32));
6255 }
6256 case Intrinsic::aarch64_sve_cnt: {
6257 SDValue Data = Op.getOperand(3);
6258 // CTPOP only supports integer operands.
6259 if (Data.getValueType().isFloatingPoint())
6260 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
6261 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
6262 Op.getOperand(2), Data, Op.getOperand(1));
6263 }
6264 case Intrinsic::aarch64_sve_dupq_lane:
6265 return LowerDUPQLane(Op, DAG);
6266 case Intrinsic::aarch64_sve_convert_from_svbool:
6267 if (Op.getValueType() == MVT::aarch64svcount)
6268 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
6269 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
6270 case Intrinsic::aarch64_sve_convert_to_svbool:
6271 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
6272 return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
6273 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
6274 case Intrinsic::aarch64_sve_fneg:
6275 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
6276 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6277 case Intrinsic::aarch64_sve_frintp:
6278 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
6279 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6280 case Intrinsic::aarch64_sve_frintm:
6281 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
6282 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6283 case Intrinsic::aarch64_sve_frinti:
6284 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
6285 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6286 case Intrinsic::aarch64_sve_frintx:
6287 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
6288 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6289 case Intrinsic::aarch64_sve_frinta:
6290 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
6291 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6292 case Intrinsic::aarch64_sve_frintn:
6293 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
6294 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6295 case Intrinsic::aarch64_sve_frintz:
6296 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
6297 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6298  case Intrinsic::aarch64_sve_ucvtf:
6299    return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
6300 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6301 Op.getOperand(1));
6302  case Intrinsic::aarch64_sve_scvtf:
6303    return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
6304 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6305 Op.getOperand(1));
6306  case Intrinsic::aarch64_sve_fcvtzu:
6307    return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
6308 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6309 Op.getOperand(1));
6310  case Intrinsic::aarch64_sve_fcvtzs:
6311    return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
6312 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6313 Op.getOperand(1));
6314 case Intrinsic::aarch64_sve_fsqrt:
6315 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
6316 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6317 case Intrinsic::aarch64_sve_frecpx:
6318 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
6319 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6320 case Intrinsic::aarch64_sve_frecpe_x:
6321 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
6322 Op.getOperand(1));
6323 case Intrinsic::aarch64_sve_frecps_x:
6324 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
6325 Op.getOperand(1), Op.getOperand(2));
6326 case Intrinsic::aarch64_sve_frsqrte_x:
6327 return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
6328 Op.getOperand(1));
6329 case Intrinsic::aarch64_sve_frsqrts_x:
6330 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
6331 Op.getOperand(1), Op.getOperand(2));
6332 case Intrinsic::aarch64_sve_fabs:
6333 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
6334 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6335 case Intrinsic::aarch64_sve_abs:
6336 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
6337 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6338 case Intrinsic::aarch64_sve_neg:
6339 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
6340 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6341 case Intrinsic::aarch64_sve_insr: {
6342 SDValue Scalar = Op.getOperand(2);
6343 EVT ScalarTy = Scalar.getValueType();
6344 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6345 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
6346
6347 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
6348 Op.getOperand(1), Scalar);
6349 }
6350  case Intrinsic::aarch64_sve_rbit:
6351    return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
6352 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6353 Op.getOperand(1));
6354 case Intrinsic::aarch64_sve_revb:
6355 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
6356 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6357 case Intrinsic::aarch64_sve_revh:
6358 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
6359 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6360 case Intrinsic::aarch64_sve_revw:
6361 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
6362 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6363 case Intrinsic::aarch64_sve_revd:
6364 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
6365 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6366 case Intrinsic::aarch64_sve_sxtb:
6367    return DAG.getNode(
6368        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6369 Op.getOperand(2), Op.getOperand(3),
6370 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6371 Op.getOperand(1));
6372 case Intrinsic::aarch64_sve_sxth:
6373    return DAG.getNode(
6374        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6375 Op.getOperand(2), Op.getOperand(3),
6376 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6377 Op.getOperand(1));
6378 case Intrinsic::aarch64_sve_sxtw:
6379    return DAG.getNode(
6380        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6381 Op.getOperand(2), Op.getOperand(3),
6382 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6383 Op.getOperand(1));
6384 case Intrinsic::aarch64_sve_uxtb:
6385    return DAG.getNode(
6386        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6387 Op.getOperand(2), Op.getOperand(3),
6388 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6389 Op.getOperand(1));
6390 case Intrinsic::aarch64_sve_uxth:
6391    return DAG.getNode(
6392        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6393 Op.getOperand(2), Op.getOperand(3),
6394 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6395 Op.getOperand(1));
6396 case Intrinsic::aarch64_sve_uxtw:
6397    return DAG.getNode(
6398        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6399 Op.getOperand(2), Op.getOperand(3),
6400 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6401 Op.getOperand(1));
6402 case Intrinsic::localaddress: {
6403 const auto &MF = DAG.getMachineFunction();
6404 const auto *RegInfo = Subtarget->getRegisterInfo();
6405 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6406 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
6407 Op.getSimpleValueType());
6408 }
6409
6410 case Intrinsic::eh_recoverfp: {
6411 // FIXME: This needs to be implemented to correctly handle highly aligned
6412 // stack objects. For now we simply return the incoming FP. Refer D53541
6413 // for more details.
6414 SDValue FnOp = Op.getOperand(1);
6415 SDValue IncomingFPOp = Op.getOperand(2);
6416 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
6417 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
6418    if (!Fn)
6419      report_fatal_error(
6420 "llvm.eh.recoverfp must take a function as the first argument");
6421 return IncomingFPOp;
6422 }
6423
6424 case Intrinsic::aarch64_neon_vsri:
6425 case Intrinsic::aarch64_neon_vsli:
6426 case Intrinsic::aarch64_sve_sri:
6427 case Intrinsic::aarch64_sve_sli: {
6428 EVT Ty = Op.getValueType();
6429
6430 if (!Ty.isVector())
6431 report_fatal_error("Unexpected type for aarch64_neon_vsli");
6432
6433 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6434
6435 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6436 IntNo == Intrinsic::aarch64_sve_sri;
6437 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6438 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
6439 Op.getOperand(3));
6440 }
6441
6442 case Intrinsic::aarch64_neon_srhadd:
6443 case Intrinsic::aarch64_neon_urhadd:
6444 case Intrinsic::aarch64_neon_shadd:
6445 case Intrinsic::aarch64_neon_uhadd: {
6446 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6447 IntNo == Intrinsic::aarch64_neon_shadd);
6448 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6449 IntNo == Intrinsic::aarch64_neon_urhadd);
6450 unsigned Opcode = IsSignedAdd
6451 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6452 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6453 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
6454 Op.getOperand(2));
6455 }
6456 case Intrinsic::aarch64_neon_saddlp:
6457 case Intrinsic::aarch64_neon_uaddlp: {
6458    unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6459                          ? AArch64ISD::UADDLP
6460                          : AArch64ISD::SADDLP;
6461 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
6462 }
6463 case Intrinsic::aarch64_neon_sdot:
6464 case Intrinsic::aarch64_neon_udot:
6465 case Intrinsic::aarch64_sve_sdot:
6466 case Intrinsic::aarch64_sve_udot: {
6467 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6468 IntNo == Intrinsic::aarch64_sve_udot)
6469                          ? AArch64ISD::UDOT
6470                          : AArch64ISD::SDOT;
6471    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
6472 Op.getOperand(2), Op.getOperand(3));
6473 }
6474 case Intrinsic::aarch64_neon_usdot:
6475 case Intrinsic::aarch64_sve_usdot: {
6476 return DAG.getNode(AArch64ISD::USDOT, dl, Op.getValueType(),
6477 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6478 }
6479 case Intrinsic::get_active_lane_mask: {
6480 SDValue ID =
6481 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
6482
6483 EVT VT = Op.getValueType();
6484 if (VT.isScalableVector())
6485 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
6486 Op.getOperand(2));
6487
6488 // We can use the SVE whilelo instruction to lower this intrinsic by
6489 // creating the appropriate sequence of scalable vector operations and
6490 // then extracting a fixed-width subvector from the scalable vector.
6491
6492 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
6493 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
6494
6495 SDValue Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WhileVT, ID,
6496 Op.getOperand(1), Op.getOperand(2));
6497 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, dl, ContainerVT, Mask);
6498 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
6499 DAG.getVectorIdxConstant(0, dl));
6500 }
6501 case Intrinsic::aarch64_neon_saddlv:
6502 case Intrinsic::aarch64_neon_uaddlv: {
6503 EVT OpVT = Op.getOperand(1).getValueType();
6504 EVT ResVT = Op.getValueType();
6505 assert(
6506 ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6507 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6508 (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6509 "Unexpected aarch64_neon_u/saddlv type");
6510 (void)OpVT;
6511 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6512 SDValue ADDLV = DAG.getNode(
6513 IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6514                                                : AArch64ISD::SADDLV,
6515        dl, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6516 SDValue EXTRACT_VEC_ELT = DAG.getNode(
6517 ISD::EXTRACT_VECTOR_ELT, dl, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6518 ADDLV, DAG.getConstant(0, dl, MVT::i64));
6519 return EXTRACT_VEC_ELT;
6520 }
6521 case Intrinsic::experimental_cttz_elts: {
6522 SDValue CttzOp = Op.getOperand(1);
6523 EVT VT = CttzOp.getValueType();
6524 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6525
6526 if (VT.isFixedLengthVector()) {
6527 // We can use SVE instructions to lower this intrinsic by first creating
6528 // an SVE predicate register mask from the fixed-width vector.
6529 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
6530 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, NewVT, CttzOp);
6531 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6532 }
6533
6534 SDValue NewCttzElts =
6535 DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, CttzOp);
6536 return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
6537 }
6538 case Intrinsic::experimental_vector_match: {
6539 return LowerVectorMatch(Op, DAG);
6540 }
6541 }
6542}
6543
6544bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6545 if (VT.getVectorElementType() == MVT::i8 ||
6546 VT.getVectorElementType() == MVT::i16) {
6547 EltTy = MVT::i32;
6548 return true;
6549 }
6550 return false;
6551}
6552
6553bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6554 EVT DataVT) const {
6555 const EVT IndexVT = Extend.getOperand(0).getValueType();
6556 // SVE only supports implicit extension of 32-bit indices.
6557 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6558 return false;
6559
6560 // Indices cannot be smaller than the main data type.
6561 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6562 return false;
6563
6564 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6565 // element container type, which would violate the previous clause.
6566 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6567}
6568
6569bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6570 EVT ExtVT = ExtVal.getValueType();
6571 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6572 return false;
6573
6574 // It may be worth creating extending masked loads if there are multiple
6575 // masked loads using the same predicate. That way we'll end up creating
6576 // extending masked loads that may then get split by the legaliser. This
6577 // results in just one set of predicate unpacks at the start, instead of
6578 // multiple sets of vector unpacks after each load.
6579 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6580 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6581 // Disable extending masked loads for fixed-width for now, since the code
6582 // quality doesn't look great.
6583 if (!ExtVT.isScalableVector())
6584 return false;
6585
6586 unsigned NumExtMaskedLoads = 0;
6587 for (auto *U : Ld->getMask()->users())
6588 if (isa<MaskedLoadSDNode>(U))
6589 NumExtMaskedLoads++;
6590
6591 if (NumExtMaskedLoads <= 1)
6592 return false;
6593 }
6594 }
6595
6596 return true;
6597}
6598
6599unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6600 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6601      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
6602       AArch64ISD::GLD1_MERGE_ZERO},
6603      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
6604       AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6605      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
6606       AArch64ISD::GLD1_MERGE_ZERO},
6607      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
6608       AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6609      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
6610       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6611      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
6612       AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6613      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
6614       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6615      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
6616       AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6617  };
6618 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
6619 return AddrModes.find(Key)->second;
6620}
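// Usage sketch (illustrative, based on the table above): an unscaled,
// unextended index maps to the plain GLD1 gather node, while
// getGatherVecOpcode(/*IsScaled=*/true, /*IsSigned=*/true,
// /*NeedsExtend=*/true) selects the SXTW-extended, scaled GLD1 form.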
6621
6622unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6623 switch (Opcode) {
6624 default:
6625 llvm_unreachable("unimplemented opcode");
6626    return Opcode;
6627  case AArch64ISD::GLD1_MERGE_ZERO:
6628    return AArch64ISD::GLD1S_MERGE_ZERO;
6629  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6630    return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6631  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6632    return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6633  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6634    return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6635  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6636    return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6637  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6638    return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6639  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6640    return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6641  }
6642}
6643
6644SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6645 SelectionDAG &DAG) const {
6646 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
6647
6648 SDLoc DL(Op);
6649 SDValue Chain = MGT->getChain();
6650 SDValue PassThru = MGT->getPassThru();
6651 SDValue Mask = MGT->getMask();
6652 SDValue BasePtr = MGT->getBasePtr();
6653 SDValue Index = MGT->getIndex();
6654 SDValue Scale = MGT->getScale();
6655 EVT VT = Op.getValueType();
6656 EVT MemVT = MGT->getMemoryVT();
6657 ISD::LoadExtType ExtType = MGT->getExtensionType();
6658 ISD::MemIndexType IndexType = MGT->getIndexType();
6659
6660 // SVE supports zero (and so undef) passthrough values only, everything else
6661 // must be handled manually by an explicit select on the load's output.
6662 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
6663 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6664 SDValue Load =
6665 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6666 MGT->getMemOperand(), IndexType, ExtType);
6667 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6668 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
6669 }
6670
6671 bool IsScaled = MGT->isIndexScaled();
6672 bool IsSigned = MGT->isIndexSigned();
6673
6674 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
6675  // must be calculated beforehand.
6676 uint64_t ScaleVal = Scale->getAsZExtVal();
6677 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6678 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6679 EVT IndexVT = Index.getValueType();
6680 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6681 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6682 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6683
6684 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6685 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6686 MGT->getMemOperand(), IndexType, ExtType);
6687 }
6688
6689 // Lower fixed length gather to a scalable equivalent.
6690 if (VT.isFixedLengthVector()) {
6691 assert(Subtarget->useSVEForFixedLengthVectors() &&
6692 "Cannot lower when not using SVE for fixed vectors!");
6693
6694    // NOTE: Handle floating-point as if integer then bitcast the result.
6695    EVT DataVT = VT.changeVectorElementTypeToInteger();
6696 MemVT = MemVT.changeVectorElementTypeToInteger();
6697
6698 // Find the smallest integer fixed length vector we can use for the gather.
6699 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6700 if (DataVT.getVectorElementType() == MVT::i64 ||
6701 Index.getValueType().getVectorElementType() == MVT::i64 ||
6702 Mask.getValueType().getVectorElementType() == MVT::i64)
6703 PromotedVT = VT.changeVectorElementType(MVT::i64);
6704
6705 // Promote vector operands except for passthrough, which we know is either
6706 // undef or zero, and thus best constructed directly.
6707 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6708 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6709 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6710
6711 // A promoted result type forces the need for an extending load.
6712 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6713 ExtType = ISD::EXTLOAD;
6714
6715 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6716
6717 // Convert fixed length vector operands to scalable.
6718 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6719    Index = convertToScalableVector(DAG, ContainerVT, Index);
6720    Mask = convertFixedMaskToScalableVector(Mask, DAG);
6721 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
6722 : DAG.getConstant(0, DL, ContainerVT);
6723
6724 // Emit equivalent scalable vector gather.
6725 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6726 SDValue Load =
6727 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
6728 Ops, MGT->getMemOperand(), IndexType, ExtType);
6729
6730 // Extract fixed length data then convert to the required result type.
6731 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
6732 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
6733 if (VT.isFloatingPoint())
6734 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
6735
6736 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6737 }
6738
6739 // Everything else is legal.
6740 return Op;
6741}
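// Worked example (illustrative, not from the original source): a fixed-length
// gather of <4 x half> goes down the path above roughly as follows: the data
// is treated as integer, the index and mask are promoted to i32 elements,
// everything is widened into scalable containers (e.g. nxv4i32), an extending
// masked gather is emitted, and the result is truncated back to the
// fixed-length type and bitcast to the original floating-point type.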
6742
6743SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6744 SelectionDAG &DAG) const {
6745 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
6746
6747 SDLoc DL(Op);
6748 SDValue Chain = MSC->getChain();
6749 SDValue StoreVal = MSC->getValue();
6750 SDValue Mask = MSC->getMask();
6751 SDValue BasePtr = MSC->getBasePtr();
6752 SDValue Index = MSC->getIndex();
6753 SDValue Scale = MSC->getScale();
6754 EVT VT = StoreVal.getValueType();
6755 EVT MemVT = MSC->getMemoryVT();
6756 ISD::MemIndexType IndexType = MSC->getIndexType();
6757 bool Truncating = MSC->isTruncatingStore();
6758
6759 bool IsScaled = MSC->isIndexScaled();
6760 bool IsSigned = MSC->isIndexSigned();
6761
6762 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
6763  // must be calculated beforehand.
6764 uint64_t ScaleVal = Scale->getAsZExtVal();
6765 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6766 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6767 EVT IndexVT = Index.getValueType();
6768 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6769 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6770 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6771
6772 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6773 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6774 MSC->getMemOperand(), IndexType, Truncating);
6775 }
6776
6777 // Lower fixed length scatter to a scalable equivalent.
6778 if (VT.isFixedLengthVector()) {
6779 assert(Subtarget->useSVEForFixedLengthVectors() &&
6780 "Cannot lower when not using SVE for fixed vectors!");
6781
6782 // Once bitcast we treat floating-point scatters as if integer.
6783    if (VT.isFloatingPoint()) {
6784      VT = VT.changeVectorElementTypeToInteger();
6785 MemVT = MemVT.changeVectorElementTypeToInteger();
6786 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6787 }
6788
6789 // Find the smallest integer fixed length vector we can use for the scatter.
6790 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6791 if (VT.getVectorElementType() == MVT::i64 ||
6792 Index.getValueType().getVectorElementType() == MVT::i64 ||
6793 Mask.getValueType().getVectorElementType() == MVT::i64)
6794 PromotedVT = VT.changeVectorElementType(MVT::i64);
6795
6796 // Promote vector operands.
6797 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6798 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6799 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6800 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6801
6802 // A promoted value type forces the need for a truncating store.
6803 if (PromotedVT != VT)
6804 Truncating = true;
6805
6806 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6807
6808 // Convert fixed length vector operands to scalable.
6809 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6810    Index = convertToScalableVector(DAG, ContainerVT, Index);
6811    Mask = convertFixedMaskToScalableVector(Mask, DAG);
6812 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6813
6814 // Emit equivalent scalable vector scatter.
6815 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6816 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6817 MSC->getMemOperand(), IndexType, Truncating);
6818 }
6819
6820 // Everything else is legal.
6821 return Op;
6822}
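// Worked example (illustrative): a scatter of i32 elements whose index is
// pre-scaled by 8 does not match SVE's sizeof(element) scaling, so the code
// above rewrites it as "index << 3" with the scale reset to 1 before emitting
// the scalable scatter.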
6823
6824SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6825 SDLoc DL(Op);
6826 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6827 assert(LoadNode && "Expected custom lowering of a masked load node");
6828 EVT VT = Op->getValueType(0);
6829
6830 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6831 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6832
6833 SDValue PassThru = LoadNode->getPassThru();
6834 SDValue Mask = LoadNode->getMask();
6835
6836 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
6837 return Op;
6838
6839  SDValue Load = DAG.getMaskedLoad(
6840      VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
6841 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
6842 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
6843 LoadNode->getExtensionType());
6844
6845 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6846
6847 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6848}
6849
6850// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6851static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
6852 EVT VT, EVT MemVT,
6853 SelectionDAG &DAG) {
6854 assert(VT.isVector() && "VT should be a vector type");
6855 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6856
6857 SDValue Value = ST->getValue();
6858
6859  // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
6860  // extracts the word lane which represents the v4i8 subvector. It
6861  // optimizes the store to:
6862 //
6863 // xtn v0.8b, v0.8h
6864 // str s0, [x0]
6865
6866 SDValue Undef = DAG.getUNDEF(MVT::i16);
6867 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6868 {Undef, Undef, Undef, Undef});
6869
6870 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6871 Value, UndefVec);
6872 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
6873
6874 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6875 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6876 Trunc, DAG.getConstant(0, DL, MVT::i64));
6877
6878 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
6879 ST->getBasePtr(), ST->getMemOperand());
6880}
6881
6882// Custom lowering for any store, vector or scalar, and/or default or
6883// truncating stores. Currently we only custom lower truncating stores from
6884// vector v4i16 to v4i8 and volatile stores of i128.
6885SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6886 SelectionDAG &DAG) const {
6887 SDLoc Dl(Op);
6888 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
6889 assert (StoreNode && "Can only custom lower store nodes");
6890
6891 SDValue Value = StoreNode->getValue();
6892
6893 EVT VT = Value.getValueType();
6894 EVT MemVT = StoreNode->getMemoryVT();
6895
6896  if (VT.isVector()) {
6897    if (useSVEForFixedLengthVectorVT(
6898 VT,
6899 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6900 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6901
6902 unsigned AS = StoreNode->getAddressSpace();
6903 Align Alignment = StoreNode->getAlign();
6904 if (Alignment < MemVT.getStoreSize() &&
6905 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
6906 StoreNode->getMemOperand()->getFlags(),
6907 nullptr)) {
6908 return scalarizeVectorStore(StoreNode, DAG);
6909 }
6910
6911 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6912 MemVT == MVT::v4i8) {
6913 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
6914 }
6915 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
6916 // the custom lowering, as there are no un-paired non-temporal stores and
6917    // legalization will break up 256 bit inputs.
6918    ElementCount EC = MemVT.getVectorElementCount();
6919 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6920 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6921 (MemVT.getScalarSizeInBits() == 8u ||
6922 MemVT.getScalarSizeInBits() == 16u ||
6923 MemVT.getScalarSizeInBits() == 32u ||
6924 MemVT.getScalarSizeInBits() == 64u)) {
6925      SDValue Lo =
6926          DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6927                      MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6928 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
6929      SDValue Hi =
6930          DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6931                      MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6932 StoreNode->getValue(),
6933 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
6934      SDValue Result = DAG.getMemIntrinsicNode(
6935          AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
6936 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6937 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6938 return Result;
6939 }
6940 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6941 return LowerStore128(Op, DAG);
6942 } else if (MemVT == MVT::i64x8) {
6943 SDValue Value = StoreNode->getValue();
6944 assert(Value->getValueType(0) == MVT::i64x8);
6945 SDValue Chain = StoreNode->getChain();
6946 SDValue Base = StoreNode->getBasePtr();
6947 EVT PtrVT = Base.getValueType();
6948 for (unsigned i = 0; i < 8; i++) {
6949 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6950 Value, DAG.getConstant(i, Dl, MVT::i32));
6951 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
6952 DAG.getConstant(i * 8, Dl, PtrVT));
6953 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
6954 StoreNode->getOriginalAlign());
6955 }
6956 return Chain;
6957 }
6958
6959 return SDValue();
6960}
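// Illustrative example (assumption about typical codegen): a non-temporal
// store of a 256-bit vector such as v8i32 is split above into two v4i32
// halves via EXTRACT_SUBVECTOR and emitted as one AArch64ISD::STNP node,
// which normally selects to a single "stnp q0, q1, [x0]" instruction.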
6961
6962/// Lower atomic or volatile 128-bit stores to a single STP instruction.
6963SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6964 SelectionDAG &DAG) const {
6965 MemSDNode *StoreNode = cast<MemSDNode>(Op);
6966 assert(StoreNode->getMemoryVT() == MVT::i128);
6967 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6968
6969  bool IsStoreRelease =
6970      StoreNode->getMergedOrdering() == AtomicOrdering::Release;
6971 if (StoreNode->isAtomic())
6972 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6973 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6974           StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
6975           StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
6976
6977 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6978 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6979 ? StoreNode->getOperand(1)
6980 : StoreNode->getOperand(2);
6981 SDLoc DL(Op);
6982 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
6983 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
6984 if (DAG.getDataLayout().isBigEndian())
6985 std::swap(StoreValue.first, StoreValue.second);
6986  SDValue Result = DAG.getMemIntrinsicNode(
6987      Opcode, DL, DAG.getVTList(MVT::Other),
6988 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6989 StoreNode->getBasePtr()},
6990 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6991 return Result;
6992}
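// Illustrative note (assumption about typical codegen): with LSE2, a volatile
// or atomic i128 store becomes a single "stp x0, x1, [x2]"-style pair store,
// and with RCPC3 a release store uses STILP instead; the value is simply
// split into two i64 halves (swapped on big-endian) as shown above.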
6993
6994SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6995 SelectionDAG &DAG) const {
6996 SDLoc DL(Op);
6997 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
6998 assert(LoadNode && "Expected custom lowering of a load node");
6999
7000  if (LoadNode->getMemoryVT() == MVT::i64x8) {
7001    SmallVector<SDValue, 8> Ops;
7002 SDValue Base = LoadNode->getBasePtr();
7003 SDValue Chain = LoadNode->getChain();
7004 EVT PtrVT = Base.getValueType();
7005 for (unsigned i = 0; i < 8; i++) {
7006 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
7007 DAG.getConstant(i * 8, DL, PtrVT));
7008 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
7009 LoadNode->getPointerInfo(),
7010 LoadNode->getOriginalAlign());
7011 Ops.push_back(Part);
7012 Chain = SDValue(Part.getNode(), 1);
7013 }
7014 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
7015 return DAG.getMergeValues({Loaded, Chain}, DL);
7016 }
7017
7018 // Custom lowering for extending v4i8 vector loads.
7019 EVT VT = Op->getValueType(0);
7020 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
7021
7022 if (LoadNode->getMemoryVT() != MVT::v4i8)
7023 return SDValue();
7024
7025 // Avoid generating unaligned loads.
7026 if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
7027 return SDValue();
7028
7029 unsigned ExtType;
7030 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
7031 ExtType = ISD::SIGN_EXTEND;
7032 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
7033 LoadNode->getExtensionType() == ISD::EXTLOAD)
7034 ExtType = ISD::ZERO_EXTEND;
7035 else
7036 return SDValue();
7037
7038 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
7039 LoadNode->getBasePtr(), MachinePointerInfo());
7040 SDValue Chain = Load.getValue(1);
7041 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
7042 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
7043 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
7044 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
7045 DAG.getConstant(0, DL, MVT::i64));
7046 if (VT == MVT::v4i32)
7047 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
7048 return DAG.getMergeValues({Ext, Chain}, DL);
7049}
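// Illustrative codegen for the v4i8 extending-load path above (assumption,
// exact output depends on surrounding code): the four bytes are loaded as an
// f32 ("ldr s0, [x0]"), reinterpreted as v8i8, widened with ushll/sshll, and
// the low v4i16 half (or a further extension to v4i32) is the result.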
7050
7051SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
7052 SelectionDAG &DAG) const {
7053 SDLoc DL(Op);
7054 SDValue Vec = Op.getOperand(0);
7055 SDValue Mask = Op.getOperand(1);
7056 SDValue Passthru = Op.getOperand(2);
7057 EVT VecVT = Vec.getValueType();
7058 EVT MaskVT = Mask.getValueType();
7059 EVT ElmtVT = VecVT.getVectorElementType();
7060 const bool IsFixedLength = VecVT.isFixedLengthVector();
7061 const bool HasPassthru = !Passthru.isUndef();
7062 unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
7063 EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7064
7065 assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
7066
7067 if (!Subtarget->isSVEAvailable())
7068 return SDValue();
7069
7070 if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
7071 return SDValue();
7072
7073 // Only <vscale x {4|2} x {i32|i64}> supported for compact.
7074 if (MinElmts != 2 && MinElmts != 4)
7075 return SDValue();
7076
7077 // We can use the SVE register containing the NEON vector in its lowest bits.
7078 if (IsFixedLength) {
7079 EVT ScalableVecVT =
7080 MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7081 EVT ScalableMaskVT = MVT::getScalableVectorVT(
7082 MaskVT.getVectorElementType().getSimpleVT(), MinElmts);
7083
7084 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7085 DAG.getUNDEF(ScalableVecVT), Vec,
7086 DAG.getConstant(0, DL, MVT::i64));
7087 Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
7088 DAG.getUNDEF(ScalableMaskVT), Mask,
7089 DAG.getConstant(0, DL, MVT::i64));
7090    Mask = DAG.getNode(ISD::TRUNCATE, DL,
7091        ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
7092 Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7093 DAG.getUNDEF(ScalableVecVT), Passthru,
7094 DAG.getConstant(0, DL, MVT::i64));
7095
7096 VecVT = Vec.getValueType();
7097 MaskVT = Mask.getValueType();
7098 }
7099
7100 // Get legal type for compact instruction
7101 EVT ContainerVT = getSVEContainerType(VecVT);
7102 EVT CastVT = VecVT.changeVectorElementTypeToInteger();
7103
7104 // Convert to i32 or i64 for smaller types, as these are the only supported
7105 // sizes for compact.
7106 if (ContainerVT != VecVT) {
7107 Vec = DAG.getBitcast(CastVT, Vec);
7108 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
7109 }
7110
7111  SDValue Compressed = DAG.getNode(
7112      ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(),
7113 DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec);
7114
7115 // compact fills with 0s, so if our passthru is all 0s, do nothing here.
7116 if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
7117 SDValue Offset = DAG.getNode(
7118 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7119 DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask);
7120
7121 SDValue IndexMask = DAG.getNode(
7122 ISD::INTRINSIC_WO_CHAIN, DL, MaskVT,
7123 DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64),
7124 DAG.getConstant(0, DL, MVT::i64), Offset);
7125
7126 Compressed =
7127 DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru);
7128 }
7129
7130 // Extracting from a legal SVE type before truncating produces better code.
7131 if (IsFixedLength) {
7132    Compressed = DAG.getNode(
7133        ISD::EXTRACT_SUBVECTOR, DL,
7134 FixedVecVT.changeVectorElementType(ContainerVT.getVectorElementType()),
7135 Compressed, DAG.getConstant(0, DL, MVT::i64));
7136 CastVT = FixedVecVT.changeVectorElementTypeToInteger();
7137 VecVT = FixedVecVT;
7138 }
7139
7140 // If we changed the element type before, we need to convert it back.
7141 if (ContainerVT != VecVT) {
7142 Compressed = DAG.getNode(ISD::TRUNCATE, DL, CastVT, Compressed);
7143 Compressed = DAG.getBitcast(VecVT, Compressed);
7144 }
7145
7146 return Compressed;
7147}
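// Worked example (illustrative): a fixed-length <4 x i32> VECTOR_COMPRESS is
// placed in the low 128 bits of an SVE register and lowered to the SVE
// "compact" instruction; when a non-zero passthru is present, a
// whilelo(0, cntp(mask)) predicate keeps the compacted lanes and selects the
// passthru for the remaining ones, as in the code above.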
7148
7149// Generate SUBS and CSEL for integer abs.
7150SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7151 MVT VT = Op.getSimpleValueType();
7152
7153 if (VT.isVector())
7154 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7155
7156 SDLoc DL(Op);
7157 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
7158 Op.getOperand(0));
7159 // Generate SUBS & CSEL.
7160 SDValue Cmp =
7161 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
7162 Op.getOperand(0), DAG.getConstant(0, DL, VT));
7163 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7164 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
7165 Cmp.getValue(1));
7166}
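// Illustrative result (assumption about typical codegen): for a scalar i64
// abs the SUBS + CSEL pair above is usually printed as
//   cmp  x0, #0
//   cneg x0, x0, mi
// i.e. the value is negated only when the original was negative.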
7167
7168static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
7169  SDValue Chain = Op.getOperand(0);
7170 SDValue Cond = Op.getOperand(1);
7171 SDValue Dest = Op.getOperand(2);
7172
7173  AArch64CC::CondCode CC;
7174  if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
7175 SDLoc dl(Op);
7176 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
7177 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
7178 Cmp);
7179 }
7180
7181 return SDValue();
7182}
7183
7184// Treat FSHR with constant shifts as a legal operation, otherwise it is
7185// expanded. FSHL is converted to FSHR before deciding what to do with it.
7186static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
7187 SDValue Shifts = Op.getOperand(2);
7188 // Check if the shift amount is a constant
7189 // If opcode is FSHL, convert it to FSHR
7190 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
7191 SDLoc DL(Op);
7192 MVT VT = Op.getSimpleValueType();
7193
7194 if (Op.getOpcode() == ISD::FSHL) {
7195 unsigned int NewShiftNo =
7196 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
7197 return DAG.getNode(
7198 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7199 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7200 } else if (Op.getOpcode() == ISD::FSHR) {
7201 return Op;
7202 }
7203 }
7204
7205 return SDValue();
7206}
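// Worked example (illustrative): fshl(a, b, 8) on i32 is rewritten above as
// fshr(a, b, 32 - 8) = fshr(a, b, 24), which ISel can then match to a single
// EXTR instruction (roughly "extr w0, w0, w1, #24"); non-constant shift
// amounts fall through and are expanded generically.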
7207
7208static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
7209  SDValue X = Op.getOperand(0);
7210 EVT XScalarTy = X.getValueType();
7211 SDValue Exp = Op.getOperand(1);
7212
7213 SDLoc DL(Op);
7214 EVT XVT, ExpVT;
7215 switch (Op.getSimpleValueType().SimpleTy) {
7216 default:
7217 return SDValue();
7218 case MVT::bf16:
7219 case MVT::f16:
7220 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
7221 [[fallthrough]];
7222 case MVT::f32:
7223 XVT = MVT::nxv4f32;
7224 ExpVT = MVT::nxv4i32;
7225 break;
7226 case MVT::f64:
7227 XVT = MVT::nxv2f64;
7228 ExpVT = MVT::nxv2i64;
7229 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
7230 break;
7231 }
7232
7233 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7234 SDValue VX =
7235 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
7236 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
7237 DAG.getUNDEF(ExpVT), Exp, Zero);
7238 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
7239 AArch64SVEPredPattern::all);
7240  SDValue FScale =
7241      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
7242 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
7243 VPg, VX, VExp);
7244 SDValue Final =
7245 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
7246 if (X.getValueType() != XScalarTy)
7247 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
7248 DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
7249 return Final;
7250}
7251
7252SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7253 SelectionDAG &DAG) const {
7254 // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
7255  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
7256    report_fatal_error(
7257 "ADJUST_TRAMPOLINE operation is only supported on Linux.");
7258
7259 return Op.getOperand(0);
7260}
7261
7262SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7263 SelectionDAG &DAG) const {
7264
7265 // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
7266 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
7267 report_fatal_error("INIT_TRAMPOLINE operation is only supported on Linux.");
7268
7269 SDValue Chain = Op.getOperand(0);
7270 SDValue Trmp = Op.getOperand(1); // trampoline
7271 SDValue FPtr = Op.getOperand(2); // nested function
7272 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7273 SDLoc dl(Op);
7274
7275 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7276 Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
7277
7278  TargetLowering::ArgListTy Args;
7279  TargetLowering::ArgListEntry Entry;
7280
7281 Entry.Ty = IntPtrTy;
7282 Entry.Node = Trmp;
7283 Args.push_back(Entry);
7284
7285  if (auto *FI = dyn_cast<FrameIndexSDNode>(Trmp.getNode())) {
7286    MachineFunction &MF = DAG.getMachineFunction();
7287 MachineFrameInfo &MFI = MF.getFrameInfo();
7288 Entry.Node =
7289 DAG.getConstant(MFI.getObjectSize(FI->getIndex()), dl, MVT::i64);
7290 } else
7291 Entry.Node = DAG.getConstant(36, dl, MVT::i64);
7292
7293 Args.push_back(Entry);
7294 Entry.Node = FPtr;
7295 Args.push_back(Entry);
7296 Entry.Node = Nest;
7297 Args.push_back(Entry);
7298
7299  // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
7300  TargetLowering::CallLoweringInfo CLI(DAG);
7301 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
7302      CallingConv::C, Type::getVoidTy(*DAG.getContext()),
7303      DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));
7304
7305 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
7306 return CallResult.second;
7307}
7308
7309SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
7310 SelectionDAG &DAG) const {
7311 LLVM_DEBUG(dbgs() << "Custom lowering: ");
7312 LLVM_DEBUG(Op.dump());
7313
7314 switch (Op.getOpcode()) {
7315 default:
7316 llvm_unreachable("unimplemented operand");
7317 return SDValue();
7318 case ISD::BITCAST:
7319 return LowerBITCAST(Op, DAG);
7320 case ISD::GlobalAddress:
7321 return LowerGlobalAddress(Op, DAG);
7322  case ISD::GlobalTLSAddress:
7323    return LowerGlobalTLSAddress(Op, DAG);
7324  case ISD::PtrAuthGlobalAddress:
7325    return LowerPtrAuthGlobalAddress(Op, DAG);
7326  case ISD::ADJUST_TRAMPOLINE:
7327    return LowerADJUST_TRAMPOLINE(Op, DAG);
7328  case ISD::INIT_TRAMPOLINE:
7329    return LowerINIT_TRAMPOLINE(Op, DAG);
7330 case ISD::SETCC:
7331 case ISD::STRICT_FSETCC:
7332  case ISD::STRICT_FSETCCS:
7333    return LowerSETCC(Op, DAG);
7334 case ISD::SETCCCARRY:
7335 return LowerSETCCCARRY(Op, DAG);
7336 case ISD::BRCOND:
7337 return LowerBRCOND(Op, DAG);
7338 case ISD::BR_CC:
7339 return LowerBR_CC(Op, DAG);
7340 case ISD::SELECT:
7341 return LowerSELECT(Op, DAG);
7342 case ISD::SELECT_CC:
7343 return LowerSELECT_CC(Op, DAG);
7344 case ISD::JumpTable:
7345 return LowerJumpTable(Op, DAG);
7346 case ISD::BR_JT:
7347 return LowerBR_JT(Op, DAG);
7348 case ISD::BRIND:
7349 return LowerBRIND(Op, DAG);
7350 case ISD::ConstantPool:
7351 return LowerConstantPool(Op, DAG);
7352 case ISD::BlockAddress:
7353 return LowerBlockAddress(Op, DAG);
7354 case ISD::VASTART:
7355 return LowerVASTART(Op, DAG);
7356 case ISD::VACOPY:
7357 return LowerVACOPY(Op, DAG);
7358 case ISD::VAARG:
7359 return LowerVAARG(Op, DAG);
7360 case ISD::UADDO_CARRY:
7361 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
7362 case ISD::USUBO_CARRY:
7363 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
7364 case ISD::SADDO_CARRY:
7365 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
7366 case ISD::SSUBO_CARRY:
7367 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
7368 case ISD::SADDO:
7369 case ISD::UADDO:
7370 case ISD::SSUBO:
7371 case ISD::USUBO:
7372 case ISD::SMULO:
7373 case ISD::UMULO:
7374 return LowerXALUO(Op, DAG);
7375 case ISD::FADD:
7376 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
7377 case ISD::FSUB:
7378 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
7379 case ISD::FMUL:
7380 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
7381 case ISD::FMA:
7382 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
7383 case ISD::FDIV:
7384 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
7385 case ISD::FNEG:
7386 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7387 case ISD::FCEIL:
7388 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
7389 case ISD::FFLOOR:
7390 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
7391 case ISD::FNEARBYINT:
7392 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
7393 case ISD::FRINT:
7394 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
7395 case ISD::FROUND:
7396 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
7397 case ISD::FROUNDEVEN:
7398 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
7399 case ISD::FTRUNC:
7400 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
7401 case ISD::FSQRT:
7402 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
7403 case ISD::FABS:
7404 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
7405 case ISD::FP_ROUND:
7406  case ISD::STRICT_FP_ROUND:
7407    return LowerFP_ROUND(Op, DAG);
7408 case ISD::FP_EXTEND:
7409  case ISD::STRICT_FP_EXTEND:
7410    return LowerFP_EXTEND(Op, DAG);
7411 case ISD::FRAMEADDR:
7412 return LowerFRAMEADDR(Op, DAG);
7413 case ISD::SPONENTRY:
7414 return LowerSPONENTRY(Op, DAG);
7415 case ISD::RETURNADDR:
7416 return LowerRETURNADDR(Op, DAG);
7417  case ISD::ADDROFRETURNADDR:
7418    return LowerADDROFRETURNADDR(Op, DAG);
7419  case ISD::CONCAT_VECTORS:
7420    return LowerCONCAT_VECTORS(Op, DAG);
7421  case ISD::INSERT_VECTOR_ELT:
7422    return LowerINSERT_VECTOR_ELT(Op, DAG);
7423  case ISD::EXTRACT_VECTOR_ELT:
7424    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7425 case ISD::BUILD_VECTOR:
7426 return LowerBUILD_VECTOR(Op, DAG);
7427  case ISD::ZERO_EXTEND_VECTOR_INREG:
7428    return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
7429  case ISD::VECTOR_SHUFFLE:
7430    return LowerVECTOR_SHUFFLE(Op, DAG);
7431 case ISD::SPLAT_VECTOR:
7432 return LowerSPLAT_VECTOR(Op, DAG);
7433  case ISD::EXTRACT_SUBVECTOR:
7434    return LowerEXTRACT_SUBVECTOR(Op, DAG);
7435  case ISD::INSERT_SUBVECTOR:
7436    return LowerINSERT_SUBVECTOR(Op, DAG);
7437 case ISD::SDIV:
7438 case ISD::UDIV:
7439 return LowerDIV(Op, DAG);
7440 case ISD::SMIN:
7441 case ISD::UMIN:
7442 case ISD::SMAX:
7443 case ISD::UMAX:
7444 return LowerMinMax(Op, DAG);
7445 case ISD::SRA:
7446 case ISD::SRL:
7447 case ISD::SHL:
7448 return LowerVectorSRA_SRL_SHL(Op, DAG);
7449 case ISD::SHL_PARTS:
7450 case ISD::SRL_PARTS:
7451 case ISD::SRA_PARTS:
7452 return LowerShiftParts(Op, DAG);
7453 case ISD::CTPOP:
7454 case ISD::PARITY:
7455 return LowerCTPOP_PARITY(Op, DAG);
7456 case ISD::FCOPYSIGN:
7457 return LowerFCOPYSIGN(Op, DAG);
7458 case ISD::OR:
7459 return LowerVectorOR(Op, DAG);
7460 case ISD::XOR:
7461 return LowerXOR(Op, DAG);
7462 case ISD::PREFETCH:
7463 return LowerPREFETCH(Op, DAG);
7464 case ISD::SINT_TO_FP:
7465 case ISD::UINT_TO_FP:
7466  case ISD::STRICT_SINT_TO_FP:
7467  case ISD::STRICT_UINT_TO_FP:
7468    return LowerINT_TO_FP(Op, DAG);
7469 case ISD::FP_TO_SINT:
7470 case ISD::FP_TO_UINT:
7471  case ISD::STRICT_FP_TO_SINT:
7472  case ISD::STRICT_FP_TO_UINT:
7473    return LowerFP_TO_INT(Op, DAG);
7474  case ISD::FP_TO_SINT_SAT:
7475  case ISD::FP_TO_UINT_SAT:
7476    return LowerFP_TO_INT_SAT(Op, DAG);
7477 case ISD::FSINCOS:
7478 return LowerFSINCOS(Op, DAG);
7479 case ISD::GET_ROUNDING:
7480 return LowerGET_ROUNDING(Op, DAG);
7481 case ISD::SET_ROUNDING:
7482 return LowerSET_ROUNDING(Op, DAG);
7483 case ISD::GET_FPMODE:
7484 return LowerGET_FPMODE(Op, DAG);
7485 case ISD::SET_FPMODE:
7486 return LowerSET_FPMODE(Op, DAG);
7487 case ISD::RESET_FPMODE:
7488 return LowerRESET_FPMODE(Op, DAG);
7489 case ISD::MUL:
7490 return LowerMUL(Op, DAG);
7491 case ISD::MULHS:
7492 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
7493 case ISD::MULHU:
7494 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
7495  case ISD::INTRINSIC_W_CHAIN:
7496    return LowerINTRINSIC_W_CHAIN(Op, DAG);
7497  case ISD::INTRINSIC_WO_CHAIN:
7498    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7499  case ISD::INTRINSIC_VOID:
7500    return LowerINTRINSIC_VOID(Op, DAG);
7501 case ISD::ATOMIC_STORE:
7502 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
7503 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
7504 return LowerStore128(Op, DAG);
7505 }
7506 return SDValue();
7507 case ISD::STORE:
7508 return LowerSTORE(Op, DAG);
7509 case ISD::MSTORE:
7510 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
7511 case ISD::MGATHER:
7512 return LowerMGATHER(Op, DAG);
7513 case ISD::MSCATTER:
7514 return LowerMSCATTER(Op, DAG);
7515  case ISD::VECREDUCE_SEQ_FADD:
7516    return LowerVECREDUCE_SEQ_FADD(Op, DAG);
7517 case ISD::VECREDUCE_ADD:
7518 case ISD::VECREDUCE_AND:
7519 case ISD::VECREDUCE_OR:
7520 case ISD::VECREDUCE_XOR:
7521  case ISD::VECREDUCE_SMAX:
7522  case ISD::VECREDUCE_SMIN:
7523  case ISD::VECREDUCE_UMAX:
7524  case ISD::VECREDUCE_UMIN:
7525  case ISD::VECREDUCE_FADD:
7526  case ISD::VECREDUCE_FMAX:
7527  case ISD::VECREDUCE_FMIN:
7528  case ISD::VECREDUCE_FMAXIMUM:
7529  case ISD::VECREDUCE_FMINIMUM:
7530    return LowerVECREDUCE(Op, DAG);
7531  case ISD::ATOMIC_LOAD_AND:
7532    return LowerATOMIC_LOAD_AND(Op, DAG);
7533  case ISD::DYNAMIC_STACKALLOC:
7534    return LowerDYNAMIC_STACKALLOC(Op, DAG);
7535 case ISD::VSCALE:
7536 return LowerVSCALE(Op, DAG);
7537  case ISD::VECTOR_COMPRESS:
7538    return LowerVECTOR_COMPRESS(Op, DAG);
7539 case ISD::ANY_EXTEND:
7540 case ISD::SIGN_EXTEND:
7541 case ISD::ZERO_EXTEND:
7542 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
7543  case ISD::SIGN_EXTEND_INREG: {
7544    // Only custom lower when ExtraVT has a legal byte based element type.
7545 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
7546 EVT ExtraEltVT = ExtraVT.getVectorElementType();
7547 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
7548 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
7549 return SDValue();
7550
7551 return LowerToPredicatedOp(Op, DAG,
7552 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
7553 }
7554 case ISD::TRUNCATE:
7555 return LowerTRUNCATE(Op, DAG);
7556 case ISD::MLOAD:
7557 return LowerMLOAD(Op, DAG);
7558 case ISD::LOAD:
7559 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
7560 !Subtarget->isNeonAvailable()))
7561 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
7562 return LowerLOAD(Op, DAG);
7563 case ISD::ADD:
7564 case ISD::AND:
7565 case ISD::SUB:
7566 return LowerToScalableOp(Op, DAG);
7567 case ISD::FMAXIMUM:
7568 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
7569 case ISD::FMAXNUM:
7570 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
7571 case ISD::FMINIMUM:
7572 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
7573 case ISD::FMINNUM:
7574 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
7575 case ISD::VSELECT:
7576 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
7577 case ISD::ABS:
7578 return LowerABS(Op, DAG);
7579 case ISD::ABDS:
7580 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
7581 case ISD::ABDU:
7582 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
7583 case ISD::AVGFLOORS:
7584 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
7585 case ISD::AVGFLOORU:
7586 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
7587 case ISD::AVGCEILS:
7588 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
7589 case ISD::AVGCEILU:
7590 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
7591 case ISD::BITREVERSE:
7592 return LowerBitreverse(Op, DAG);
7593 case ISD::BSWAP:
7594 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
7595 case ISD::CTLZ:
7596 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
7597 case ISD::CTTZ:
7598 return LowerCTTZ(Op, DAG);
7599 case ISD::VECTOR_SPLICE:
7600 return LowerVECTOR_SPLICE(Op, DAG);
7601 case ISD::VECTOR_DEINTERLEAVE:
7602 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
7603 case ISD::VECTOR_INTERLEAVE:
7604 return LowerVECTOR_INTERLEAVE(Op, DAG);
7605 case ISD::LRINT:
7606 case ISD::LLRINT:
7607 if (Op.getValueType().isVector())
7608 return LowerVectorXRINT(Op, DAG);
7609 [[fallthrough]];
7610 case ISD::LROUND:
7611 case ISD::LLROUND: {
7612 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
7613 Op.getOperand(0).getValueType() == MVT::bf16) &&
7614 "Expected custom lowering of rounding operations only for f16");
7615 SDLoc DL(Op);
7616 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
7617 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
7618 }
7619 case ISD::STRICT_LROUND:
7620 case ISD::STRICT_LLROUND:
7621 case ISD::STRICT_LRINT:
7622 case ISD::STRICT_LLRINT: {
7623 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
7624 Op.getOperand(1).getValueType() == MVT::bf16) &&
7625 "Expected custom lowering of rounding operations only for f16");
7626 SDLoc DL(Op);
7627 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
7628 {Op.getOperand(0), Op.getOperand(1)});
7629 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
7630 {Ext.getValue(1), Ext.getValue(0)});
7631 }
7632 case ISD::WRITE_REGISTER: {
7633 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
7634 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
7635 SDLoc DL(Op);
7636
7637 SDValue Chain = Op.getOperand(0);
7638 SDValue SysRegName = Op.getOperand(1);
7639 std::pair<SDValue, SDValue> Pair =
7640 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
7641
7642 // chain = MSRR(chain, sysregname, lo, hi)
7643 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
7644 SysRegName, Pair.first, Pair.second);
7645
7646 return Result;
7647 }
7648 case ISD::FSHL:
7649 case ISD::FSHR:
7650 return LowerFunnelShift(Op, DAG);
7651 case ISD::FLDEXP:
7652 return LowerFLDEXP(Op, DAG);
7653 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
7654 return LowerVECTOR_HISTOGRAM(Op, DAG);
7655 }
7656}
7657
7658 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
7659 return !Subtarget->useSVEForFixedLengthVectors();
7660}
7661
7662 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
7663 EVT VT, bool OverrideNEON) const {
7664 if (!VT.isFixedLengthVector() || !VT.isSimple())
7665 return false;
7666
7667 // Don't use SVE for vectors we cannot scalarize if required.
7668 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7669 // Fixed length predicates should be promoted to i8.
7670 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
7671 case MVT::i1:
7672 default:
7673 return false;
7674 case MVT::i8:
7675 case MVT::i16:
7676 case MVT::i32:
7677 case MVT::i64:
7678 case MVT::f16:
7679 case MVT::f32:
7680 case MVT::f64:
7681 break;
7682 }
7683
7684 // NEON-sized vectors can be emulated using SVE instructions.
7685 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7686 return Subtarget->isSVEorStreamingSVEAvailable();
7687
7688 // Ensure NEON MVTs only belong to a single register class.
7689 if (VT.getFixedSizeInBits() <= 128)
7690 return false;
7691
7692 // Ensure wider than NEON code generation is enabled.
7693 if (!Subtarget->useSVEForFixedLengthVectors())
7694 return false;
7695
7696 // Don't use SVE for types that don't fit.
7697 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7698 return false;
7699
7700 // TODO: Perhaps an artificial restriction, but worth having whilst getting
7701 // the base fixed length SVE support in place.
7702 if (!VT.isPow2VectorType())
7703 return false;
7704
7705 return true;
7706}
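// Editorial note (not part of the upstream source): as a worked example of the
// checks above, a fixed-length v8i32 (256 bits) is only mapped onto SVE when
// useSVEForFixedLengthVectors() is true and the guaranteed minimum SVE register
// size is at least 256 bits; anything that fits in 128 bits or less stays on
// NEON unless OverrideNEON requests SVE/streaming-SVE emulation.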
7707
7708//===----------------------------------------------------------------------===//
7709// Calling Convention Implementation
7710//===----------------------------------------------------------------------===//
7711
7712static unsigned getIntrinsicID(const SDNode *N) {
7713 unsigned Opcode = N->getOpcode();
7714 switch (Opcode) {
7715 default:
7716 return Intrinsic::not_intrinsic;
7717 case ISD::INTRINSIC_WO_CHAIN: {
7718 unsigned IID = N->getConstantOperandVal(0);
7719 if (IID < Intrinsic::num_intrinsics)
7720 return IID;
7721 return Intrinsic::not_intrinsic;
7722 }
7723 }
7724}
7725
7726 bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
7727 SDValue N1) const {
7728 if (!N0.hasOneUse())
7729 return false;
7730
7731 unsigned IID = getIntrinsicID(N1.getNode());
7732 // Avoid reassociating expressions that can be lowered to smlal/umlal.
7733 if (IID == Intrinsic::aarch64_neon_umull ||
7734 N1.getOpcode() == AArch64ISD::UMULL ||
7735 IID == Intrinsic::aarch64_neon_smull ||
7736 N1.getOpcode() == AArch64ISD::SMULL)
7737 return N0.getOpcode() != ISD::ADD;
7738
7739 return true;
7740}
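// Editorial note: for illustration, with N1 = (smull a, b) and N0 = (add x, y),
// the hook above returns false so that the (add x, (smull a, b)) shape survives
// for SMLAL/UMLAL selection; when N0 is not an ADD (or N1 is not a [su]mull),
// reassociation remains profitable provided N0 has a single use.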
7741
7742/// Selects the correct CCAssignFn for a given CallingConvention value.
7743 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
7744 bool IsVarArg) const {
7745 switch (CC) {
7746 default:
7747 report_fatal_error("Unsupported calling convention.");
7748 case CallingConv::GHC:
7749 return CC_AArch64_GHC;
7750 case CallingConv::PreserveNone:
7751 // The VarArg implementation makes assumptions about register
7752 // argument passing that do not hold for preserve_none, so we
7753 // instead fall back to C argument passing.
7754 // The non-vararg case is handled in the CC function itself.
7755 if (!IsVarArg)
7756 return CC_AArch64_Preserve_None;
7757 [[fallthrough]];
7758 case CallingConv::C:
7759 case CallingConv::Fast:
7760 case CallingConv::PreserveMost:
7761 case CallingConv::PreserveAll:
7762 case CallingConv::CXX_FAST_TLS:
7763 case CallingConv::Swift:
7764 case CallingConv::SwiftTail:
7765 case CallingConv::Tail:
7766 case CallingConv::GRAAL:
7767 if (Subtarget->isTargetWindows()) {
7768 if (IsVarArg) {
7769 if (Subtarget->isWindowsArm64EC())
7770 return CC_AArch64_Arm64EC_VarArg;
7771 return CC_AArch64_Win64_VarArg;
7772 }
7773 return CC_AArch64_Win64PCS;
7774 }
7775 if (!Subtarget->isTargetDarwin())
7776 return CC_AArch64_AAPCS;
7777 if (!IsVarArg)
7778 return CC_AArch64_DarwinPCS;
7779 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
7780 : CC_AArch64_DarwinPCS_VarArg;
7781 case CallingConv::Win64:
7782 if (IsVarArg) {
7783 if (Subtarget->isWindowsArm64EC())
7784 return CC_AArch64_Arm64EC_VarArg;
7785 return CC_AArch64_Win64_VarArg;
7786 }
7787 return CC_AArch64_Win64PCS;
7788 case CallingConv::CFGuard_Check:
7789 if (Subtarget->isWindowsArm64EC())
7790 return CC_AArch64_Arm64EC_CFGuard_Check;
7791 return CC_AArch64_Win64_CFGuard_Check;
7792 case CallingConv::AArch64_VectorCall:
7793 case CallingConv::AArch64_SVE_VectorCall:
7794 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
7795 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1:
7796 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
7797 return CC_AArch64_AAPCS;
7798 case CallingConv::ARM64EC_Thunk_X64:
7799 return CC_AArch64_Arm64EC_Thunk;
7800 case CallingConv::ARM64EC_Thunk_Native:
7801 return CC_AArch64_Arm64EC_Thunk_Native;
7802 }
7803}
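// Editorial sketch of how the table above is consulted (the call sites are
// assumed, not shown here): a variadic CallingConv::C call lowered for Darwin
// asks CCAssignFnForCall(CallingConv::C, /*IsVarArg=*/true) and receives one of
// the Darwin vararg assigners, while the same call on a Linux target keeps
// CC_AArch64_AAPCS because AAPCS passes fixed and variadic arguments the same
// way.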
7804
7805 CCAssignFn *
7806 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
7807 switch (CC) {
7808 default:
7809 return RetCC_AArch64_AAPCS;
7810 case CallingConv::ARM64EC_Thunk_X64:
7811 return RetCC_AArch64_Arm64EC_Thunk;
7812 case CallingConv::CFGuard_Check:
7813 if (Subtarget->isWindowsArm64EC())
7814 return RetCC_AArch64_Arm64EC_CFGuard_Check;
7815 return RetCC_AArch64_AAPCS;
7816 }
7817}
7818
7819static bool isPassedInFPR(EVT VT) {
7820 return VT.isFixedLengthVector() ||
7821 (VT.isFloatingPoint() && !VT.isScalableVector());
7822}
7823
7824SDValue AArch64TargetLowering::LowerFormalArguments(
7825 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
7826 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
7827 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
7828 MachineFunction &MF = DAG.getMachineFunction();
7829 const Function &F = MF.getFunction();
7830 MachineFrameInfo &MFI = MF.getFrameInfo();
7831 bool IsWin64 =
7832 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
7833 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
7834 (isVarArg && Subtarget->isWindowsArm64EC());
7835 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7836
7837 SmallVector<ISD::OutputArg, 4> Outs;
7838 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
7839 DAG.getTargetLoweringInfo(), MF.getDataLayout());
7840 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
7841 FuncInfo->setIsSVECC(true);
7842
7843 // Assign locations to all of the incoming arguments.
7844 SmallVector<CCValAssign, 16> ArgLocs;
7845 DenseMap<unsigned, SDValue> CopiedRegs;
7846 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
7847
7848 // At this point, Ins[].VT may already be promoted to i32. To correctly
7849 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
7850 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
7851 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
7852 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
7853 // LocVT.
7854 unsigned NumArgs = Ins.size();
7855 Function::const_arg_iterator CurOrigArg = F.arg_begin();
7856 unsigned CurArgIdx = 0;
7857 for (unsigned i = 0; i != NumArgs; ++i) {
7858 MVT ValVT = Ins[i].VT;
7859 if (Ins[i].isOrigArg()) {
7860 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
7861 CurArgIdx = Ins[i].getOrigArgIndex();
7862
7863 // Get type of the original argument.
7864 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
7865 /*AllowUnknown*/ true);
7866 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
7867 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7868 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7869 ValVT = MVT::i8;
7870 else if (ActualMVT == MVT::i16)
7871 ValVT = MVT::i16;
7872 }
7873 bool UseVarArgCC = false;
7874 if (IsWin64)
7875 UseVarArgCC = isVarArg;
7876 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
7877 bool Res =
7878 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
7879 assert(!Res && "Call operand has unhandled type");
7880 (void)Res;
7881 }
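// Worked example of the ValVT adjustment above: a source-level i8 parameter
// usually arrives with Ins[i].VT already promoted to i32; resetting ValVT to
// MVT::i8 lets the assignment function record an i8 location, so a stack-passed
// byte is stored and reloaded as a single byte rather than as a full i32 slot.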
7882
7883 SMEAttrs Attrs(MF.getFunction());
7884 bool IsLocallyStreaming =
7885 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
7886 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
7887 SDValue Glue = Chain.getValue(1);
7888
7889 SmallVector<SDValue, 16> ArgValues;
7890 unsigned ExtraArgLocs = 0;
7891 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
7892 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7893
7894 if (Ins[i].Flags.isByVal()) {
7895 // Byval is used for HFAs in the PCS, but the system should work in a
7896 // non-compliant manner for larger structs.
7897 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7898 int Size = Ins[i].Flags.getByValSize();
7899 unsigned NumRegs = (Size + 7) / 8;
7900
7901 // FIXME: This works on big-endian for composite byvals, which are the common
7902 // case. It should also work for fundamental types too.
7903 unsigned FrameIdx =
7904 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
7905 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
7906 InVals.push_back(FrameIdxN);
7907
7908 continue;
7909 }
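// Example of the byval rounding above: a 12-byte struct gives
// NumRegs = (12 + 7) / 8 = 2, so a 16-byte fixed object is created at the
// argument's stack offset and the function body receives a pointer to it.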
7910
7911 if (Ins[i].Flags.isSwiftAsync())
7912 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
7913 SDValue ArgValue;
7914 SDValue ArgValue;
7915 if (VA.isRegLoc()) {
7916 // Arguments stored in registers.
7917 EVT RegVT = VA.getLocVT();
7918 const TargetRegisterClass *RC;
7919
7920 if (RegVT == MVT::i32)
7921 RC = &AArch64::GPR32RegClass;
7922 else if (RegVT == MVT::i64)
7923 RC = &AArch64::GPR64RegClass;
7924 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
7925 RC = &AArch64::FPR16RegClass;
7926 else if (RegVT == MVT::f32)
7927 RC = &AArch64::FPR32RegClass;
7928 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
7929 RC = &AArch64::FPR64RegClass;
7930 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
7931 RC = &AArch64::FPR128RegClass;
7932 else if (RegVT.isScalableVector() &&
7933 RegVT.getVectorElementType() == MVT::i1) {
7934 FuncInfo->setIsSVECC(true);
7935 RC = &AArch64::PPRRegClass;
7936 } else if (RegVT == MVT::aarch64svcount) {
7937 FuncInfo->setIsSVECC(true);
7938 RC = &AArch64::PPRRegClass;
7939 } else if (RegVT.isScalableVector()) {
7940 FuncInfo->setIsSVECC(true);
7941 RC = &AArch64::ZPRRegClass;
7942 } else
7943 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
7944
7945 // Transform the arguments in physical registers into virtual ones.
7946 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
7947
7948 if (IsLocallyStreaming) {
7949 // LocallyStreamingFunctions must insert the SMSTART in the correct
7950 // position, so we use Glue to ensure no instructions can be scheduled
7951 // between the chain of:
7952 // t0: ch,glue = EntryNode
7953 // t1: res,ch,glue = CopyFromReg
7954 // ...
7955 // tn: res,ch,glue = CopyFromReg t(n-1), ..
7956 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
7957 // ^^^^^^
7958 // This will be the new Chain/Root node.
7959 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
7960 Glue = ArgValue.getValue(2);
7961 if (isPassedInFPR(ArgValue.getValueType())) {
7962 ArgValue =
7963 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
7964 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
7965 {ArgValue, Glue});
7966 Glue = ArgValue.getValue(1);
7967 }
7968 } else
7969 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
7970
7971 // If this is an 8, 16 or 32-bit value, it is really passed promoted
7972 // to 64 bits. Insert an assert[sz]ext to capture this, then
7973 // truncate to the right size.
7974 switch (VA.getLocInfo()) {
7975 default:
7976 llvm_unreachable("Unknown loc info!");
7977 case CCValAssign::Full:
7978 break;
7979 case CCValAssign::Indirect:
7980 assert(
7981 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7982 "Indirect arguments should be scalable on most subtargets");
7983 break;
7984 case CCValAssign::BCvt:
7985 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
7986 break;
7987 case CCValAssign::AExt:
7988 case CCValAssign::SExt:
7989 case CCValAssign::ZExt:
7990 break;
7991 case CCValAssign::AExtUpper:
7992 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
7993 DAG.getConstant(32, DL, RegVT));
7994 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
7995 break;
7996 }
7997 } else { // VA.isRegLoc()
7998 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7999 unsigned ArgOffset = VA.getLocMemOffset();
8000 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8001 ? VA.getLocVT().getSizeInBits()
8002 : VA.getValVT().getSizeInBits()) / 8;
8003
8004 uint32_t BEAlign = 0;
8005 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8006 !Ins[i].Flags.isInConsecutiveRegs())
8007 BEAlign = 8 - ArgSize;
8008
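// Big-endian example: a 4-byte argument sits in the high half of its 8-byte
// slot, so BEAlign = 8 - 4 = 4 moves the load to the bytes that actually hold
// the value; arguments split across consecutive registers are excluded because
// their in-memory layout already matches.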
8009 SDValue FIN;
8010 MachinePointerInfo PtrInfo;
8011 if (StackViaX4) {
8012 // In both the ARM64EC varargs convention and the thunk convention,
8013 // arguments on the stack are accessed relative to x4, not sp. In
8014 // the thunk convention, there's an additional offset of 32 bytes
8015 // to account for the shadow store.
8016 unsigned ObjOffset = ArgOffset + BEAlign;
8017 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8018 ObjOffset += 32;
8019 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8020 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8021 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8022 DAG.getConstant(ObjOffset, DL, MVT::i64));
8023 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
8024 } else {
8025 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8026
8027 // Create load nodes to retrieve arguments from the stack.
8028 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8029 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8030 }
8031
8032 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
8033 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
8034 MVT MemVT = VA.getValVT();
8035
8036 switch (VA.getLocInfo()) {
8037 default:
8038 break;
8039 case CCValAssign::Trunc:
8040 case CCValAssign::BCvt:
8041 MemVT = VA.getLocVT();
8042 break;
8043 case CCValAssign::Indirect:
8044 assert((VA.getValVT().isScalableVT() ||
8045 Subtarget->isWindowsArm64EC()) &&
8046 "Indirect arguments should be scalable on most subtargets");
8047 MemVT = VA.getLocVT();
8048 break;
8049 case CCValAssign::SExt:
8050 ExtType = ISD::SEXTLOAD;
8051 break;
8052 case CCValAssign::ZExt:
8053 ExtType = ISD::ZEXTLOAD;
8054 break;
8055 case CCValAssign::AExt:
8056 ExtType = ISD::EXTLOAD;
8057 break;
8058 }
8059
8060 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8061 MemVT);
8062 }
8063
8064 if (VA.getLocInfo() == CCValAssign::Indirect) {
8065 assert((VA.getValVT().isScalableVT() ||
8066 Subtarget->isWindowsArm64EC()) &&
8067 "Indirect arguments should be scalable on most subtargets");
8068
8069 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
8070 unsigned NumParts = 1;
8071 if (Ins[i].Flags.isInConsecutiveRegs()) {
8072 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8073 ++NumParts;
8074 }
8075
8076 MVT PartLoad = VA.getValVT();
8077 SDValue Ptr = ArgValue;
8078
8079 // Ensure we generate all loads for each tuple part, whilst updating the
8080 // pointer after each load correctly using vscale.
8081 while (NumParts > 0) {
8082 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8083 InVals.push_back(ArgValue);
8084 NumParts--;
8085 if (NumParts > 0) {
8086 SDValue BytesIncrement;
8087 if (PartLoad.isScalableVector()) {
8088 BytesIncrement = DAG.getVScale(
8089 DL, Ptr.getValueType(),
8090 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8091 } else {
8092 BytesIncrement = DAG.getConstant(
8093 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8094 Ptr.getValueType());
8095 }
8096 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8097 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8098 ExtraArgLocs++;
8099 i++;
8100 }
8101 }
8102 } else {
8103 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8104 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8105 ArgValue, DAG.getValueType(MVT::i32));
8106
8107 // i1 arguments are zero-extended to i8 by the caller. Emit a
8108 // hint to reflect this.
8109 if (Ins[i].isOrigArg()) {
8110 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
8111 if (OrigArg->getType()->isIntegerTy(1)) {
8112 if (!Ins[i].Flags.isZExt()) {
8113 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8114 ArgValue.getValueType(), ArgValue);
8115 }
8116 }
8117 }
8118
8119 InVals.push_back(ArgValue);
8120 }
8121 }
8122 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8123
8124 // Insert the SMSTART if this is a locally streaming function and
8125 // make sure it is Glued to the last CopyFromReg value.
8126 if (IsLocallyStreaming) {
8127 SDValue PStateSM;
8128 if (Attrs.hasStreamingCompatibleInterface()) {
8129 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
8130 Register Reg = MF.getRegInfo().createVirtualRegister(
8131 getRegClassFor(PStateSM.getValueType().getSimpleVT()));
8132 FuncInfo->setPStateSMReg(Reg);
8133 Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
8134 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8135 AArch64SME::IfCallerIsNonStreaming, PStateSM);
8136 } else
8137 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8138 AArch64SME::Always);
8139
8140 // Ensure that the SMSTART happens after the CopyWithChain such that its
8141 // chain result is used.
8142 for (unsigned I=0; I<InVals.size(); ++I) {
8143 Register Reg = MF.getRegInfo().createVirtualRegister(
8144 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8145 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
8146 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
8147 InVals[I].getValueType());
8148 }
8149 }
8150
8151 // varargs
8152 if (isVarArg) {
8153 if (!Subtarget->isTargetDarwin() || IsWin64) {
8154 // The AAPCS variadic function ABI is identical to the non-variadic
8155 // one. As a result there may be more arguments in registers and we should
8156 // save them for future reference.
8157 // Win64 variadic functions also pass arguments in registers, but all float
8158 // arguments are passed in integer registers.
8159 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8160 }
8161
8162 // This will point to the next argument passed via stack.
8163 unsigned VarArgsOffset = CCInfo.getStackSize();
8164 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
8165 VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
8166 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8167 FuncInfo->setVarArgsStackIndex(
8168 MFI.CreateFixedObject(4, VarArgsOffset, true));
8169
8170 if (MFI.hasMustTailInVarArgFunc()) {
8171 SmallVector<MVT, 2> RegParmTypes;
8172 RegParmTypes.push_back(MVT::i64);
8173 RegParmTypes.push_back(MVT::f128);
8174 // Compute the set of forwarded registers. The rest are scratch.
8175 SmallVectorImpl<ForwardedRegister> &Forwards =
8176 FuncInfo->getForwardedMustTailRegParms();
8177 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8178 CC_AArch64_AAPCS);
8179
8180 // Conservatively forward X8, since it might be used for aggregate return.
8181 if (!CCInfo.isAllocated(AArch64::X8)) {
8182 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
8183 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8184 }
8185 }
8186 }
8187
8188 // On Windows, InReg pointers must be returned, so record the pointer in a
8189 // virtual register at the start of the function so it can be returned in the
8190 // epilogue.
8191 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8192 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8193 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8194 Ins[I].Flags.isInReg()) &&
8195 Ins[I].Flags.isSRet()) {
8196 assert(!FuncInfo->getSRetReturnReg());
8197
8198 MVT PtrTy = getPointerTy(DAG.getDataLayout());
8199 Register Reg =
8200 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
8201 FuncInfo->setSRetReturnReg(Reg);
8202
8203 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
8204 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
8205 break;
8206 }
8207 }
8208 }
8209
8210 unsigned StackArgSize = CCInfo.getStackSize();
8211 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8212 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
8213 // This is a non-standard ABI so by fiat I say we're allowed to make full
8214 // use of the stack area to be popped, which must be aligned to 16 bytes in
8215 // any case:
8216 StackArgSize = alignTo(StackArgSize, 16);
8217
8218 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
8219 // a multiple of 16.
8220 FuncInfo->setArgumentStackToRestore(StackArgSize);
8221
8222 // This realignment carries over to the available bytes below. Our own
8223 // callers will guarantee the space is free by giving an aligned value to
8224 // CALLSEQ_START.
8225 }
8226 // Even if we're not expected to free up the space, it's useful to know how
8227 // much is there while considering tail calls (because we can reuse it).
8228 FuncInfo->setBytesInStackArgArea(StackArgSize);
8229
8230 if (Subtarget->hasCustomCallingConv())
8231 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
8232
8233 // Create a 16 Byte TPIDR2 object. The dynamic buffer
8234 // will be expanded and stored in the static object later using a pseudonode.
8235 if (SMEAttrs(MF.getFunction()).hasZAState()) {
8236 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8237 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
8238 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8239 DAG.getConstant(1, DL, MVT::i32));
8240
8241 SDValue Buffer;
8242 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8243 Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
8244 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
8245 } else {
8246 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8247 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
8248 DAG.getVTList(MVT::i64, MVT::Other),
8249 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8250 MFI.CreateVariableSizedObject(Align(16), nullptr);
8251 }
8252 Chain = DAG.getNode(
8253 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
8254 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)});
8255 } else if (SMEAttrs(MF.getFunction()).hasAgnosticZAInterface()) {
8256 // Call __arm_sme_state_size().
8257 SDValue BufferSize =
8259 DAG.getVTList(MVT::i64, MVT::Other), Chain);
8260 Chain = BufferSize.getValue(1);
8261
8262 SDValue Buffer;
8263 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8264 Buffer =
8266 DAG.getVTList(MVT::i64, MVT::Other), {Chain, BufferSize});
8267 } else {
8268 // Allocate space dynamically.
8269 Buffer = DAG.getNode(
8270 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8271 {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
8272 MFI.CreateVariableSizedObject(Align(16), nullptr);
8273 }
8274
8275 // Copy the value to a virtual register, and save that in FuncInfo.
8276 Register BufferPtr =
8277 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8278 FuncInfo->setSMESaveBufferAddr(BufferPtr);
8279 Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
8280 }
8281
8282 if (CallConv == CallingConv::PreserveNone) {
8283 for (const ISD::InputArg &I : Ins) {
8284 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
8285 I.Flags.isSwiftAsync()) {
8288 MF.getFunction(),
8289 "Swift attributes can't be used with preserve_none",
8290 DL.getDebugLoc()));
8291 break;
8292 }
8293 }
8294 }
8295
8296 return Chain;
8297}
8298
8299void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
8300 SelectionDAG &DAG,
8301 const SDLoc &DL,
8302 SDValue &Chain) const {
8303 MachineFunction &MF = DAG.getMachineFunction();
8304 MachineFrameInfo &MFI = MF.getFrameInfo();
8305 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8306 auto PtrVT = getPointerTy(DAG.getDataLayout());
8307 Function &F = MF.getFunction();
8308 bool IsWin64 =
8309 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8310
8311 SmallVector<SDValue, 8> MemOps;
8312
8313 auto GPRArgRegs = AArch64::getGPRArgRegs();
8314 unsigned NumGPRArgRegs = GPRArgRegs.size();
8315 if (Subtarget->isWindowsArm64EC()) {
8316 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
8317 // functions.
8318 NumGPRArgRegs = 4;
8319 }
8320 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
8321
8322 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
8323 int GPRIdx = 0;
8324 if (GPRSaveSize != 0) {
8325 if (IsWin64) {
8326 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
8327 if (GPRSaveSize & 15)
8328 // The extra size here, if triggered, will always be 8.
8329 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
8330 } else
8331 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
8332
8333 SDValue FIN;
8334 if (Subtarget->isWindowsArm64EC()) {
8335 // With the Arm64EC ABI, we reserve the save area as usual, but we
8336 // compute its address relative to x4. For a normal AArch64->AArch64
8337 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
8338 // different address.
8339 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8340 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8341 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
8342 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
8343 } else {
8344 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
8345 }
8346
8347 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
8348 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
8349 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8350 SDValue Store =
8351 DAG.getStore(Val.getValue(1), DL, Val, FIN,
8352 IsWin64 ? MachinePointerInfo::getFixedStack(
8353 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
8354 : MachinePointerInfo::getStack(MF, i * 8));
8355 MemOps.push_back(Store);
8356 FIN =
8357 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
8358 }
8359 }
8360 FuncInfo->setVarArgsGPRIndex(GPRIdx);
8361 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
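// Sizing example for the GPR save area above: with three named integer
// parameters each taking one register, FirstVariadicGPR is 3 and
// GPRSaveSize = 8 * (8 - 3) = 40 bytes; on Win64 the extra 8-byte fixed object
// noted above pads the area to a 16-byte boundary.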
8362
8363 if (Subtarget->hasFPARMv8() && !IsWin64) {
8364 auto FPRArgRegs = AArch64::getFPRArgRegs();
8365 const unsigned NumFPRArgRegs = FPRArgRegs.size();
8366 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
8367
8368 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
8369 int FPRIdx = 0;
8370 if (FPRSaveSize != 0) {
8371 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
8372
8373 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
8374
8375 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
8376 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
8377 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
8378
8379 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
8380 MachinePointerInfo::getStack(MF, i * 16));
8381 MemOps.push_back(Store);
8382 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
8383 DAG.getConstant(16, DL, PtrVT));
8384 }
8385 }
8386 FuncInfo->setVarArgsFPRIndex(FPRIdx);
8387 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
8388 }
8389
8390 if (!MemOps.empty()) {
8391 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
8392 }
8393}
8394
8395/// LowerCallResult - Lower the result values of a call into the
8396/// appropriate copies out of appropriate physical registers.
8397SDValue AArch64TargetLowering::LowerCallResult(
8398 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
8399 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
8400 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
8401 SDValue ThisVal, bool RequiresSMChange) const {
8402 DenseMap<unsigned, SDValue> CopiedRegs;
8403 // Copy all of the result registers out of their specified physreg.
8404 for (unsigned i = 0; i != RVLocs.size(); ++i) {
8405 CCValAssign VA = RVLocs[i];
8406
8407 // Pass 'this' value directly from the argument to return value, to avoid
8408 // reg unit interference
8409 if (i == 0 && isThisReturn) {
8410 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
8411 "unexpected return calling convention register assignment");
8412 InVals.push_back(ThisVal);
8413 continue;
8414 }
8415
8416 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
8417 // allows one use of a physreg per block.
8418 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
8419 if (!Val) {
8420 Val =
8421 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
8422 Chain = Val.getValue(1);
8423 InGlue = Val.getValue(2);
8424 CopiedRegs[VA.getLocReg()] = Val;
8425 }
8426
8427 switch (VA.getLocInfo()) {
8428 default:
8429 llvm_unreachable("Unknown loc info!");
8430 case CCValAssign::Full:
8431 break;
8432 case CCValAssign::BCvt:
8433 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
8434 break;
8435 case CCValAssign::AExtUpper:
8436 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
8437 DAG.getConstant(32, DL, VA.getLocVT()));
8438 [[fallthrough]];
8439 case CCValAssign::AExt:
8440 [[fallthrough]];
8441 case CCValAssign::ZExt:
8442 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
8443 break;
8444 }
8445
8446 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
8447 Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, Val.getValueType(),
8448 Val);
8449
8450 InVals.push_back(Val);
8451 }
8452
8453 return Chain;
8454}
8455
8456/// Return true if the calling convention is one that we can guarantee TCO for.
8457static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
8458 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
8459 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
8460}
8461
8462/// Return true if we might ever do TCO for calls with this calling convention.
8463 static bool mayTailCallThisCC(CallingConv::ID CC) {
8464 switch (CC) {
8465 case CallingConv::C:
8466 case CallingConv::AArch64_SVE_VectorCall:
8467 case CallingConv::PreserveMost:
8468 case CallingConv::PreserveAll:
8469 case CallingConv::PreserveNone:
8470 case CallingConv::Swift:
8471 case CallingConv::SwiftTail:
8472 case CallingConv::Tail:
8473 case CallingConv::Fast:
8474 return true;
8475 default:
8476 return false;
8477 }
8478}
8479
8480/// Return true if the call convention supports varargs
8481/// Currently only those that pass varargs like the C
8482/// calling convention does are eligible
8483/// Calling conventions listed in this function must also
8484/// be properly handled in AArch64Subtarget::isCallingConvWin64
8485 static bool callConvSupportsVarArgs(CallingConv::ID CC) {
8486 switch (CC) {
8487 case CallingConv::C:
8489 return true;
8490 default:
8491 return false;
8492 }
8493}
8494
8495 static void analyzeCallOperands(const AArch64TargetLowering &TLI,
8496 const AArch64Subtarget *Subtarget,
8497 const TargetLowering::CallLoweringInfo &CLI,
8498 CCState &CCInfo) {
8499 const SelectionDAG &DAG = CLI.DAG;
8500 CallingConv::ID CalleeCC = CLI.CallConv;
8501 bool IsVarArg = CLI.IsVarArg;
8502 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8503 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
8504
8505 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
8506 // for the shadow store.
8507 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
8508 CCInfo.AllocateStack(32, Align(16));
8509
8510 unsigned NumArgs = Outs.size();
8511 for (unsigned i = 0; i != NumArgs; ++i) {
8512 MVT ArgVT = Outs[i].VT;
8513 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
8514
8515 bool UseVarArgCC = false;
8516 if (IsVarArg) {
8517 // On Windows, the fixed arguments in a vararg call are passed in GPRs
8518 // too, so use the vararg CC to force them to integer registers.
8519 if (IsCalleeWin64) {
8520 UseVarArgCC = true;
8521 } else {
8522 UseVarArgCC = !Outs[i].IsFixed;
8523 }
8524 }
8525
8526 if (!UseVarArgCC) {
8527 // Get type of the original argument.
8528 EVT ActualVT =
8529 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
8530 /*AllowUnknown*/ true);
8531 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
8532 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8533 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8534 ArgVT = MVT::i8;
8535 else if (ActualMVT == MVT::i16)
8536 ArgVT = MVT::i16;
8537 }
8538
8539 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
8540 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
8541 assert(!Res && "Call operand has unhandled type");
8542 (void)Res;
8543 }
8544}
8545
8546bool AArch64TargetLowering::isEligibleForTailCallOptimization(
8547 const CallLoweringInfo &CLI) const {
8548 CallingConv::ID CalleeCC = CLI.CallConv;
8549 if (!mayTailCallThisCC(CalleeCC))
8550 return false;
8551
8552 SDValue Callee = CLI.Callee;
8553 bool IsVarArg = CLI.IsVarArg;
8554 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8555 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8556 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8557 const SelectionDAG &DAG = CLI.DAG;
8558 MachineFunction &MF = DAG.getMachineFunction();
8559 const Function &CallerF = MF.getFunction();
8560 CallingConv::ID CallerCC = CallerF.getCallingConv();
8561
8562 // SME Streaming functions are not eligible for TCO as they may require
8563 // the streaming mode or ZA to be restored after returning from the call.
8564 SMEAttrs CallerAttrs(MF.getFunction());
8565 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
8566 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
8567 CallerAttrs.requiresLazySave(CalleeAttrs) ||
8568 CallerAttrs.requiresPreservingAllZAState(CalleeAttrs) ||
8569 CallerAttrs.hasStreamingBody())
8570 return false;
8571
8572 // Functions using the C or Fast calling convention that have an SVE signature
8573 // preserve more registers and should assume the SVE_VectorCall CC.
8574 // The check for matching callee-saved regs will determine whether it is
8575 // eligible for TCO.
8576 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
8577 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
8578 CalleeCC = CallingConv::AArch64_SVE_VectorCall;
8579
8580 bool CCMatch = CallerCC == CalleeCC;
8581
8582 // When using the Windows calling convention on a non-windows OS, we want
8583 // to back up and restore X18 in such functions; we can't do a tail call
8584 // from those functions.
8585 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
8586 CalleeCC != CallingConv::Win64)
8587 return false;
8588
8589 // Byval parameters hand the function a pointer directly into the stack area
8590 // we want to reuse during a tail call. Working around this *is* possible (see
8591 // X86) but less efficient and uglier in LowerCall.
8592 for (Function::const_arg_iterator i = CallerF.arg_begin(),
8593 e = CallerF.arg_end();
8594 i != e; ++i) {
8595 if (i->hasByValAttr())
8596 return false;
8597
8598 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
8599 // In this case, it is necessary to save/restore X0 in the callee. Tail
8600 // call opt interferes with this. So we disable tail call opt when the
8601 // caller has an argument with "inreg" attribute.
8602
8603 // FIXME: Check whether the callee also has an "inreg" argument.
8604 if (i->hasInRegAttr())
8605 return false;
8606 }
8607
8608 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
8609 return CCMatch;
8610
8611 // Externally-defined functions with weak linkage should not be
8612 // tail-called on AArch64 when the OS does not support dynamic
8613 // pre-emption of symbols, as the AAELF spec requires normal calls
8614 // to undefined weak functions to be replaced with a NOP or jump to the
8615 // next instruction. The behaviour of branch instructions in this
8616 // situation (as used for tail calls) is implementation-defined, so we
8617 // cannot rely on the linker replacing the tail call with a return.
8618 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8619 const GlobalValue *GV = G->getGlobal();
8620 const Triple &TT = getTargetMachine().getTargetTriple();
8621 if (GV->hasExternalWeakLinkage() &&
8622 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
8623 return false;
8624 }
8625
8626 // Now we search for cases where we can use a tail call without changing the
8627 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
8628 // concept.
8629
8630 // I want anyone implementing a new calling convention to think long and hard
8631 // about this assert.
8632 if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
8633 report_fatal_error("Unsupported variadic calling convention");
8634
8635 LLVMContext &C = *DAG.getContext();
8636 // Check that the call results are passed in the same way.
8637 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
8638 CCAssignFnForCall(CalleeCC, IsVarArg),
8639 CCAssignFnForCall(CallerCC, IsVarArg)))
8640 return false;
8641 // The callee has to preserve all registers the caller needs to preserve.
8642 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8643 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
8644 if (!CCMatch) {
8645 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
8646 if (Subtarget->hasCustomCallingConv()) {
8647 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
8648 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
8649 }
8650 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
8651 return false;
8652 }
8653
8654 // Nothing more to check if the callee is taking no arguments
8655 if (Outs.empty())
8656 return true;
8657
8658 SmallVector<CCValAssign, 16> ArgLocs;
8659 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
8660
8661 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
8662
8663 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
8664 // When we are musttail, additional checks have been done and we can safely ignore this check
8665 // At least two cases here: if caller is fastcc then we can't have any
8666 // memory arguments (we'd be expected to clean up the stack afterwards). If
8667 // caller is C then we could potentially use its argument area.
8668
8669 // FIXME: for now we take the most conservative of these in both cases:
8670 // disallow all variadic memory operands.
8671 for (const CCValAssign &ArgLoc : ArgLocs)
8672 if (!ArgLoc.isRegLoc())
8673 return false;
8674 }
8675
8676 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8677
8678 // If any of the arguments is passed indirectly, it must be SVE, so the
8679 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
8680 // allocate space on the stack. That is why we determine this explicitly here
8681 // the call cannot be a tailcall.
8682 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
8683 assert((A.getLocInfo() != CCValAssign::Indirect ||
8684 A.getValVT().isScalableVector() ||
8685 Subtarget->isWindowsArm64EC()) &&
8686 "Expected value to be scalable");
8687 return A.getLocInfo() == CCValAssign::Indirect;
8688 }))
8689 return false;
8690
8691 // If the stack arguments for this call do not fit into our own save area then
8692 // the call cannot be made tail.
8693 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
8694 return false;
8695
8696 const MachineRegisterInfo &MRI = MF.getRegInfo();
8697 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
8698 return false;
8699
8700 return true;
8701}
8702
8703SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
8704 SelectionDAG &DAG,
8705 MachineFrameInfo &MFI,
8706 int ClobberedFI) const {
8707 SmallVector<SDValue, 8> ArgChains;
8708 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
8709 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
8710
8711 // Include the original chain at the beginning of the list. When this is
8712 // used by target LowerCall hooks, this helps legalize find the
8713 // CALLSEQ_BEGIN node.
8714 ArgChains.push_back(Chain);
8715
8716 // Add a chain value for each stack argument corresponding
8717 for (SDNode *U : DAG.getEntryNode().getNode()->users())
8718 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
8719 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
8720 if (FI->getIndex() < 0) {
8721 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
8722 int64_t InLastByte = InFirstByte;
8723 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
8724
8725 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
8726 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
8727 ArgChains.push_back(SDValue(L, 1));
8728 }
8729
8730 // Build a tokenfactor for all the chains.
8731 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
8732}
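// Overlap-check illustration for the loop above: if the clobbered fixed object
// spans bytes [-16, -9] of the incoming argument area and an existing load
// reads a fixed object spanning [-12, -5], the ranges intersect, so the load's
// chain joins the TokenFactor and the load is ordered before the store that
// will overwrite the slot.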
8733
8734bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
8735 bool TailCallOpt) const {
8736 return (CallCC == CallingConv::Fast && TailCallOpt) ||
8737 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
8738}
8739
8740// Check if the value is zero-extended from i1 to i8
8741static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
8742 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
8743 if (SizeInBits < 8)
8744 return false;
8745
8746 APInt RequredZero(SizeInBits, 0xFE);
8747 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
8748 bool ZExtBool = (Bits.Zero & RequredZero) == RequredZero;
8749 return ZExtBool;
8750}
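// Mask example: for an i8 argument, RequredZero is 0xFE (bits 1..7); if
// computeKnownBits proves those bits are zero, the value is already a valid
// zero-extended bool and the caller can skip re-emitting the trunc + zext
// sequence the AAPCS would otherwise require for i1 arguments.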
8751
8752// The FORM_TRANSPOSED_REG_TUPLE pseudo should only be used if the
8753// input operands are copy nodes where the source register is in a
8754// StridedOrContiguous class. For example:
8755//
8756// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
8757// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
8758// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
8759// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
8760// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
8761// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
8762// %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
8763//
8764 static bool shouldUseFormStridedPseudo(MachineInstr &MI) {
8765 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
8766
8767 const TargetRegisterClass *RegClass = nullptr;
8768 switch (MI.getOpcode()) {
8769 case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO:
8770 RegClass = &AArch64::ZPR2StridedOrContiguousRegClass;
8771 break;
8772 case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO:
8773 RegClass = &AArch64::ZPR4StridedOrContiguousRegClass;
8774 break;
8775 default:
8776 llvm_unreachable("Unexpected opcode.");
8777 }
8778
8779 unsigned SubReg = AArch64::NoSubRegister;
8780 for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
8781 MachineOperand &MO = MI.getOperand(I);
8782 assert(MO.isReg() && "Unexpected operand to FORM_TRANSPOSED_REG_TUPLE");
8783
8784 MachineOperand *Def = MRI.getOneDef(MO.getReg());
8785 if (!Def || !Def->getParent()->isCopy())
8786 return false;
8787
8788 const MachineOperand &CopySrc = Def->getParent()->getOperand(1);
8789 unsigned OpSubReg = CopySrc.getSubReg();
8790 if (SubReg == AArch64::NoSubRegister)
8791 SubReg = OpSubReg;
8792
8793 MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg());
8794 if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg ||
8795 MRI.getRegClass(CopySrcOp->getReg()) != RegClass)
8796 return false;
8797 }
8798
8799 return true;
8800}
8801
8802void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
8803 SDNode *Node) const {
8804 // Live-in physreg copies that are glued to SMSTART are applied as
8805 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
8806 // register allocator to pass call args in callee saved regs, without extra
8807 // copies to avoid these fake clobbers of actually-preserved GPRs.
8808 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
8809 MI.getOpcode() == AArch64::MSRpstatePseudo) {
8810 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
8811 if (MachineOperand &MO = MI.getOperand(I);
8812 MO.isReg() && MO.isImplicit() && MO.isDef() &&
8813 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
8814 AArch64::GPR64RegClass.contains(MO.getReg())))
8815 MI.removeOperand(I);
8816
8817 // The SVE vector length can change when entering/leaving streaming mode.
8818 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
8819 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
8820 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
8821 /*IsImplicit=*/true));
8822 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
8823 /*IsImplicit=*/true));
8824 }
8825 }
8826
8827 if (MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
8828 MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
8829 // If input values to the FORM_TRANSPOSED_REG_TUPLE pseudo aren't copies
8830 // from a StridedOrContiguous class, fall back on REG_SEQUENCE node.
8831 if (!shouldUseFormStridedPseudo(MI))
8832 return;
8833
8834 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
8835 MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
8836 TII->get(TargetOpcode::REG_SEQUENCE),
8837 MI.getOperand(0).getReg());
8838
8839 for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
8840 MIB.add(MI.getOperand(I));
8841 MIB.addImm(AArch64::zsub0 + (I - 1));
8842 }
8843
8844 MI.eraseFromParent();
8845 return;
8846 }
8847
8848 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
8849 // have nothing to do with VG, were it not that they are used to materialise a
8850 // frame-address. If they contain a frame-index to a scalable vector, this
8851 // will likely require an ADDVL instruction to materialise the address, thus
8852 // reading VG.
8853 const MachineFunction &MF = *MI.getMF();
8854 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
8855 (MI.getOpcode() == AArch64::ADDXri ||
8856 MI.getOpcode() == AArch64::SUBXri)) {
8857 const MachineOperand &MO = MI.getOperand(1);
8858 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
8859 TargetStackID::ScalableVector)
8860 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
8861 /*IsImplicit=*/true));
8862 }
8863}
8864
8865 SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
8866 bool Enable, SDValue Chain,
8867 SDValue InGlue,
8868 unsigned Condition,
8869 SDValue PStateSM) const {
8870 MachineFunction &MF = DAG.getMachineFunction();
8871 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8872 FuncInfo->setHasStreamingModeChanges(true);
8873
8874 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8875 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
8876 SDValue MSROp =
8877 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
8878 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
8879 SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
8880 if (Condition != AArch64SME::Always) {
8881 assert(PStateSM && "PStateSM should be defined");
8882 Ops.push_back(PStateSM);
8883 }
8884 Ops.push_back(RegMask);
8885
8886 if (InGlue)
8887 Ops.push_back(InGlue);
8888
8889 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
8890 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
8891}
8892
8893// Emit a call to __arm_sme_save or __arm_sme_restore.
8895 SelectionDAG &DAG,
8897 SDValue Chain, bool IsSave) {
8900 FuncInfo->setSMESaveBufferUsed();
8901
8904 Entry.Ty = PointerType::getUnqual(*DAG.getContext());
8905 Entry.Node =
8906 DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64);
8907 Args.push_back(Entry);
8908
8909 SDValue Callee =
8910 DAG.getExternalSymbol(IsSave ? "__arm_sme_save" : "__arm_sme_restore",
8911 TLI.getPointerTy(DAG.getDataLayout()));
8912 auto *RetTy = Type::getVoidTy(*DAG.getContext());
8914 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8916 Callee, std::move(Args));
8917 return TLI.LowerCallTo(CLI).second;
8918}
8919
8920static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
8921 const SMEAttrs &CalleeAttrs) {
8922 if (!CallerAttrs.hasStreamingCompatibleInterface() ||
8923 CallerAttrs.hasStreamingBody())
8924 return AArch64SME::Always;
8925 if (CalleeAttrs.hasNonStreamingInterface())
8926 return AArch64SME::IfCallerIsStreaming;
8927 if (CalleeAttrs.hasStreamingInterface())
8928 return AArch64SME::IfCallerIsNonStreaming;
8929
8930 llvm_unreachable("Unsupported attributes");
8931}
8932
8933/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
8934/// and add input and output parameter nodes.
8935SDValue
8936AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
8937 SmallVectorImpl<SDValue> &InVals) const {
8938 SelectionDAG &DAG = CLI.DAG;
8939 SDLoc &DL = CLI.DL;
8940 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8941 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8942 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8943 SDValue Chain = CLI.Chain;
8944 SDValue Callee = CLI.Callee;
8945 bool &IsTailCall = CLI.IsTailCall;
8946 CallingConv::ID &CallConv = CLI.CallConv;
8947 bool IsVarArg = CLI.IsVarArg;
8948
8951 bool IsThisReturn = false;
8952
8954 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8955 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
8956 bool IsSibCall = false;
8957 bool GuardWithBTI = false;
8958
8959 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
8960 !Subtarget->noBTIAtReturnTwice()) {
8961 GuardWithBTI = FuncInfo->branchTargetEnforcement();
8962 }
8963
8964 // Analyze operands of the call, assigning locations to each operand.
8965 SmallVector<CCValAssign, 16> ArgLocs;
8966 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
8967
8968 if (IsVarArg) {
8969 unsigned NumArgs = Outs.size();
8970
8971 for (unsigned i = 0; i != NumArgs; ++i) {
8972 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
8973 report_fatal_error("Passing SVE types to variadic functions is "
8974 "currently not supported");
8975 }
8976 }
8977
8978 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
8979
8980 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8981 // Assign locations to each value returned by this call.
8982 SmallVector<CCValAssign, 16> RVLocs;
8983 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
8984 *DAG.getContext());
8985 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
8986
8987 // Check callee args/returns for SVE registers and set calling convention
8988 // accordingly.
8989 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
8990 auto HasSVERegLoc = [](CCValAssign &Loc) {
8991 if (!Loc.isRegLoc())
8992 return false;
8993 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
8994 AArch64::PPRRegClass.contains(Loc.getLocReg());
8995 };
8996 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
8997 CallConv = CallingConv::AArch64_SVE_VectorCall;
8998 }
8999
9000 if (IsTailCall) {
9001 // Check if it's really possible to do a tail call.
9002 IsTailCall = isEligibleForTailCallOptimization(CLI);
9003
9004 // A sibling call is one where we're under the usual C ABI and not planning
9005 // to change that but can still do a tail call:
9006 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
9007 CallConv != CallingConv::SwiftTail)
9008 IsSibCall = true;
9009
9010 if (IsTailCall)
9011 ++NumTailCalls;
9012 }
9013
9014 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9015 report_fatal_error("failed to perform tail call elimination on a call "
9016 "site marked musttail");
9017
9018 // Get a count of how many bytes are to be pushed on the stack.
9019 unsigned NumBytes = CCInfo.getStackSize();
9020
9021 if (IsSibCall) {
9022 // Since we're not changing the ABI to make this a tail call, the memory
9023 // operands are already available in the caller's incoming argument space.
9024 NumBytes = 0;
9025 }
9026
9027 // FPDiff is the byte offset of the call's argument area from the callee's.
9028 // Stores to callee stack arguments will be placed in FixedStackSlots offset
9029 // by this amount for a tail call. In a sibling call it must be 0 because the
9030 // caller will deallocate the entire stack and the callee still expects its
9031 // arguments to begin at SP+0. Completely unused for non-tail calls.
9032 int FPDiff = 0;
9033
9034 if (IsTailCall && !IsSibCall) {
9035 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9036
9037 // Since callee will pop argument stack as a tail call, we must keep the
9038 // popped size 16-byte aligned.
9039 NumBytes = alignTo(NumBytes, 16);
9040
9041 // FPDiff will be negative if this tail call requires more space than we
9042 // would automatically have in our incoming argument space. Positive if we
9043 // can actually shrink the stack.
9044 FPDiff = NumReusableBytes - NumBytes;
9045
9046 // Update the required reserved area if this is the tail call requiring the
9047 // most argument stack space.
9048 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9049 FuncInfo->setTailCallReservedStack(-FPDiff);
9050
9051 // The stack pointer must be 16-byte aligned at all times it's used for a
9052 // memory operation, which in practice means at *all* times and in
9053 // particular across call boundaries. Therefore our own arguments started at
9054 // a 16-byte aligned SP and the delta applied for the tail call should
9055 // satisfy the same constraint.
9056 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9057 }
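// FPDiff example: if the caller reserved 32 bytes of incoming argument space
// but this tail call needs 48 bytes of stack arguments, FPDiff = 32 - 48 = -16,
// so 16 extra bytes are recorded via setTailCallReservedStack and the delta
// stays a multiple of 16, satisfying the alignment assertion above.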
9058
9059 // Determine whether we need any streaming mode changes.
9060 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
9061 if (CLI.CB)
9062 CalleeAttrs = SMEAttrs(*CLI.CB);
9063 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9064 CalleeAttrs = SMEAttrs(ES->getSymbol());
9065
9066 auto DescribeCallsite =
9067 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
9068 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9069 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9070 R << ore::NV("Callee", ES->getSymbol());
9071 else if (CLI.CB && CLI.CB->getCalledFunction())
9072 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9073 else
9074 R << "unknown callee";
9075 R << "'";
9076 return R;
9077 };
9078
9079 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
9080 bool RequiresSaveAllZA =
9081 CallerAttrs.requiresPreservingAllZAState(CalleeAttrs);
9082 if (RequiresLazySave) {
9083 const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9084 MachinePointerInfo MPI =
9086 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
9087 TPIDR2.FrameIndex,
9089 SDValue NumZaSaveSlicesAddr =
9090 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
9091 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
9092 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
9093 DAG.getConstant(1, DL, MVT::i32));
9094 Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
9095 MPI, MVT::i16);
9096 Chain = DAG.getNode(
9097 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
9098 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9099 TPIDR2ObjAddr);
9100 OptimizationRemarkEmitter ORE(&MF.getFunction());
9101 ORE.emit([&]() {
9102 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9103 CLI.CB)
9104 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9105 &MF.getFunction());
9106 return DescribeCallsite(R) << " sets up a lazy save for ZA";
9107 });
9108 } else if (RequiresSaveAllZA) {
9109 assert(!CalleeAttrs.hasSharedZAInterface() &&
9110 "Cannot share state that may not exist");
9111 Chain = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain,
9112 /*IsSave=*/true);
9113 }
9114
9115 SDValue PStateSM;
9116 bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
9117 if (RequiresSMChange) {
9118 if (CallerAttrs.hasStreamingInterfaceOrBody())
9119 PStateSM = DAG.getConstant(1, DL, MVT::i64);
9120 else if (CallerAttrs.hasNonStreamingInterface())
9121 PStateSM = DAG.getConstant(0, DL, MVT::i64);
9122 else
9123 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
9124 OptimizationRemarkEmitter ORE(&MF.getFunction());
9125 ORE.emit([&]() {
9126 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9127 CLI.CB)
9128 : OptimizationRemarkAnalysis("sme", "SMETransition",
9129 &MF.getFunction());
9130 DescribeCallsite(R) << " requires a streaming mode transition";
9131 return R;
9132 });
9133 }
9134
9135 SDValue ZTFrameIdx;
9136 MachineFrameInfo &MFI = MF.getFrameInfo();
9137 bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs);
9138
9139 // If the caller has ZT0 state which will not be preserved by the callee,
9140 // spill ZT0 before the call.
9141 if (ShouldPreserveZT0) {
9142 unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
9143 ZTFrameIdx = DAG.getFrameIndex(
9144 ZTObj,
9146
9147 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
9148 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9149 }
9150
9151 // If caller shares ZT0 but the callee is not shared ZA, we need to stop
9152 // PSTATE.ZA before the call if there is no lazy-save active.
9153 bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs);
9154 assert((!DisableZA || !RequiresLazySave) &&
9155 "Lazy-save should have PSTATE.SM=1 on entry to the function");
9156
9157 if (DisableZA)
9158 Chain = DAG.getNode(
9159 AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
9160 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
9161 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
9162
9163 // Adjust the stack pointer for the new arguments...
9164 // These operations are automatically eliminated by the prolog/epilog pass
9165 if (!IsSibCall)
9166 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
9167
9168 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
9169 getPointerTy(DAG.getDataLayout()));
9170
9171 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
9172 SmallSet<unsigned, 8> RegsUsed;
9173 SmallVector<SDValue, 8> MemOpChains;
9174 auto PtrVT = getPointerTy(DAG.getDataLayout());
9175
9176 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
9177 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
9178 for (const auto &F : Forwards) {
9179 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
9180 RegsToPass.emplace_back(F.PReg, Val);
9181 }
9182 }
9183
9184 // Walk the register/memloc assignments, inserting copies/loads.
9185 unsigned ExtraArgLocs = 0;
9186 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
9187 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
9188 SDValue Arg = OutVals[i];
9189 ISD::ArgFlagsTy Flags = Outs[i].Flags;
9190
9191 // Promote the value if needed.
9192 switch (VA.getLocInfo()) {
9193 default:
9194 llvm_unreachable("Unknown loc info!");
9195 case CCValAssign::Full:
9196 break;
9197 case CCValAssign::SExt:
9198 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
9199 break;
9200 case CCValAssign::ZExt:
9201 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9202 break;
9203 case CCValAssign::AExt:
9204 if (Outs[i].ArgVT == MVT::i1) {
9205 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
9206 //
9207 // Check if we actually have to do this, because the value may
9208 // already be zero-extended.
9209 //
9210 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
9211 // and rely on DAGCombiner to fold this, because the following
9212 // (anyext i32) is combined with (zext i8) in DAG.getNode:
9213 //
9214 // (ext (zext x)) -> (zext x)
9215 //
9216 // This will give us (zext i32), which we cannot remove, so
9217 // try to check this beforehand.
9218 if (!checkZExtBool(Arg, DAG)) {
9219 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9220 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
9221 }
9222 }
9223 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9224 break;
9225 case CCValAssign::AExtUpper:
9226 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9227 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9228 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9229 DAG.getConstant(32, DL, VA.getLocVT()));
9230 break;
9231 case CCValAssign::BCvt:
9232 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
9233 break;
9234 case CCValAssign::Trunc:
9235 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9236 break;
9237 case CCValAssign::FPExt:
9238 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
9239 break;
9240 case CCValAssign::Indirect:
9241 bool isScalable = VA.getValVT().isScalableVT();
9242 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
9243 "Indirect arguments should be scalable on most subtargets");
9244
9245 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
9246 uint64_t PartSize = StoreSize;
9247 unsigned NumParts = 1;
9248 if (Outs[i].Flags.isInConsecutiveRegs()) {
9249 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
9250 ++NumParts;
9251 StoreSize *= NumParts;
9252 }
9253
9254 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
9255 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
9256 MachineFrameInfo &MFI = MF.getFrameInfo();
9257 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
9258 if (isScalable)
9259 MFI.setStackID(FI, TargetStackID::ScalableVector);
9260
9261 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
9262 SDValue Ptr = DAG.getFrameIndex(
9263 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9264 SDValue SpillSlot = Ptr;
9265
9266 // Ensure we generate all stores for each tuple part, whilst updating the
9267 // pointer after each store correctly using vscale.
9268 while (NumParts) {
9269 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
9270 MemOpChains.push_back(Store);
9271
9272 NumParts--;
9273 if (NumParts > 0) {
9274 SDValue BytesIncrement;
9275 if (isScalable) {
9276 BytesIncrement = DAG.getVScale(
9277 DL, Ptr.getValueType(),
9278 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
9279 } else {
9280 BytesIncrement = DAG.getConstant(
9281 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
9282 Ptr.getValueType());
9283 }
9284 MPI = MachinePointerInfo(MPI.getAddrSpace());
9285 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9286 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
9287 ExtraArgLocs++;
9288 i++;
9289 }
9290 }
9291
9292 Arg = SpillSlot;
9293 break;
9294 }
9295
9296 if (VA.isRegLoc()) {
9297 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
9298 Outs[0].VT == MVT::i64) {
9299 assert(VA.getLocVT() == MVT::i64 &&
9300 "unexpected calling convention register assignment");
9301 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
9302 "unexpected use of 'returned'");
9303 IsThisReturn = true;
9304 }
9305 if (RegsUsed.count(VA.getLocReg())) {
9306 // If this register has already been used then we're trying to pack
9307 // parts of an [N x i32] into an X-register. The extension type will
9308 // take care of putting the two halves in the right place but we have to
9309 // combine them.
9310 SDValue &Bits =
9311 llvm::find_if(RegsToPass,
9312 [=](const std::pair<unsigned, SDValue> &Elt) {
9313 return Elt.first == VA.getLocReg();
9314 })
9315 ->second;
9316 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9317 // Call site info is used for function's parameter entry value
9318 // tracking. For now we track only simple cases when parameter
9319 // is transferred through whole register.
9320 llvm::erase_if(CSInfo.ArgRegPairs,
9321 [&VA](MachineFunction::ArgRegPair ArgReg) {
9322 return ArgReg.Reg == VA.getLocReg();
9323 });
9324 } else {
9325 // Add an extra level of indirection for streaming mode changes by
9326 // using a pseudo copy node that the simple register coalescer cannot
9327 // rematerialise between a smstart/smstop and the call.
9328 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
9329 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9330 Arg.getValueType(), Arg);
9331 RegsToPass.emplace_back(VA.getLocReg(), Arg);
9332 RegsUsed.insert(VA.getLocReg());
9333 const TargetOptions &Options = DAG.getTarget().Options;
9334 if (Options.EmitCallSiteInfo)
9335 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
9336 }
9337 } else {
9338 assert(VA.isMemLoc());
9339
9340 SDValue DstAddr;
9341 MachinePointerInfo DstInfo;
9342
9343 // FIXME: This works on big-endian for composite byvals, which are the
9344 // common case. It should also work for fundamental types.
9345 uint32_t BEAlign = 0;
9346 unsigned OpSize;
9347 if (VA.getLocInfo() == CCValAssign::Indirect ||
9348 VA.getValVT().isScalableVector())
9349 OpSize = VA.getLocVT().getFixedSizeInBits();
9350 else
9351 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
9352 : VA.getValVT().getSizeInBits();
9353 OpSize = (OpSize + 7) / 8;
9354 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
9355 !Flags.isInConsecutiveRegs()) {
9356 if (OpSize < 8)
9357 BEAlign = 8 - OpSize;
9358 }
9359 unsigned LocMemOffset = VA.getLocMemOffset();
9360 int32_t Offset = LocMemOffset + BEAlign;
9361 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
9362 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
9363
9364 if (IsTailCall) {
9365 Offset = Offset + FPDiff;
9366 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
9367
9368 DstAddr = DAG.getFrameIndex(FI, PtrVT);
9369 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
9370
9371 // Make sure any stack arguments overlapping with where we're storing
9372 // are loaded before this eventual operation. Otherwise they'll be
9373 // clobbered.
9374 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
9375 } else {
9376 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
9377
9378 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
9379 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
9380 }
9381
9382 if (Outs[i].Flags.isByVal()) {
9383 SDValue SizeNode =
9384 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
9385 SDValue Cpy = DAG.getMemcpy(
9386 Chain, DL, DstAddr, Arg, SizeNode,
9387 Outs[i].Flags.getNonZeroByValAlign(),
9388 /*isVol = */ false, /*AlwaysInline = */ false,
9389 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
9390
9391 MemOpChains.push_back(Cpy);
9392 } else {
9393 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
9394 // promoted to a legal register type i32, we should truncate Arg back to
9395 // i1/i8/i16.
9396 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
9397 VA.getValVT() == MVT::i16)
9398 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
9399
9400 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
9401 MemOpChains.push_back(Store);
9402 }
9403 }
9404 }
9405
9406 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
9407 SDValue ParamPtr = StackPtr;
9408 if (IsTailCall) {
9409 // Create a dummy object at the top of the stack that can be used to get
9410 // the SP after the epilogue
9411 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
9412 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
9413 }
9414
9415 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
9416 // describing the argument list. x4 contains the address of the
9417 // first stack parameter. x5 contains the size in bytes of all parameters
9418 // passed on the stack.
9419 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
9420 RegsToPass.emplace_back(AArch64::X5,
9421 DAG.getConstant(NumBytes, DL, MVT::i64));
9422 }
9423
9424 if (!MemOpChains.empty())
9425 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
9426
9427 SDValue InGlue;
9428 if (RequiresSMChange) {
9429 if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
9430 Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL,
9431 DAG.getVTList(MVT::Other, MVT::Glue), Chain);
9432 InGlue = Chain.getValue(1);
9433 }
9434
9435 SDValue NewChain = changeStreamingMode(
9436 DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
9437 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
9438 Chain = NewChain.getValue(0);
9439 InGlue = NewChain.getValue(1);
9440 }
9441
9442 // Build a sequence of copy-to-reg nodes chained together with token chain
9443 // and flag operands which copy the outgoing args into the appropriate regs.
9444 for (auto &RegToPass : RegsToPass) {
9445 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
9446 RegToPass.second, InGlue);
9447 InGlue = Chain.getValue(1);
9448 }
9449
9450 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
9451 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
9452 // node so that legalize doesn't hack it.
9453 const GlobalValue *CalledGlobal = nullptr;
9454 unsigned OpFlags = 0;
9455 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9456 CalledGlobal = G->getGlobal();
9457 OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
9458 getTargetMachine());
9459 if (OpFlags & AArch64II::MO_GOT) {
9460 Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
9461 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9462 } else {
9463 const GlobalValue *GV = G->getGlobal();
9464 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
9465 }
9466 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
9467 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
9468 Subtarget->isTargetMachO()) ||
9469 MF.getFunction().getParent()->getRtLibUseGOT();
9470 const char *Sym = S->getSymbol();
9471 if (UseGot) {
9472 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
9473 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9474 } else {
9475 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
9476 }
9477 }
9478
9479 // We don't usually want to end the call-sequence here because we would tidy
9480 // the frame up *after* the call, however in the ABI-changing tail-call case
9481 // we've carefully laid out the parameters so that when sp is reset they'll be
9482 // in the correct location.
9483 if (IsTailCall && !IsSibCall) {
9484 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
9485 InGlue = Chain.getValue(1);
9486 }
9487
9488 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
9489
9490 std::vector<SDValue> Ops;
9491 Ops.push_back(Chain);
9492 Ops.push_back(Callee);
9493
9494 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
9495 // be expanded to the call, directly followed by a special marker sequence and
9496 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
9497 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
9498 assert(!IsTailCall &&
9499 "tail calls cannot be marked with clang.arc.attachedcall");
9501
9502 // Add a target global address for the retainRV/claimRV runtime function
9503 // just before the call target.
9504 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
9505 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
9506 Ops.insert(Ops.begin() + 1, GA);
9507 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9508 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
9509 } else if (GuardWithBTI) {
9510 Opc = AArch64ISD::CALL_BTI;
9511 }
9512
9513 if (IsTailCall) {
9514 // Each tail call may have to adjust the stack by a different amount, so
9515 // this information must travel along with the operation for eventual
9516 // consumption by emitEpilogue.
9517 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
9518 }
9519
9520 if (CLI.PAI) {
9521 const uint64_t Key = CLI.PAI->Key;
9522 assert((Key == AArch64PACKey::IA || Key == AArch64PACKey::IB) &&
9523 "Invalid auth call key");
9524
9525 // Split the discriminator into address/integer components.
9526 SDValue AddrDisc, IntDisc;
9527 std::tie(IntDisc, AddrDisc) =
9528 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
9529
9530 if (Opc == AArch64ISD::CALL_RVMARKER)
9531 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
9532 else
9533 Opc = AArch64ISD::AUTH_CALL;
9534 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
9535 Ops.push_back(IntDisc);
9536 Ops.push_back(AddrDisc);
9537 }
9538
9539 // Add argument registers to the end of the list so that they are known live
9540 // into the call.
9541 for (auto &RegToPass : RegsToPass)
9542 Ops.push_back(DAG.getRegister(RegToPass.first,
9543 RegToPass.second.getValueType()));
9544
9545 // Add a register mask operand representing the call-preserved registers.
9546 const uint32_t *Mask;
9547 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9548 if (IsThisReturn) {
9549 // For 'this' returns, use the X0-preserving mask if applicable
9550 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
9551 if (!Mask) {
9552 IsThisReturn = false;
9553 Mask = TRI->getCallPreservedMask(MF, CallConv);
9554 }
9555 } else
9556 Mask = TRI->getCallPreservedMask(MF, CallConv);
9557
9558 if (Subtarget->hasCustomCallingConv())
9559 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
9560
9561 if (TRI->isAnyArgRegReserved(MF))
9562 TRI->emitReservedArgRegCallError(MF);
9563
9564 assert(Mask && "Missing call preserved mask for calling convention");
9565 Ops.push_back(DAG.getRegisterMask(Mask));
9566
9567 if (InGlue.getNode())
9568 Ops.push_back(InGlue);
9569
9570 // If we're doing a tail call, use a TC_RETURN here rather than an
9571 // actual call instruction.
9572 if (IsTailCall) {
9573 MF.getFrameInfo().setHasTailCall();
9574 SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
9575 if (IsCFICall)
9576 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9577
9578 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
9579 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
9580 if (CalledGlobal)
9581 DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
9582 return Ret;
9583 }
9584
9585 // Returns a chain and a flag for retval copy to use.
9586 Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
9587 if (IsCFICall)
9588 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9589
9590 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
9591 InGlue = Chain.getValue(1);
9592 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
9593 if (CalledGlobal)
9594 DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
9595
9596 uint64_t CalleePopBytes =
9597 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
9598
9599 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
9600 InGlue = Chain.getValue(1);
9601
9602 // Handle result values, copying them out of physregs into vregs that we
9603 // return.
9604 SDValue Result = LowerCallResult(
9605 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
9606 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
9607
9608 if (!Ins.empty())
9609 InGlue = Result.getValue(Result->getNumValues() - 1);
9610
9611 if (RequiresSMChange) {
9612 assert(PStateSM && "Expected a PStateSM to be set");
9613 Result = changeStreamingMode(
9614 DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
9615 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
9616
9617 if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
9618 InGlue = Result.getValue(1);
9619 Result =
9620 DAG.getNode(AArch64ISD::VG_RESTORE, DL,
9621 DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue});
9622 }
9623 }
9624
9625 if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
9626 // Unconditionally resume ZA.
9627 Result = DAG.getNode(
9628 AArch64ISD::SMSTART, DL, MVT::Other, Result,
9629 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
9630 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
9631
9632 if (ShouldPreserveZT0)
9633 Result =
9634 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
9635 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9636
9637 if (RequiresLazySave) {
9638 // Conditionally restore the lazy save using a pseudo node.
9639 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9640 SDValue RegMask = DAG.getRegisterMask(
9641 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
9642 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
9643 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
9644 SDValue TPIDR2_EL0 = DAG.getNode(
9645 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
9646 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
9647
9648 // Copy the address of the TPIDR2 block into X0 before 'calling' the
9649 // RESTORE_ZA pseudo.
9650 SDValue Glue;
9651 SDValue TPIDR2Block = DAG.getFrameIndex(
9652 TPIDR2.FrameIndex,
9653 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9654 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
9655 Result =
9656 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
9657 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
9658 RestoreRoutine, RegMask, Result.getValue(1)});
9659
9660 // Finally reset the TPIDR2_EL0 register to 0.
9661 Result = DAG.getNode(
9662 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
9663 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9664 DAG.getConstant(0, DL, MVT::i64));
9665 TPIDR2.Uses++;
9666 } else if (RequiresSaveAllZA) {
9667 Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
9668 /*IsSave=*/false);
9669 }
9670
9671 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
9672 RequiresSaveAllZA) {
9673 for (unsigned I = 0; I < InVals.size(); ++I) {
9674 // The smstart/smstop is chained as part of the call, but when the
9675 // resulting chain is discarded (which happens when the call is not part
9676 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
9677 // smstart/smstop is chained to the result value. We can do that by doing
9678 // a vreg -> vreg copy.
9679 Register Reg = MF.getRegInfo().createVirtualRegister(
9680 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
9681 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
9682 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
9683 InVals[I].getValueType());
9684 }
9685 }
9686
9687 if (CallConv == CallingConv::PreserveNone) {
9688 for (const ISD::OutputArg &O : Outs) {
9689 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
9690 O.Flags.isSwiftAsync()) {
9691 MachineFunction &MF = DAG.getMachineFunction();
9692 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9693 MF.getFunction(),
9694 "Swift attributes can't be used with preserve_none",
9695 DL.getDebugLoc()));
9696 break;
9697 }
9698 }
9699 }
9700
9701 return Result;
9702}
9703
9704bool AArch64TargetLowering::CanLowerReturn(
9705 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
9706 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
9707 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9708 SmallVector<CCValAssign, 16> RVLocs;
9709 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
9710 return CCInfo.CheckReturn(Outs, RetCC);
9711}
9712
9713SDValue
9714AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
9715 bool isVarArg,
9716 const SmallVectorImpl<ISD::OutputArg> &Outs,
9717 const SmallVectorImpl<SDValue> &OutVals,
9718 const SDLoc &DL, SelectionDAG &DAG) const {
9719 auto &MF = DAG.getMachineFunction();
9720 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9721
9722 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9723 SmallVector<CCValAssign, 16> RVLocs;
9724 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
9725 CCInfo.AnalyzeReturn(Outs, RetCC);
9726
9727 // Copy the result values into the output registers.
9728 SDValue Glue;
9729 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
9730 SmallSet<unsigned, 4> RegsUsed;
9731 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
9732 ++i, ++realRVLocIdx) {
9733 CCValAssign &VA = RVLocs[i];
9734 assert(VA.isRegLoc() && "Can only return in registers!");
9735 SDValue Arg = OutVals[realRVLocIdx];
9736
9737 switch (VA.getLocInfo()) {
9738 default:
9739 llvm_unreachable("Unknown loc info!");
9740 case CCValAssign::Full:
9741 if (Outs[i].ArgVT == MVT::i1) {
9742 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
9743 // value. This is strictly redundant on Darwin (which uses "zeroext
9744 // i1"), but will be optimised out before ISel.
9745 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9746 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9747 }
9748 break;
9749 case CCValAssign::BCvt:
9750 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
9751 break;
9752 case CCValAssign::AExt:
9753 case CCValAssign::ZExt:
9754 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9755 break;
9756 case CCValAssign::AExtUpper:
9757 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9758 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9759 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9760 DAG.getConstant(32, DL, VA.getLocVT()));
9761 break;
9762 }
9763
9764 if (RegsUsed.count(VA.getLocReg())) {
9765 SDValue &Bits =
9766 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
9767 return Elt.first == VA.getLocReg();
9768 })->second;
9769 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9770 } else {
9771 RetVals.emplace_back(VA.getLocReg(), Arg);
9772 RegsUsed.insert(VA.getLocReg());
9773 }
9774 }
9775
9776 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9777
9778 // Emit SMSTOP before returning from a locally streaming function
9779 SMEAttrs FuncAttrs(MF.getFunction());
9780 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
9781 if (FuncAttrs.hasStreamingCompatibleInterface()) {
9782 Register Reg = FuncInfo->getPStateSMReg();
9783 assert(Reg.isValid() && "PStateSM Register is invalid");
9784 SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
9785 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9786 /*Glue*/ SDValue(),
9787 AArch64SME::IfCallerIsNonStreaming, PStateSM);
9788 } else
9789 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9790 /*Glue*/ SDValue(), AArch64SME::Always);
9791 Glue = Chain.getValue(1);
9792 }
9793
9794 SmallVector<SDValue, 4> RetOps(1, Chain);
9795 for (auto &RetVal : RetVals) {
9796 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
9797 isPassedInFPR(RetVal.second.getValueType()))
9798 RetVal.second = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9799 RetVal.second.getValueType(), RetVal.second);
9800 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
9801 Glue = Chain.getValue(1);
9802 RetOps.push_back(
9803 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
9804 }
9805
9806 // Windows AArch64 ABIs require that for returning structs by value we copy
9807 // the sret argument into X0 for the return.
9808 // We saved the argument into a virtual register in the entry block,
9809 // so now we copy the value out and into X0.
9810 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
9811 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
9812 getPointerTy(MF.getDataLayout()));
9813
9814 unsigned RetValReg = AArch64::X0;
9815 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
9816 RetValReg = AArch64::X8;
9817 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
9818 Glue = Chain.getValue(1);
9819
9820 RetOps.push_back(
9821 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
9822 }
9823
9824 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
9825 if (I) {
9826 for (; *I; ++I) {
9827 if (AArch64::GPR64RegClass.contains(*I))
9828 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
9829 else if (AArch64::FPR64RegClass.contains(*I))
9830 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
9831 else
9832 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
9833 }
9834 }
9835
9836 RetOps[0] = Chain; // Update chain.
9837
9838 // Add the glue if we have it.
9839 if (Glue.getNode())
9840 RetOps.push_back(Glue);
9841
9842 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9843 // ARM64EC entry thunks use a special return sequence: instead of a regular
9844 // "ret" instruction, they need to explicitly call the emulator.
9845 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9846 SDValue Arm64ECRetDest =
9847 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
9848 Arm64ECRetDest =
9849 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
9850 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
9851 MachinePointerInfo());
9852 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
9853 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
9854 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
9855 }
9856
9857 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
9858}
9859
9860//===----------------------------------------------------------------------===//
9861// Other Lowering Code
9862//===----------------------------------------------------------------------===//
9863
9864SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
9865 SelectionDAG &DAG,
9866 unsigned Flag) const {
9867 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
9868 N->getOffset(), Flag);
9869}
9870
9871SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
9872 SelectionDAG &DAG,
9873 unsigned Flag) const {
9874 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
9875}
9876
9877SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
9878 SelectionDAG &DAG,
9879 unsigned Flag) const {
9880 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
9881 N->getOffset(), Flag);
9882}
9883
9884SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
9885 SelectionDAG &DAG,
9886 unsigned Flag) const {
9887 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
9888}
9889
9890SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
9891 SelectionDAG &DAG,
9892 unsigned Flag) const {
9893 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
9894}
9895
9896// (loadGOT sym)
9897template <class NodeTy>
9898SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
9899 unsigned Flags) const {
9900 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
9901 SDLoc DL(N);
9902 EVT Ty = getPointerTy(DAG.getDataLayout());
9903 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
9904 // FIXME: Once remat is capable of dealing with instructions with register
9905 // operands, expand this into two nodes instead of using a wrapper node.
9906 if (DAG.getMachineFunction()
9907 .getInfo<AArch64FunctionInfo>()
9908 ->hasELFSignedGOT())
9909 return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
9910 0);
9911 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
9912}
9913
9914// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
9915template <class NodeTy>
9916SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
9917 unsigned Flags) const {
9918 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
9919 SDLoc DL(N);
9920 EVT Ty = getPointerTy(DAG.getDataLayout());
9921 const unsigned char MO_NC = AArch64II::MO_NC;
9922 return DAG.getNode(
9923 AArch64ISD::WrapperLarge, DL, Ty,
9924 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
9925 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
9926 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
9927 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
9928}
9929
9930// (addlow (adrp %hi(sym)) %lo(sym))
9931template <class NodeTy>
9932SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
9933 unsigned Flags) const {
9934 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
9935 SDLoc DL(N);
9936 EVT Ty = getPointerTy(DAG.getDataLayout());
9937 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
9938 SDValue Lo = getTargetNode(N, Ty, DAG,
9939 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
9940 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
9941 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
9942}
9943
9944// (adr sym)
9945template <class NodeTy>
9946SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
9947 unsigned Flags) const {
9948 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
9949 SDLoc DL(N);
9950 EVT Ty = getPointerTy(DAG.getDataLayout());
9951 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
9952 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
9953}
9954
9955SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
9956 SelectionDAG &DAG) const {
9957 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
9958 const GlobalValue *GV = GN->getGlobal();
9959 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
9960
9961 if (OpFlags != AArch64II::MO_NO_FLAG)
9962 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
9963 "unexpected offset in global node");
9964
9965 // This also catches the large code model case for Darwin, and tiny code
9966 // model with got relocations.
9967 if ((OpFlags & AArch64II::MO_GOT) != 0) {
9968 return getGOT(GN, DAG, OpFlags);
9969 }
9970
9971 SDValue Result;
9972 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9973 !getTargetMachine().isPositionIndependent()) {
9974 Result = getAddrLarge(GN, DAG, OpFlags);
9975 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
9976 Result = getAddrTiny(GN, DAG, OpFlags);
9977 } else {
9978 Result = getAddr(GN, DAG, OpFlags);
9979 }
9980 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9981 SDLoc DL(GN);
9982 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
9983 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
9984 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
9985 return Result;
9986}
9987
9988/// Convert a TLS address reference into the correct sequence of loads
9989/// and calls to compute the variable's address (for Darwin, currently) and
9990/// return an SDValue containing the final node.
9991
9992/// Darwin only has one TLS scheme which must be capable of dealing with the
9993/// fully general situation, in the worst case. This means:
9994/// + "extern __thread" declaration.
9995/// + Defined in a possibly unknown dynamic library.
9996///
9997/// The general system is that each __thread variable has a [3 x i64] descriptor
9998/// which contains information used by the runtime to calculate the address. The
9999/// only part of this the compiler needs to know about is the first xword, which
10000/// contains a function pointer that must be called with the address of the
10001/// entire descriptor in "x0".
10002///
10003/// Since this descriptor may be in a different unit, in general even the
10004/// descriptor must be accessed via an indirect load. The "ideal" code sequence
10005/// is:
10006/// adrp x0, _var@TLVPPAGE
10007/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
10008/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
10009/// ; the function pointer
10010/// blr x1 ; Uses descriptor address in x0
10011/// ; Address of _var is now in x0.
10012///
10013/// If the address of _var's descriptor *is* known to the linker, then it can
10014/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10015/// a slight efficiency gain.
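// When the linker does know the descriptor's address, the relaxed form it may
// rewrite the above into is roughly (illustrative only):
//   adrp x0, _var@TLVPPAGE
//   add  x0, x0, _var@TLVPPAGEOFF   ; was: ldr x0, [x0, _var@TLVPPAGEOFF]
//   ldr  x1, [x0]
//   blr  x1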
10016SDValue
10017AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
10018 SelectionDAG &DAG) const {
10019 assert(Subtarget->isTargetDarwin() &&
10020 "This function expects a Darwin target");
10021
10022 SDLoc DL(Op);
10023 MVT PtrVT = getPointerTy(DAG.getDataLayout());
10024 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10025 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
10026
10027 SDValue TLVPAddr =
10028 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10029 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
10030
10031 // The first entry in the descriptor is a function pointer that we must call
10032 // to obtain the address of the variable.
10033 SDValue Chain = DAG.getEntryNode();
10034 SDValue FuncTLVGet = DAG.getLoad(
10035 PtrMemVT, DL, Chain, DescAddr,
10036 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
10037 Align(PtrMemVT.getSizeInBits() / 8),
10038 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
10039 Chain = FuncTLVGet.getValue(1);
10040
10041 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
10042 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
10043
10044 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10045 MFI.setAdjustsStack(true);
10046
10047 // TLS calls preserve all registers except those that absolutely must be
10048 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
10049 // silly).
10050 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10051 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
10052 if (Subtarget->hasCustomCallingConv())
10053 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10054
10055 // Finally, we can make the call. This is just a degenerate version of a
10056 // normal AArch64 call node: x0 takes the address of the descriptor, and
10057 // returns the address of the variable in this thread.
10058 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
10059
10060 unsigned Opcode = AArch64ISD::CALL;
10061 SmallVector<SDValue, 8> Ops;
10062 Ops.push_back(Chain);
10063 Ops.push_back(FuncTLVGet);
10064
10065 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
10066 if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
10067 Opcode = AArch64ISD::AUTH_CALL;
10068 Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
10069 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
10070 Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
10071 }
10072
10073 Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
10074 Ops.push_back(DAG.getRegisterMask(Mask));
10075 Ops.push_back(Chain.getValue(1));
10076 Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
10077 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
10078}
10079
10080/// Convert a thread-local variable reference into a sequence of instructions to
10081/// compute the variable's address for the local exec TLS model of ELF targets.
10082/// The sequence depends on the maximum TLS area size.
10083SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
10084 SDValue ThreadBase,
10085 const SDLoc &DL,
10086 SelectionDAG &DAG) const {
10087 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10088 SDValue TPOff, Addr;
10089
10090 switch (DAG.getTarget().Options.TLSSize) {
10091 default:
10092 llvm_unreachable("Unexpected TLS size");
10093
10094 case 12: {
10095 // mrs x0, TPIDR_EL0
10096 // add x0, x0, :tprel_lo12:a
10097 SDValue Var = DAG.getTargetGlobalAddress(
10098 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10099 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10100 Var,
10101 DAG.getTargetConstant(0, DL, MVT::i32)),
10102 0);
10103 }
10104
10105 case 24: {
10106 // mrs x0, TPIDR_EL0
10107 // add x0, x0, :tprel_hi12:a
10108 // add x0, x0, :tprel_lo12_nc:a
10109 SDValue HiVar = DAG.getTargetGlobalAddress(
10110 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10111 SDValue LoVar = DAG.getTargetGlobalAddress(
10112 GV, DL, PtrVT, 0,
10113 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10114 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10115 HiVar,
10116 DAG.getTargetConstant(0, DL, MVT::i32)),
10117 0);
10118 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
10119 LoVar,
10120 DAG.getTargetConstant(0, DL, MVT::i32)),
10121 0);
10122 }
10123
10124 case 32: {
10125 // mrs x1, TPIDR_EL0
10126 // movz x0, #:tprel_g1:a
10127 // movk x0, #:tprel_g0_nc:a
10128 // add x0, x1, x0
10129 SDValue HiVar = DAG.getTargetGlobalAddress(
10130 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
10131 SDValue LoVar = DAG.getTargetGlobalAddress(
10132 GV, DL, PtrVT, 0,
10133 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10134 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10135 DAG.getTargetConstant(16, DL, MVT::i32)),
10136 0);
10137 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10138 DAG.getTargetConstant(0, DL, MVT::i32)),
10139 0);
10140 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10141 }
10142
10143 case 48: {
10144 // mrs x1, TPIDR_EL0
10145 // movz x0, #:tprel_g2:a
10146 // movk x0, #:tprel_g1_nc:a
10147 // movk x0, #:tprel_g0_nc:a
10148 // add x0, x1, x0
10149 SDValue HiVar = DAG.getTargetGlobalAddress(
10150 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
10151 SDValue MiVar = DAG.getTargetGlobalAddress(
10152 GV, DL, PtrVT, 0,
10153 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
10154 SDValue LoVar = DAG.getTargetGlobalAddress(
10155 GV, DL, PtrVT, 0,
10156 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10157 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10158 DAG.getTargetConstant(32, DL, MVT::i32)),
10159 0);
10160 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
10161 DAG.getTargetConstant(16, DL, MVT::i32)),
10162 0);
10163 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10164 DAG.getTargetConstant(0, DL, MVT::i32)),
10165 0);
10166 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10167 }
10168 }
10169}
10170
10171/// When accessing thread-local variables under either the general-dynamic or
10172/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10173/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
10174/// is a function pointer to carry out the resolution.
10175///
10176/// The sequence is:
10177/// adrp x0, :tlsdesc:var
10178/// ldr x1, [x0, #:tlsdesc_lo12:var]
10179/// add x0, x0, #:tlsdesc_lo12:var
10180/// .tlsdesccall var
10181/// blr x1
10182/// (TPIDR_EL0 offset now in x0)
10183///
10184/// The above sequence must be produced unscheduled, to enable the linker to
10185/// optimize/relax this sequence.
10186/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10187/// above sequence, and expanded really late in the compilation flow, to ensure
10188/// the sequence is produced as per above.
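// A simplified sketch of how the result is consumed: the value returned in x0
// is an offset from TPIDR_EL0, so LowerELFGlobalTLSAddress below completes the
// access with roughly
//   mrs xN, TPIDR_EL0
//   add x0, xN, x0      ; plus any :dtprel: adds in the local-dynamic case
// Register names here are illustrative; allocation happens much later.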
10189SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10190 const SDLoc &DL,
10191 SelectionDAG &DAG) const {
10192 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10193
10194 SDValue Chain = DAG.getEntryNode();
10195 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10196
10197 unsigned Opcode =
10198 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
10199 ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
10200 : AArch64ISD::TLSDESC_CALLSEQ;
10201 Chain = DAG.getNode(Opcode, DL, NodeTys, {Chain, SymAddr});
10202 SDValue Glue = Chain.getValue(1);
10203
10204 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
10205}
10206
10207SDValue
10208AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
10209 SelectionDAG &DAG) const {
10210 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
10211
10212 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10213 AArch64FunctionInfo *MFI =
10214 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10215
10219
10220 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
10221 if (Model == TLSModel::LocalDynamic)
10222 Model = TLSModel::GeneralDynamic;
10223 }
10224
10225 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10226 Model != TLSModel::LocalExec)
10227 report_fatal_error("ELF TLS only supported in small memory model or "
10228 "in local exec TLS model");
10229 // Different choices can be made for the maximum size of the TLS area for a
10230 // module. For the small address model, the default TLS size is 16MiB and the
10231 // maximum TLS size is 4GiB.
10232 // FIXME: add tiny and large code model support for TLS access models other
10233 // than local exec. We currently generate the same code as small for tiny,
10234 // which may be larger than needed.
10235
10236 SDValue TPOff;
10237 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10238 SDLoc DL(Op);
10239 const GlobalValue *GV = GA->getGlobal();
10240
10241 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
10242
10243 if (Model == TLSModel::LocalExec) {
10244 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
10245 } else if (Model == TLSModel::InitialExec) {
10246 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10247 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
10248 } else if (Model == TLSModel::LocalDynamic) {
10249 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
10250 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
10251 // the beginning of the module's TLS region, followed by a DTPREL offset
10252 // calculation.
10253
10254 // These accesses will need deduplicating if there's more than one.
10255 MFI->incNumLocalDynamicTLSAccesses();
10256
10257 // The call needs a relocation too for linker relaxation. It doesn't make
10258 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10259 // the address.
10260 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
10261 AArch64II::MO_TLS);
10262
10263 // Now we can calculate the offset from TPIDR_EL0 to this module's
10264 // thread-local area.
10265 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10266
10267 // Now use :dtprel_whatever: operations to calculate this variable's offset
10268 // in its thread-storage area.
10269 SDValue HiVar = DAG.getTargetGlobalAddress(
10270 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10271 SDValue LoVar = DAG.getTargetGlobalAddress(
10272 GV, DL, MVT::i64, 0,
10273 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10274
10275 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
10276 DAG.getTargetConstant(0, DL, MVT::i32)),
10277 0);
10278 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
10279 DAG.getTargetConstant(0, DL, MVT::i32)),
10280 0);
10281 } else if (Model == TLSModel::GeneralDynamic) {
10282 // The call needs a relocation too for linker relaxation. It doesn't make
10283 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10284 // the address.
10285 SDValue SymAddr =
10286 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10287
10288 // Finally we can make a call to calculate the offset from tpidr_el0.
10289 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10290 } else
10291 llvm_unreachable("Unsupported ELF TLS access model");
10292
10293 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10294}
10295
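// A rough sketch of the access sequence built by the function below (register
// names and exact instruction forms are illustrative only):
//   ldr x8, [x18, #0x58]       ; ThreadLocalStoragePointer from the TEB
//   adrp/ldr w9, _tls_index    ; this module's index into the TLS array
//   ldr x8, [x8, x9, lsl #3]   ; base of the module's TLS data area
//   add x0, x8, <offset of var in .tls>   ; two ADDs (hi12 + lo12 parts)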
10296SDValue
10297AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
10298 SelectionDAG &DAG) const {
10299 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
10300
10301 SDValue Chain = DAG.getEntryNode();
10302 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10303 SDLoc DL(Op);
10304
10305 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
10306
10307 // Load the ThreadLocalStoragePointer from the TEB
10308 // A pointer to the TLS array is located at offset 0x58 from the TEB.
10309 SDValue TLSArray =
10310 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
10311 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
10312 Chain = TLSArray.getValue(1);
10313
10314 // Load the TLS index from the C runtime;
10315 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
10316 // This also does the same as LOADgot, but using a generic i32 load,
10317 // while LOADgot only loads i64.
10318 SDValue TLSIndexHi =
10319 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
10320 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
10321 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10322 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
10323 SDValue TLSIndex =
10324 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
10325 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
10326 Chain = TLSIndex.getValue(1);
10327
10328 // The pointer to the thread's TLS data area is at the offset (TLS index * 8)
10329 // into the TLSArray.
10330 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
10331 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
10332 DAG.getConstant(3, DL, PtrVT));
10333 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
10334 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
10335 MachinePointerInfo());
10336 Chain = TLS.getValue(1);
10337
10338 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10339 const GlobalValue *GV = GA->getGlobal();
10340 SDValue TGAHi = DAG.getTargetGlobalAddress(
10341 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10342 SDValue TGALo = DAG.getTargetGlobalAddress(
10343 GV, DL, PtrVT, 0,
10344 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10345
10346 // Add the offset from the start of the .tls section (section base).
10347 SDValue Addr =
10348 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
10349 DAG.getTargetConstant(0, DL, MVT::i32)),
10350 0);
10351 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
10352 return Addr;
10353}
10354
10355SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
10356 SelectionDAG &DAG) const {
10357 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10358 if (DAG.getTarget().useEmulatedTLS())
10359 return LowerToTLSEmulatedModel(GA, DAG);
10360
10361 if (Subtarget->isTargetDarwin())
10362 return LowerDarwinGlobalTLSAddress(Op, DAG);
10363 if (Subtarget->isTargetELF())
10364 return LowerELFGlobalTLSAddress(Op, DAG);
10365 if (Subtarget->isTargetWindows())
10366 return LowerWindowsGlobalTLSAddress(Op, DAG);
10367
10368 llvm_unreachable("Unexpected platform trying to use TLS");
10369}
10370
10371//===----------------------------------------------------------------------===//
10372// PtrAuthGlobalAddress lowering
10373//
10374// We have 3 lowering alternatives to choose from:
10375// - MOVaddrPAC: similar to MOVaddr, with added PAC.
10376// If the GV doesn't need a GOT load (i.e., is locally defined)
10377// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
10378//
10379// - LOADgotPAC: similar to LOADgot, with added PAC.
10380// If the GV needs a GOT load, materialize the pointer using the usual
10381// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
10382// section is assumed to be read-only (for example, via relro mechanism). See
10383// LowerMOVaddrPAC.
10384//
10385// - LOADauthptrstatic: similar to LOADgot, but use a
10386// special stub slot instead of a GOT slot.
10387// Load a signed pointer for symbol 'sym' from a stub slot named
10388// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
10389// resolving. This usually lowers to adrp+ldr, but also emits an entry into
10390// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
10391//
10392// All 3 are pseudos that are expand late to longer sequences: this lets us
10393// provide integrity guarantees on the to-be-signed intermediate values.
10394//
10395// LOADauthptrstatic is undesirable because it requires a large section filled
10396// with often similarly-signed pointers, making it a good harvesting target.
10397// Thus, it's only used for ptrauth references to extern_weak to avoid null
10398// checks.
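// In short, an illustrative summary of the selection logic implemented below:
//   locally-defined GV        -> MOVaddrPAC        (adrp+add, then sign)
//   GV requiring a GOT load   -> LOADgotPAC        (GOT adrp+ldr, then sign)
//   extern_weak GV            -> LOADauthptrstatic (load sym$auth_ptr$key$disc stub)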
10399
10400 static SDValue LowerPtrAuthGlobalAddressStatically(
10401 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
10402 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
10403 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
10404 assert(TGN->getGlobal()->hasExternalWeakLinkage());
10405
10406 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
10407 // offset alone as a pointer if the symbol wasn't available, which would
10408 // probably break null checks in users. Ptrauth complicates things further:
10409 // error out.
10410 if (TGN->getOffset() != 0)
10411 report_fatal_error(
10412 "unsupported non-zero offset in weak ptrauth global reference");
10413
10414 if (!isNullConstant(AddrDiscriminator))
10415 report_fatal_error("unsupported weak addr-div ptrauth global");
10416
10417 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10418 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
10419 {TGA, Key, Discriminator}),
10420 0);
10421}
10422
10423SDValue
10424AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
10425 SelectionDAG &DAG) const {
10426 SDValue Ptr = Op.getOperand(0);
10427 uint64_t KeyC = Op.getConstantOperandVal(1);
10428 SDValue AddrDiscriminator = Op.getOperand(2);
10429 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
10430 EVT VT = Op.getValueType();
10431 SDLoc DL(Op);
10432
10433 if (KeyC > AArch64PACKey::LAST)
10434 report_fatal_error("key in ptrauth global out of range [0, " +
10435 Twine((int)AArch64PACKey::LAST) + "]");
10436
10437 // Blend only works if the integer discriminator is 16-bit wide.
10438 if (!isUInt<16>(DiscriminatorC))
10439 report_fatal_error(
10440 "constant discriminator in ptrauth global out of range [0, 0xffff]");
10441
10442 // Choosing between 3 lowering alternatives is target-specific.
10443 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
10444 report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
10445
10446 int64_t PtrOffsetC = 0;
10447 if (Ptr.getOpcode() == ISD::ADD) {
10448 PtrOffsetC = Ptr.getConstantOperandVal(1);
10449 Ptr = Ptr.getOperand(0);
10450 }
10451 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
10452 const GlobalValue *PtrGV = PtrN->getGlobal();
10453
10454 // Classify the reference to determine whether it needs a GOT load.
10455 const unsigned OpFlags =
10456 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
10457 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
10458 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
10459 "unsupported non-GOT op flags on ptrauth global reference");
10460
10461 // Fold any offset into the GV; our pseudos expect it there.
10462 PtrOffsetC += PtrN->getOffset();
10463 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
10464 /*TargetFlags=*/0);
10465 assert(PtrN->getTargetFlags() == 0 &&
10466 "unsupported target flags on ptrauth global");
10467
10468 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10469 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
10470 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
10471 ? AddrDiscriminator
10472 : DAG.getRegister(AArch64::XZR, MVT::i64);
10473
10474 // No GOT load needed -> MOVaddrPAC
10475 if (!NeedsGOTLoad) {
10476 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
10477 return SDValue(
10478 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
10479 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10480 0);
10481 }
10482
10483 // GOT load -> LOADgotPAC
10484 // Note that we disallow extern_weak refs to avoid null checks later.
10485 if (!PtrGV->hasExternalWeakLinkage())
10486 return SDValue(
10487 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
10488 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10489 0);
10490
10491 // extern_weak ref -> LOADauthptrstatic
10492 return LowerPtrAuthGlobalAddressStatically(
10493 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
10494 DAG);
10495}
10496
10497// Looks through \param Val to determine the bit that can be used to
10498// check the sign of the value. It returns the unextended value and
10499// the sign bit position.
10500std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
10501 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
10502 return {Val.getOperand(0),
10503 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
10504 1};
10505
10506 if (Val.getOpcode() == ISD::SIGN_EXTEND)
10507 return {Val.getOperand(0),
10508 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
10509
10510 return {Val, Val.getValueSizeInBits() - 1};
10511}
10512
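// For example (illustrative): given (sign_extend_inreg %x, i8) the helper
// above returns {%x, 7}, and for a plain i64 value it returns {value, 63}.
// LowerBR_CC below uses the returned bit position to emit TBZ/TBNZ sign tests
// for comparisons against 0 or -1.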
10513SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
10514 SDValue Chain = Op.getOperand(0);
10515 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
10516 SDValue LHS = Op.getOperand(2);
10517 SDValue RHS = Op.getOperand(3);
10518 SDValue Dest = Op.getOperand(4);
10519 SDLoc dl(Op);
10520
10522 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
10523 // will not be produced, as they are conditional branch instructions that do
10524 // not set flags.
10525 bool ProduceNonFlagSettingCondBr =
10526 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
10527
10528 // Handle f128 first, since lowering it will result in comparing the return
10529 // value of a libcall against zero, which is just what the rest of LowerBR_CC
10530 // is expecting to deal with.
10531 if (LHS.getValueType() == MVT::f128) {
10532 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
10533
10534 // If softenSetCCOperands returned a scalar, we need to compare the result
10535 // against zero to select between true and false values.
10536 if (!RHS.getNode()) {
10537 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10538 CC = ISD::SETNE;
10539 }
10540 }
10541
10542 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
10543 // instruction.
10544 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
10545 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10546 // Only lower legal XALUO ops.
10547 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
10548 return SDValue();
10549
10550 // The actual operation with overflow check.
10551 AArch64CC::CondCode OFCC;
10552 SDValue Value, Overflow;
10553 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
10554
10555 if (CC == ISD::SETNE)
10556 OFCC = getInvertedCondCode(OFCC);
10557 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
10558
10559 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
10560 Overflow);
10561 }
10562
10563 if (LHS.getValueType().isInteger()) {
10564 assert((LHS.getValueType() == RHS.getValueType()) &&
10565 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
10566
10567 // If the RHS of the comparison is zero, we can potentially fold this
10568 // to a specialized branch.
10569 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
10570 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
10571 if (CC == ISD::SETEQ) {
10572 // See if we can use a TBZ to fold in an AND as well.
10573 // TBZ has a smaller branch displacement than CBZ. If the offset is
10574 // out of bounds, a late MI-layer pass rewrites branches.
10575 // 403.gcc is an example that hits this case.
10576 if (LHS.getOpcode() == ISD::AND &&
10577 isa<ConstantSDNode>(LHS.getOperand(1)) &&
10578 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
10579 SDValue Test = LHS.getOperand(0);
10580 uint64_t Mask = LHS.getConstantOperandVal(1);
10581 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
10582 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
10583 Dest);
10584 }
10585
10586 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
10587 } else if (CC == ISD::SETNE) {
10588 // See if we can use a TBZ to fold in an AND as well.
10589 // TBZ has a smaller branch displacement than CBZ. If the offset is
10590 // out of bounds, a late MI-layer pass rewrites branches.
10591 // 403.gcc is an example that hits this case.
10592 if (LHS.getOpcode() == ISD::AND &&
10593 isa<ConstantSDNode>(LHS.getOperand(1)) &&
10594 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
10595 SDValue Test = LHS.getOperand(0);
10596 uint64_t Mask = LHS.getConstantOperandVal(1);
10597 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
10598 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
10599 Dest);
10600 }
10601
10602 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
10603 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
10604 // Don't combine AND since emitComparison converts the AND to an ANDS
10605 // (a.k.a. TST) and the test in the test bit and branch instruction
10606 // becomes redundant. This would also increase register pressure.
10607 uint64_t SignBitPos;
10608 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
10609 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
10610 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
10611 }
10612 }
10613 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
10614 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
10615 // Don't combine AND since emitComparison converts the AND to an ANDS
10616 // (a.k.a. TST) and the test in the test bit and branch instruction
10617 // becomes redundant. This would also increase register pressure.
10618 uint64_t SignBitPos;
10619 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
10620 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
10621 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
10622 }
10623
10624 SDValue CCVal;
10625 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
10626 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
10627 Cmp);
10628 }
10629
10630 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
10631 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
10632
10633 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10634 // clean. Some of them require two branches to implement.
10635 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
10636 AArch64CC::CondCode CC1, CC2;
10637 changeFPCCToAArch64CC(CC, CC1, CC2);
10638 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10639 SDValue BR1 =
10640 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
10641 if (CC2 != AArch64CC::AL) {
10642 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
10643 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
10644 Cmp);
10645 }
10646
10647 return BR1;
10648}
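// Illustrative note (annotation, not upstream code): for a branch on
// (x & 8) == 0 the power-of-two AND mask is folded into the branch as a
// bit test, e.g.
//   tbz x0, #3, L    ; Log2_64(8) == 3, taken when bit 3 is clear
// while the SETNE form uses TBNZ. Without a foldable AND, the zero
// comparisons fall back to CBZ/CBNZ.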
10649
10650SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
10651 SelectionDAG &DAG) const {
10652 if (!Subtarget->isNeonAvailable() &&
10653 !Subtarget->useSVEForFixedLengthVectors())
10654 return SDValue();
10655
10656 EVT VT = Op.getValueType();
10657 EVT IntVT = VT.changeTypeToInteger();
10658 SDLoc DL(Op);
10659
10660 SDValue In1 = Op.getOperand(0);
10661 SDValue In2 = Op.getOperand(1);
10662 EVT SrcVT = In2.getValueType();
10663
10664 if (!SrcVT.bitsEq(VT))
10665 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
10666
10667 if (VT.isScalableVector())
10668 IntVT =
10669 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
10670
10671 if (VT.isFixedLengthVector() &&
10672 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
10673 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
10674
10675 In1 = convertToScalableVector(DAG, ContainerVT, In1);
10676 In2 = convertToScalableVector(DAG, ContainerVT, In2);
10677
10678 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
10679 return convertFromScalableVector(DAG, VT, Res);
10680 }
10681
10682 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
10683 if (VT.isScalableVector())
10684 return getSVESafeBitCast(VT, Op, DAG);
10685
10686 return DAG.getBitcast(VT, Op);
10687 };
10688
10689 SDValue VecVal1, VecVal2;
10690 EVT VecVT;
10691 auto SetVecVal = [&](int Idx = -1) {
10692 if (!VT.isVector()) {
10693 VecVal1 =
10694 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
10695 VecVal2 =
10696 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
10697 } else {
10698 VecVal1 = BitCast(VecVT, In1, DAG);
10699 VecVal2 = BitCast(VecVT, In2, DAG);
10700 }
10701 };
10702 if (VT.isVector()) {
10703 VecVT = IntVT;
10704 SetVecVal();
10705 } else if (VT == MVT::f64) {
10706 VecVT = MVT::v2i64;
10707 SetVecVal(AArch64::dsub);
10708 } else if (VT == MVT::f32) {
10709 VecVT = MVT::v4i32;
10710 SetVecVal(AArch64::ssub);
10711 } else if (VT == MVT::f16 || VT == MVT::bf16) {
10712 VecVT = MVT::v8i16;
10713 SetVecVal(AArch64::hsub);
10714 } else {
10715 llvm_unreachable("Invalid type for copysign!");
10716 }
10717
10718 unsigned BitWidth = In1.getScalarValueSizeInBits();
10719 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
10720
10721 // We want to materialize a mask with every bit but the high bit set, but the
10722 // AdvSIMD immediate moves cannot materialize that in a single instruction for
10723 // 64-bit elements. Instead, materialize all bits set and then negate that.
10724 if (VT == MVT::f64 || VT == MVT::v2f64) {
10725 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
10726 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
10727 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
10728 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
10729 }
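// Rough worked example of the mask trick above (annotation, not upstream
// code): the desired 64-bit mask is ~SignMask, i.e. 0x7FFFFFFFFFFFFFFF per
// lane, which AdvSIMD immediate moves cannot build directly. Materializing
// all-ones, viewing the lanes as f64 and applying FNEG flips only the sign
// bit, leaving exactly 0x7FFFFFFFFFFFFFFF. The BSP below then takes the
// magnitude bits from VecVal1 and the sign bit from VecVal2.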
10730
10731 SDValue BSP =
10732 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
10733 if (VT == MVT::f16 || VT == MVT::bf16)
10734 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
10735 if (VT == MVT::f32)
10736 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
10737 if (VT == MVT::f64)
10738 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
10739
10740 return BitCast(VT, BSP, DAG);
10741}
10742
10743SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
10744 SelectionDAG &DAG) const {
10745 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
10746 Attribute::NoImplicitFloat))
10747 return SDValue();
10748
10749 EVT VT = Op.getValueType();
10750 if (VT.isScalableVector() ||
10751 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
10752 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
10753
10754 if (!Subtarget->isNeonAvailable())
10755 return SDValue();
10756
10757 bool IsParity = Op.getOpcode() == ISD::PARITY;
10758 SDValue Val = Op.getOperand(0);
10759 SDLoc DL(Op);
10760
10761 // For i32, the generic parity expansion using EORs is more efficient than
10762 // going through the floating-point/AdvSIMD path below.
10763 if (VT == MVT::i32 && IsParity)
10764 return SDValue();
10765
10766 // While there is no integer popcount instruction on the GPR side, i32/i64
10767 // CTPOP can be more efficiently lowered to the following sequence that uses
10768 // AdvSIMD registers/instructions as long as the copies to/from
10769 // the AdvSIMD registers are cheap.
10770 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
10771 // CNT V0.8B, V0.8B // 8xbyte pop-counts
10772 // ADDV B0, V0.8B // sum 8xbyte pop-counts
10773 // FMOV X0, D0 // copy result back to integer reg
10774 if (VT == MVT::i32 || VT == MVT::i64) {
10775 if (VT == MVT::i32)
10776 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
10777 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
10778
10779 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
10780 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
10781 if (VT == MVT::i32)
10782 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV,
10783 DAG.getConstant(0, DL, MVT::i64));
10784 AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
10785 if (IsParity)
10786 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
10787 return AddV;
10788 } else if (VT == MVT::i128) {
10789 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
10790
10791 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
10792 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
10793 AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
10794 if (IsParity)
10795 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
10796 return AddV;
10797 }
10798
10799 assert(!IsParity && "ISD::PARITY of vector types not supported");
10800
10801 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
10802 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
10803 "Unexpected type for custom ctpop lowering");
10804
10805 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
10806 Val = DAG.getBitcast(VT8Bit, Val);
10807 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
10808
10809 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
10810 VT.getVectorNumElements() >= 2) {
10811 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
10812 SDValue Zeros = DAG.getConstant(0, DL, DT);
10813 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
10814
10815 if (VT == MVT::v2i64) {
10816 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10817 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
10818 } else if (VT == MVT::v2i32) {
10819 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10820 } else if (VT == MVT::v4i32) {
10821 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10822 } else {
10823 llvm_unreachable("Unexpected type for custom ctpop lowering");
10824 }
10825
10826 return Val;
10827 }
10828
10829 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
10830 unsigned EltSize = 8;
10831 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
10832 while (EltSize != VT.getScalarSizeInBits()) {
10833 EltSize *= 2;
10834 NumElts /= 2;
10835 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
10836 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
10837 }
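// For illustration (annotation, not upstream code): a v4i32 CTPOP first
// takes per-byte counts with CNT on the v16i8 view and is then widened by
// pairwise additions, roughly
//   uaddlp v0.8h, v0.16b   ; 16 x i8 -> 8 x i16
//   uaddlp v0.4s, v0.8h    ; 8 x i16 -> 4 x i32
// When UDOT is available, the dot-product path above instead sums the four
// byte counts of each 32-bit lane in a single instruction.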
10838
10839 return Val;
10840}
10841
10842SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
10843 EVT VT = Op.getValueType();
10844 assert(VT.isScalableVector() ||
10845 useSVEForFixedLengthVectorVT(
10846 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
10847
10848 SDLoc DL(Op);
10849 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
10850 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
10851}
10852
10853SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
10854 SelectionDAG &DAG) const {
10855
10856 EVT VT = Op.getValueType();
10857 SDLoc DL(Op);
10858 unsigned Opcode = Op.getOpcode();
10859 ISD::CondCode CC;
10860 switch (Opcode) {
10861 default:
10862 llvm_unreachable("Wrong instruction");
10863 case ISD::SMAX:
10864 CC = ISD::SETGT;
10865 break;
10866 case ISD::SMIN:
10867 CC = ISD::SETLT;
10868 break;
10869 case ISD::UMAX:
10870 CC = ISD::SETUGT;
10871 break;
10872 case ISD::UMIN:
10873 CC = ISD::SETULT;
10874 break;
10875 }
10876
10877 if (VT.isScalableVector() ||
10878 useSVEForFixedLengthVectorVT(
10879 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
10880 switch (Opcode) {
10881 default:
10882 llvm_unreachable("Wrong instruction");
10883 case ISD::SMAX:
10884 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
10885 case ISD::SMIN:
10886 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
10887 case ISD::UMAX:
10888 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
10889 case ISD::UMIN:
10890 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
10891 }
10892 }
10893
10894 SDValue Op0 = Op.getOperand(0);
10895 SDValue Op1 = Op.getOperand(1);
10896 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
10897 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
10898}
10899
10900SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
10901 SelectionDAG &DAG) const {
10902 EVT VT = Op.getValueType();
10903
10904 if (VT.isScalableVector() ||
10905 useSVEForFixedLengthVectorVT(
10906 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
10907 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
10908
10909 SDLoc DL(Op);
10910 SDValue REVB;
10911 MVT VST;
10912
10913 switch (VT.getSimpleVT().SimpleTy) {
10914 default:
10915 llvm_unreachable("Invalid type for bitreverse!");
10916
10917 case MVT::v2i32: {
10918 VST = MVT::v8i8;
10919 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
10920
10921 break;
10922 }
10923
10924 case MVT::v4i32: {
10925 VST = MVT::v16i8;
10926 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
10927
10928 break;
10929 }
10930
10931 case MVT::v1i64: {
10932 VST = MVT::v8i8;
10933 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
10934
10935 break;
10936 }
10937
10938 case MVT::v2i64: {
10939 VST = MVT::v16i8;
10940 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
10941
10942 break;
10943 }
10944 }
10945
10946 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
10947 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
10948}
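// For illustration (annotation, not upstream code): bit-reversing a 32-bit
// lane is decomposed as "reverse the bytes within each lane" (REV32 on the
// i8 view) followed by "reverse the bits within each byte" (the vector
// ISD::BITREVERSE, which selects to RBIT on the byte vector); byte reversal
// composed with per-byte bit reversal yields the fully reversed element.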
10949
10950// Check whether N forms a continuous comparison sequence (a chain of ORs over XOR leaves).
10951static bool
10952isOrXorChain(SDValue N, unsigned &Num,
10953 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
10954 if (Num == MaxXors)
10955 return false;
10956
10957 // Skip the one-use zext
10958 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
10959 N = N->getOperand(0);
10960
10961 // The leaf node must be XOR
10962 if (N->getOpcode() == ISD::XOR) {
10963 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
10964 Num++;
10965 return true;
10966 }
10967
10968 // All the non-leaf nodes must be OR.
10969 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
10970 return false;
10971
10972 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
10973 isOrXorChain(N->getOperand(1), Num, WorkList))
10974 return true;
10975 return false;
10976}
10977
10978// Transform chains of ORs and XORs, which are usually produced by memcmp/bcmp.
10979static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
10980 SDValue LHS = N->getOperand(0);
10981 SDValue RHS = N->getOperand(1);
10982 SDLoc DL(N);
10983 EVT VT = N->getValueType(0);
10984 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
10985
10986 // Only handle integer compares.
10987 if (N->getOpcode() != ISD::SETCC)
10988 return SDValue();
10989
10990 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
10991 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
10992 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
10993 unsigned NumXors = 0;
10994 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
10995 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
10996 isOrXorChain(LHS, NumXors, WorkList)) {
10997 SDValue XOR0, XOR1;
10998 std::tie(XOR0, XOR1) = WorkList[0];
10999 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
11000 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11001 for (unsigned I = 1; I < WorkList.size(); I++) {
11002 std::tie(XOR0, XOR1) = WorkList[I];
11003 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11004 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11005 }
11006
11007 // Exit early by inverting the condition, which helps reduce indentation.
11008 return Cmp;
11009 }
11010
11011 return SDValue();
11012}
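// A rough example of the combine above (annotation, not upstream code;
// registers purely illustrative): a memcmp-style equality test
//   ((a0 ^ b0) | (a1 ^ b1)) == 0
// is rewritten as (a0 == b0) && (a1 == b1), which the later CCMP formation
// can typically select as
//   cmp  x0, x2
//   ccmp x1, x3, #0, eq
//   cset w0, eq
// avoiding the XOR/OR data-dependency chain on the ALU ports.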
11013
11014SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11015
11016 if (Op.getValueType().isVector())
11017 return LowerVSETCC(Op, DAG);
11018
11019 bool IsStrict = Op->isStrictFPOpcode();
11020 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11021 unsigned OpNo = IsStrict ? 1 : 0;
11022 SDValue Chain;
11023 if (IsStrict)
11024 Chain = Op.getOperand(0);
11025 SDValue LHS = Op.getOperand(OpNo + 0);
11026 SDValue RHS = Op.getOperand(OpNo + 1);
11027 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11028 SDLoc dl(Op);
11029
11030 // We chose ZeroOrOneBooleanContents, so use zero and one.
11031 EVT VT = Op.getValueType();
11032 SDValue TVal = DAG.getConstant(1, dl, VT);
11033 SDValue FVal = DAG.getConstant(0, dl, VT);
11034
11035 // Handle f128 first, since one possible outcome is a normal integer
11036 // comparison which gets picked up by the next if statement.
11037 if (LHS.getValueType() == MVT::f128) {
11038 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
11039 IsSignaling);
11040
11041 // If softenSetCCOperands returned a scalar, use it.
11042 if (!RHS.getNode()) {
11043 assert(LHS.getValueType() == Op.getValueType() &&
11044 "Unexpected setcc expansion!");
11045 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
11046 }
11047 }
11048
11049 if (LHS.getValueType().isInteger()) {
11050
11051 simplifySetCCIntoEq(CC, LHS, RHS, DAG, dl);
11052
11053 SDValue CCVal;
11054 SDValue Cmp = getAArch64Cmp(
11055 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
11056
11057 // Note that we inverted the condition above, so we reverse the order of
11058 // the true and false operands here. This will allow the setcc to be
11059 // matched to a single CSINC instruction.
11060 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
11061 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
11062 }
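// For illustration (annotation, not upstream code): with the inverted
// condition and the swapped 0/1 operands above, an integer (x == y) setcc
// becomes CSEL(0, 1, ne), which matches a CSINC of wzr, i.e. the canonical
//   cmp  x0, x1
//   cset w0, eq     ; alias of csinc w0, wzr, wzr, ne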
11063
11064 // Now we know we're dealing with FP values.
11065 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11066 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11067
11068 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11069 // and do the comparison.
11070 SDValue Cmp;
11071 if (IsStrict)
11072 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
11073 else
11074 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
11075
11076 AArch64CC::CondCode CC1, CC2;
11077 changeFPCCToAArch64CC(CC, CC1, CC2);
11078 SDValue Res;
11079 if (CC2 == AArch64CC::AL) {
11080 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
11081 CC2);
11082 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
11083
11084 // Note that we inverted the condition above, so we reverse the order of
11085 // the true and false operands here. This will allow the setcc to be
11086 // matched to a single CSINC instruction.
11087 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
11088 } else {
11089 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
11090 // totally clean. Some of them require two CSELs to implement. As is in
11091 // this case, we emit the first CSEL and then emit a second using the output
11092 // of the first as the RHS. We're effectively OR'ing the two CC's together.
11093
11094 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
11095 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
11096 SDValue CS1 =
11097 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
11098
11099 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
11100 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
11101 }
11102 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
11103}
11104
11105SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
11106 SelectionDAG &DAG) const {
11107
11108 SDValue LHS = Op.getOperand(0);
11109 SDValue RHS = Op.getOperand(1);
11110 EVT VT = LHS.getValueType();
11111 if (VT != MVT::i32 && VT != MVT::i64)
11112 return SDValue();
11113
11114 SDLoc DL(Op);
11115 SDValue Carry = Op.getOperand(2);
11116 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
11117 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
11118 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
11119 LHS, RHS, InvCarry);
11120
11121 EVT OpVT = Op.getValueType();
11122 SDValue TVal = DAG.getConstant(1, DL, OpVT);
11123 SDValue FVal = DAG.getConstant(0, DL, OpVT);
11124
11125 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
11126 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
11127 SDValue CCVal =
11128 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
11129 // Inputs are swapped because the condition is inverted. This will allow
11130 // matching with a single CSINC instruction.
11131 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
11132 Cmp.getValue(1));
11133}
11134
11135SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
11136 SDValue RHS, SDValue TVal,
11137 SDValue FVal, const SDLoc &dl,
11138 SelectionDAG &DAG) const {
11139 // Handle f128 first, because it will result in a comparison of some RTLIB
11140 // call result against zero.
11141 if (LHS.getValueType() == MVT::f128) {
11142 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
11143
11144 // If softenSetCCOperands returned a scalar, we need to compare the result
11145 // against zero to select between true and false values.
11146 if (!RHS.getNode()) {
11147 RHS = DAG.getConstant(0, dl, LHS.getValueType());
11148 CC = ISD::SETNE;
11149 }
11150 }
11151
11152 // Also handle f16, for which we need to do a f32 comparison.
11153 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
11154 LHS.getValueType() == MVT::bf16) {
11155 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
11156 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
11157 }
11158
11159 // Next, handle integers.
11160 if (LHS.getValueType().isInteger()) {
11161 assert((LHS.getValueType() == RHS.getValueType()) &&
11162 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11163
11164 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
11165 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
11166 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11167 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
11168 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
11169 // supported types.
11170 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
11171 CTVal->isOne() && CFVal->isAllOnes() &&
11172 LHS.getValueType() == TVal.getValueType()) {
11173 EVT VT = LHS.getValueType();
11174 SDValue Shift =
11175 DAG.getNode(ISD::SRA, dl, VT, LHS,
11176 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
11177 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
11178 }
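// Worked example of the transform above (annotation, not upstream code):
// for i32, select(x > -1, 1, -1) becomes (x >> 31) | 1 with an arithmetic
// shift: x >= 0 yields 0 | 1 == 1 and x < 0 yields 0xFFFFFFFF | 1 == -1,
// so the compare-and-select pair collapses to ASR + ORR.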
11179
11180 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
11181 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
11182 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
11183 // Both require less instructions than compare and conditional select.
11184 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
11185 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
11186 LHS.getValueType() == RHS.getValueType()) {
11187 EVT VT = LHS.getValueType();
11188 SDValue Shift =
11189 DAG.getNode(ISD::SRA, dl, VT, LHS,
11190 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
11191
11192 if (CC == ISD::SETGT)
11193 Shift = DAG.getNOT(dl, Shift, VT);
11194
11195 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
11196 }
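// For illustration (annotation, not upstream code), with s = x >> 31 for
// i32: smin(x, 0) == x & s, since s is all-ones exactly when x is negative,
// and smax(x, 0) == x & ~s, which selects to BIC; both avoid a CMP + CSEL.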
11197
11198 unsigned Opcode = AArch64ISD::CSEL;
11199
11200 // If both the TVal and the FVal are constants, see if we can swap them in
11201 // order to form a CSINV or CSINC out of them.
11202 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
11203 std::swap(TVal, FVal);
11204 std::swap(CTVal, CFVal);
11205 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11206 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
11207 std::swap(TVal, FVal);
11208 std::swap(CTVal, CFVal);
11209 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11210 } else if (TVal.getOpcode() == ISD::XOR) {
11211 // If TVal is a NOT we want to swap TVal and FVal so that we can match
11212 // with a CSINV rather than a CSEL.
11213 if (isAllOnesConstant(TVal.getOperand(1))) {
11214 std::swap(TVal, FVal);
11215 std::swap(CTVal, CFVal);
11216 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11217 }
11218 } else if (TVal.getOpcode() == ISD::SUB) {
11219 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
11220 // that we can match with a CSNEG rather than a CSEL.
11221 if (isNullConstant(TVal.getOperand(0))) {
11222 std::swap(TVal, FVal);
11223 std::swap(CTVal, CFVal);
11224 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11225 }
11226 } else if (CTVal && CFVal) {
11227 const int64_t TrueVal = CTVal->getSExtValue();
11228 const int64_t FalseVal = CFVal->getSExtValue();
11229 bool Swap = false;
11230
11231 // If both TVal and FVal are constants, see if FVal is the
11232 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
11233 // instead of a CSEL in that case.
11234 if (TrueVal == ~FalseVal) {
11235 Opcode = AArch64ISD::CSINV;
11236 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
11237 TrueVal == -FalseVal) {
11238 Opcode = AArch64ISD::CSNEG;
11239 } else if (TVal.getValueType() == MVT::i32) {
11240 // If our operands are only 32-bit wide, make sure we use 32-bit
11241 // arithmetic for the check whether we can use CSINC. This ensures that
11242 // the addition in the check will wrap around properly in case there is
11243 // an overflow (which would not be the case if we do the check with
11244 // 64-bit arithmetic).
11245 const uint32_t TrueVal32 = CTVal->getZExtValue();
11246 const uint32_t FalseVal32 = CFVal->getZExtValue();
11247
11248 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
11249 Opcode = AArch64ISD::CSINC;
11250
11251 if (TrueVal32 > FalseVal32) {
11252 Swap = true;
11253 }
11254 }
11255 } else {
11256 // 64-bit check whether we can use CSINC.
11257 const uint64_t TrueVal64 = TrueVal;
11258 const uint64_t FalseVal64 = FalseVal;
11259
11260 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
11261 Opcode = AArch64ISD::CSINC;
11262
11263 if (TrueVal > FalseVal) {
11264 Swap = true;
11265 }
11266 }
11267 }
11268
11269 // Swap TVal and FVal if necessary.
11270 if (Swap) {
11271 std::swap(TVal, FVal);
11272 std::swap(CTVal, CFVal);
11273 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11274 }
11275
11276 if (Opcode != AArch64ISD::CSEL) {
11277 // Drop FVal since we can get its value by simply inverting/negating
11278 // TVal.
11279 FVal = TVal;
11280 }
11281 }
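// Example of why the 32-bit check above matters (annotation, not upstream
// code): for i32 select(cc, 0x7fffffff, 0x80000000) the relation
// TrueVal32 + 1 == FalseVal32 holds because the addition overflows from
// INT32_MAX to INT32_MIN, so a single materialized 0x7fffffff plus a CSINC
// suffices; with sign-extended 64-bit values the relation fails and the
// CSINC opportunity would be missed.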
11282
11283 // Avoid materializing a constant when possible by reusing a known value in
11284 // a register. However, don't perform this optimization if the known value
11285 // is one, zero or negative one in the case of a CSEL. We can always
11286 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
11287 // FVal, respectively.
11288 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
11289 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
11290 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
11291 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11292 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
11293 // "a != C ? x : a" to avoid materializing C.
11294 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
11295 TVal = LHS;
11296 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
11297 FVal = LHS;
11298 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
11299 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
11300 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
11301 // avoid materializing C.
11302 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11303 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
11304 Opcode = AArch64ISD::CSINV;
11305 TVal = LHS;
11306 FVal = DAG.getConstant(0, dl, FVal.getValueType());
11307 }
11308 }
11309
11310 SDValue CCVal;
11311 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
11312 EVT VT = TVal.getValueType();
11313 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
11314 }
11315
11316 // Now we know we're dealing with FP values.
11317 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
11318 LHS.getValueType() == MVT::f64);
11319 assert(LHS.getValueType() == RHS.getValueType());
11320 EVT VT = TVal.getValueType();
11321 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
11322
11323 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11324 // clean. Some of them require two CSELs to implement.
11325 AArch64CC::CondCode CC1, CC2;
11326 changeFPCCToAArch64CC(CC, CC1, CC2);
11327
11328 if (DAG.getTarget().Options.UnsafeFPMath) {
11329 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
11330 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
11331 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
11332 if (RHSVal && RHSVal->isZero()) {
11333 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
11334 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
11335
11336 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
11337 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
11338 TVal = LHS;
11339 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
11340 CFVal && CFVal->isZero() &&
11341 FVal.getValueType() == LHS.getValueType())
11342 FVal = LHS;
11343 }
11344 }
11345
11346 // Emit first, and possibly only, CSEL.
11347 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
11348 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
11349
11350 // If we need a second CSEL, emit it, using the output of the first as the
11351 // RHS. We're effectively OR'ing the two CC's together.
11352 if (CC2 != AArch64CC::AL) {
11353 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
11354 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
11355 }
11356
11357 // Otherwise, return the output of the first CSEL.
11358 return CS1;
11359}
11360
11361SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
11362 SelectionDAG &DAG) const {
11363 EVT Ty = Op.getValueType();
11364 auto Idx = Op.getConstantOperandAPInt(2);
11365 int64_t IdxVal = Idx.getSExtValue();
11366 assert(Ty.isScalableVector() &&
11367 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
11368
11369 // We can use the splice instruction for certain index values where we are
11370 // able to efficiently generate the correct predicate. The index will be
11371 // inverted and used directly as the input to the ptrue instruction, i.e.
11372 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
11373 // splice predicate. However, we can only do this if we can guarantee that
11374 // there are enough elements in the vector, hence we check the index <= min
11375 // number of elements.
11376 std::optional<unsigned> PredPattern;
11377 if (Ty.isScalableVector() && IdxVal < 0 &&
11378 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
11379 std::nullopt) {
11380 SDLoc DL(Op);
11381
11382 // Create a predicate where all but the last -IdxVal elements are false.
11383 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
11384 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
11385 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
11386
11387 // Now splice the two inputs together using the predicate.
11388 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
11389 Op.getOperand(1));
11390 }
11391
11392 // We can select to an EXT instruction when indexing the first 256 bytes.
11394 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
11395 return Op;
11396
11397 return SDValue();
11398}
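// Illustrative example (annotation, not upstream code): a trailing splice
// such as vector_splice(a, b, -2) on nxv4i32 is built roughly as
//   ptrue  p0.s, vl2              ; first two lanes active
//   rev    p0.s, p0.s             ; ...moved to the last two lanes
//   splice z0.s, p0, z0.s, z1.s   ; last two elements of a, then b
// while small non-negative indices are left as VECTOR_SPLICE and select to
// an EXT instruction.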
11399
11400SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
11401 SelectionDAG &DAG) const {
11402 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
11403 SDValue LHS = Op.getOperand(0);
11404 SDValue RHS = Op.getOperand(1);
11405 SDValue TVal = Op.getOperand(2);
11406 SDValue FVal = Op.getOperand(3);
11407 SDLoc DL(Op);
11408 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
11409}
11410
11411SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
11412 SelectionDAG &DAG) const {
11413 SDValue CCVal = Op->getOperand(0);
11414 SDValue TVal = Op->getOperand(1);
11415 SDValue FVal = Op->getOperand(2);
11416 SDLoc DL(Op);
11417
11418 EVT Ty = Op.getValueType();
11419 if (Ty == MVT::aarch64svcount) {
11420 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
11421 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
11422 SDValue Sel =
11423 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
11424 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
11425 }
11426
11427 if (Ty.isScalableVector()) {
11428 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
11429 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
11430 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
11431 }
11432
11433 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
11434 // FIXME: Ideally this would be the same as above using i1 types, however
11435 // for the moment we can't deal with fixed i1 vector types properly, so
11436 // instead extend the predicate to a result type sized integer vector.
11437 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
11438 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
11439 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
11440 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
11441 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
11442 }
11443
11444 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
11445 // instruction.
11446 if (ISD::isOverflowIntrOpRes(CCVal)) {
11447 // Only lower legal XALUO ops.
11448 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
11449 return SDValue();
11450
11451 AArch64CC::CondCode OFCC;
11452 SDValue Value, Overflow;
11453 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
11454 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
11455
11456 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
11457 CCVal, Overflow);
11458 }
11459
11460 // Lower it the same way as we would lower a SELECT_CC node.
11461 ISD::CondCode CC;
11462 SDValue LHS, RHS;
11463 if (CCVal.getOpcode() == ISD::SETCC) {
11464 LHS = CCVal.getOperand(0);
11465 RHS = CCVal.getOperand(1);
11466 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
11467 } else {
11468 LHS = CCVal;
11469 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
11470 CC = ISD::SETNE;
11471 }
11472
11473 // If we are lowering an f16 and we do not have full fp16 support, convert it
11474 // to an f32 in order to use FCSELSrrr.
11475 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
11476 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
11477 DAG.getUNDEF(MVT::f32), TVal);
11478 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
11479 DAG.getUNDEF(MVT::f32), FVal);
11480 }
11481
11482 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
11483
11484 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
11485 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
11486 }
11487
11488 return Res;
11489}
11490
11491SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
11492 SelectionDAG &DAG) const {
11493 // Jump table entries as PC relative offsets. No additional tweaking
11494 // is necessary here. Just get the address of the jump table.
11495 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
11496
11499 !Subtarget->isTargetMachO())
11500 return getAddrLarge(JT, DAG);
11501 if (CM == CodeModel::Tiny)
11502 return getAddrTiny(JT, DAG);
11503 return getAddr(JT, DAG);
11504}
11505
11506SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
11507 SelectionDAG &DAG) const {
11508 // Jump table entries as PC relative offsets. No additional tweaking
11509 // is necessary here. Just get the address of the jump table.
11510 SDLoc DL(Op);
11511 SDValue JT = Op.getOperand(1);
11512 SDValue Entry = Op.getOperand(2);
11513 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
11514
11515 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11516 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
11517
11518 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
11519 // sequence later, to guarantee the integrity of the intermediate values.
11520 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
11521 "aarch64-jump-table-hardening")) {
11522 CodeModel::Model CM = getTargetMachine().getCodeModel();
11523 if (Subtarget->isTargetMachO()) {
11524 if (CM != CodeModel::Small && CM != CodeModel::Large)
11525 report_fatal_error("Unsupported code-model for hardened jump-table");
11526 } else {
11527 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
11528 assert(Subtarget->isTargetELF() &&
11529 "jump table hardening only supported on MachO/ELF");
11530 if (CM != CodeModel::Small)
11531 report_fatal_error("Unsupported code-model for hardened jump-table");
11532 }
11533
11534 SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
11535 Entry, SDValue());
11536 SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
11537 DAG.getTargetJumpTable(JTI, MVT::i32),
11538 X16Copy.getValue(0), X16Copy.getValue(1));
11539 return SDValue(B, 0);
11540 }
11541
11542 SDNode *Dest =
11543 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
11544 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
11545 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
11546 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
11547}
11548
11549SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
11550 SDValue Chain = Op.getOperand(0);
11551 SDValue Dest = Op.getOperand(1);
11552
11553 // BR_JT is lowered to BRIND, but the lowering below is specific to indirectbr,
11554 // so skip over the jump-table BRINDs, where the destination is JumpTableDest32.
11555 if (Dest->isMachineOpcode() &&
11556 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
11557 return SDValue();
11558
11559 const MachineFunction &MF = DAG.getMachineFunction();
11560 std::optional<uint16_t> BADisc =
11562 if (!BADisc)
11563 return SDValue();
11564
11565 SDLoc DL(Op);
11566
11567 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
11568 SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
11569 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
11570
11571 SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
11572 {Dest, Key, Disc, AddrDisc, Chain});
11573 return SDValue(BrA, 0);
11574}
11575
11576SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
11577 SelectionDAG &DAG) const {
11578 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
11579 CodeModel::Model CM = getTargetMachine().getCodeModel();
11580 if (CM == CodeModel::Large) {
11581 // Use the GOT for the large code model on iOS.
11582 if (Subtarget->isTargetMachO()) {
11583 return getGOT(CP, DAG);
11584 }
11586 return getAddrLarge(CP, DAG);
11587 } else if (CM == CodeModel::Tiny) {
11588 return getAddrTiny(CP, DAG);
11589 }
11590 return getAddr(CP, DAG);
11591}
11592
11593SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
11594 SelectionDAG &DAG) const {
11595 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
11596 const BlockAddress *BA = BAN->getBlockAddress();
11597
11598 if (std::optional<uint16_t> BADisc =
11600 *BA->getFunction())) {
11601 SDLoc DL(Op);
11602
11603 // This isn't cheap, but BRIND is rare.
11604 SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
11605
11606 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
11607
11608 SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
11609 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
11610
11611 SDNode *MOV =
11612 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
11613 {TargetBA, Key, AddrDisc, Disc});
11614 return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
11615 SDValue(MOV, 1));
11616 }
11617
11618 CodeModel::Model CM = getTargetMachine().getCodeModel();
11619 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
11621 return getAddrLarge(BAN, DAG);
11622 } else if (CM == CodeModel::Tiny) {
11623 return getAddrTiny(BAN, DAG);
11624 }
11625 return getAddr(BAN, DAG);
11626}
11627
11628SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
11629 SelectionDAG &DAG) const {
11630 AArch64FunctionInfo *FuncInfo =
11631 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11632
11633 SDLoc DL(Op);
11634 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
11635 getPointerTy(DAG.getDataLayout()));
11636 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
11637 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11638 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
11639 MachinePointerInfo(SV));
11640}
11641
11642SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
11643 SelectionDAG &DAG) const {
11644 MachineFunction &MF = DAG.getMachineFunction();
11645 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
11646
11647 SDLoc DL(Op);
11648 SDValue FR;
11649 if (Subtarget->isWindowsArm64EC()) {
11650 // With the Arm64EC ABI, we compute the address of the varargs save area
11651 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
11652 // but calls from an entry thunk can pass in a different address.
11653 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
11654 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
11655 uint64_t StackOffset;
11656 if (FuncInfo->getVarArgsGPRSize() > 0)
11657 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
11658 else
11659 StackOffset = FuncInfo->getVarArgsStackOffset();
11660 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
11661 DAG.getConstant(StackOffset, DL, MVT::i64));
11662 } else {
11663 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
11664 ? FuncInfo->getVarArgsGPRIndex()
11665 : FuncInfo->getVarArgsStackIndex(),
11666 getPointerTy(DAG.getDataLayout()));
11667 }
11668 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11669 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
11670 MachinePointerInfo(SV));
11671}
11672
11673SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
11674 SelectionDAG &DAG) const {
11675 // The layout of the va_list struct is specified in the AArch64 Procedure Call
11676 // Standard, section B.3.
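// As a reminder of that layout (annotation, not upstream code), the stores
// below populate a structure equivalent to:
//   struct va_list {
//     void *__stack;   // offset 0
//     void *__gr_top;  // offset 8  (4 on ILP32)
//     void *__vr_top;  // offset 16 (8 on ILP32)
//     int   __gr_offs; // offset 24 (12 on ILP32)
//     int   __vr_offs; // offset 28 (16 on ILP32)
//   };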
11677 MachineFunction &MF = DAG.getMachineFunction();
11678 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
11679 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
11680 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
11681 auto PtrVT = getPointerTy(DAG.getDataLayout());
11682 SDLoc DL(Op);
11683
11684 SDValue Chain = Op.getOperand(0);
11685 SDValue VAList = Op.getOperand(1);
11686 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11687 SmallVector<SDValue, 4> MemOps;
11688
11689 // void *__stack at offset 0
11690 unsigned Offset = 0;
11691 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
11692 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
11693 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
11694 MachinePointerInfo(SV), Align(PtrSize)));
11695
11696 // void *__gr_top at offset 8 (4 on ILP32)
11697 Offset += PtrSize;
11698 int GPRSize = FuncInfo->getVarArgsGPRSize();
11699 if (GPRSize > 0) {
11700 SDValue GRTop, GRTopAddr;
11701
11702 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11703 DAG.getConstant(Offset, DL, PtrVT));
11704
11705 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
11706 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
11707 DAG.getSignedConstant(GPRSize, DL, PtrVT));
11708 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
11709
11710 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
11711 MachinePointerInfo(SV, Offset),
11712 Align(PtrSize)));
11713 }
11714
11715 // void *__vr_top at offset 16 (8 on ILP32)
11716 Offset += PtrSize;
11717 int FPRSize = FuncInfo->getVarArgsFPRSize();
11718 if (FPRSize > 0) {
11719 SDValue VRTop, VRTopAddr;
11720 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11721 DAG.getConstant(Offset, DL, PtrVT));
11722
11723 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
11724 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
11725 DAG.getSignedConstant(FPRSize, DL, PtrVT));
11726 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
11727
11728 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
11729 MachinePointerInfo(SV, Offset),
11730 Align(PtrSize)));
11731 }
11732
11733 // int __gr_offs at offset 24 (12 on ILP32)
11734 Offset += PtrSize;
11735 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11736 DAG.getConstant(Offset, DL, PtrVT));
11737 MemOps.push_back(
11738 DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
11739 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
11740
11741 // int __vr_offs at offset 28 (16 on ILP32)
11742 Offset += 4;
11743 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11744 DAG.getConstant(Offset, DL, PtrVT));
11745 MemOps.push_back(
11746 DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
11747 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
11748
11749 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
11750}
11751
11752SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
11753 SelectionDAG &DAG) const {
11754 MachineFunction &MF = DAG.getMachineFunction();
11755 Function &F = MF.getFunction();
11756
11757 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
11758 return LowerWin64_VASTART(Op, DAG);
11759 else if (Subtarget->isTargetDarwin())
11760 return LowerDarwin_VASTART(Op, DAG);
11761 else
11762 return LowerAAPCS_VASTART(Op, DAG);
11763}
11764
11765SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
11766 SelectionDAG &DAG) const {
11767 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has a single
11768 // pointer.
11769 SDLoc DL(Op);
11770 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
11771 unsigned VaListSize =
11772 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
11773 ? PtrSize
11774 : Subtarget->isTargetILP32() ? 20 : 32;
11775 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
11776 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
11777
11778 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
11779 DAG.getConstant(VaListSize, DL, MVT::i32),
11780 Align(PtrSize), false, false, /*CI=*/nullptr,
11781 std::nullopt, MachinePointerInfo(DestSV),
11782 MachinePointerInfo(SrcSV));
11783}
11784
11785SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
11786 assert(Subtarget->isTargetDarwin() &&
11787 "automatic va_arg instruction only works on Darwin");
11788
11789 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11790 EVT VT = Op.getValueType();
11791 SDLoc DL(Op);
11792 SDValue Chain = Op.getOperand(0);
11793 SDValue Addr = Op.getOperand(1);
11794 MaybeAlign Align(Op.getConstantOperandVal(3));
11795 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
11796 auto PtrVT = getPointerTy(DAG.getDataLayout());
11797 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
11798 SDValue VAList =
11799 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
11800 Chain = VAList.getValue(1);
11801 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
11802
11803 if (VT.isScalableVector())
11804 report_fatal_error("Passing SVE types to variadic functions is "
11805 "currently not supported");
11806
11807 if (Align && *Align > MinSlotSize) {
11808 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11809 DAG.getConstant(Align->value() - 1, DL, PtrVT));
11810 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
11811 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
11812 }
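// For illustration (annotation, not upstream code): with a 16-byte aligned
// argument and VAList currently at 0x1008, the two nodes above compute
// (0x1008 + 15) & -16 == 0x1010, the usual round-up-to-alignment idiom.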
11813
11814 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
11815 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
11816
11817 // Scalar integer and FP values smaller than 64 bits are implicitly extended
11818 // up to 64 bits. At the very least, we have to increase the striding of the
11819 // vaargs list to match this, and for FP values we need to introduce
11820 // FP_ROUND nodes as well.
11821 if (VT.isInteger() && !VT.isVector())
11822 ArgSize = std::max(ArgSize, MinSlotSize);
11823 bool NeedFPTrunc = false;
11824 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
11825 ArgSize = 8;
11826 NeedFPTrunc = true;
11827 }
11828
11829 // Increment the pointer, VAList, to the next vaarg
11830 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11831 DAG.getConstant(ArgSize, DL, PtrVT));
11832 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
11833
11834 // Store the incremented VAList to the legalized pointer
11835 SDValue APStore =
11836 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
11837
11838 // Load the actual argument out of the pointer VAList
11839 if (NeedFPTrunc) {
11840 // Load the value as an f64.
11841 SDValue WideFP =
11842 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
11843 // Round the value down to an f32.
11844 SDValue NarrowFP =
11845 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
11846 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
11847 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
11848 // Merge the rounded value with the chain output of the load.
11849 return DAG.getMergeValues(Ops, DL);
11850 }
11851
11852 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
11853}
11854
11855SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
11856 SelectionDAG &DAG) const {
11857 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
11858 MFI.setFrameAddressIsTaken(true);
11859
11860 EVT VT = Op.getValueType();
11861 SDLoc DL(Op);
11862 unsigned Depth = Op.getConstantOperandVal(0);
11863 SDValue FrameAddr =
11864 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
11865 while (Depth--)
11866 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
11867 MachinePointerInfo());
11868
11869 if (Subtarget->isTargetILP32())
11870 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
11871 DAG.getValueType(VT));
11872
11873 return FrameAddr;
11874}
11875
11876SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
11877 SelectionDAG &DAG) const {
11878 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
11879
11880 EVT VT = getPointerTy(DAG.getDataLayout());
11881 SDLoc DL(Op);
11882 int FI = MFI.CreateFixedObject(4, 0, false);
11883 return DAG.getFrameIndex(FI, VT);
11884}
11885
11886#define GET_REGISTER_MATCHER
11887#include "AArch64GenAsmMatcher.inc"
11888
11889// FIXME? Maybe this could be a TableGen attribute on some registers and
11890// this table could be generated automatically from RegInfo.
11891Register AArch64TargetLowering::
11892getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
11893 Register Reg = MatchRegisterName(RegName);
11894 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
11895 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
11896 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
11897 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
11898 !MRI->isReservedReg(MF, Reg))
11899 Reg = 0;
11900 }
11901 if (Reg)
11902 return Reg;
11903 report_fatal_error(Twine("Invalid register name \""
11904 + StringRef(RegName) + "\"."));
11905}
11906
11907SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
11908 SelectionDAG &DAG) const {
11909 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
11910
11911 EVT VT = Op.getValueType();
11912 SDLoc DL(Op);
11913
11914 SDValue FrameAddr =
11915 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
11916 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
11917
11918 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
11919}
11920
11921SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
11922 SelectionDAG &DAG) const {
11923 MachineFunction &MF = DAG.getMachineFunction();
11924 MachineFrameInfo &MFI = MF.getFrameInfo();
11925 MFI.setReturnAddressIsTaken(true);
11926
11927 EVT VT = Op.getValueType();
11928 SDLoc DL(Op);
11929 unsigned Depth = Op.getConstantOperandVal(0);
11930 SDValue ReturnAddress;
11931 if (Depth) {
11932 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
11933 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
11934 ReturnAddress = DAG.getLoad(
11935 VT, DL, DAG.getEntryNode(),
11936 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
11937 } else {
11938 // Return LR, which contains the return address. Mark it an implicit
11939 // live-in.
11940 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
11941 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
11942 }
11943
11944 // The XPACLRI instruction assembles to a hint-space instruction before
11945 // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A architecture.
11946 // On Armv8.3-A and onwards XPACI is available, so use
11947 // that instead.
11948 SDNode *St;
11949 if (Subtarget->hasPAuth()) {
11950 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
11951 } else {
11952 // XPACLRI operates on LR therefore we must move the operand accordingly.
11953 SDValue Chain =
11954 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
11955 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
11956 }
11957 return SDValue(St, 0);
11958}
11959
11960/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which returns two
11961/// i32 values and take a 2 x i32 value to shift plus a shift amount.
11962SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
11963 SelectionDAG &DAG) const {
11964 SDValue Lo, Hi;
11965 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
11966 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
11967}
11968
11969bool AArch64TargetLowering::isOffsetFoldingLegal(
11970 const GlobalAddressSDNode *GA) const {
11971 // Offsets are folded in the DAG combine rather than here so that we can
11972 // intelligently choose an offset based on the uses.
11973 return false;
11974}
11975
11976bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
11977 bool OptForSize) const {
11978 bool IsLegal = false;
11979 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
11980 // 16-bit case when target has full fp16 support.
11981 // We encode bf16 bit patterns as if they were fp16. This results in very
11982 // strange looking assembly but should populate the register with appropriate
11983 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
11984 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
11985 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
11986 // FIXME: We should be able to handle f128 as well with a clever lowering.
11987 const APInt ImmInt = Imm.bitcastToAPInt();
11988 if (VT == MVT::f64)
11989 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
11990 else if (VT == MVT::f32)
11991 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
11992 else if (VT == MVT::f16 || VT == MVT::bf16)
11993 IsLegal =
11994 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
11995 Imm.isPosZero();
11996
11997 // If we cannot materialize the value in an fmov immediate field, check if the
11998 // value can be encoded as the immediate operand of a logical instruction.
11999 // The immediate value will be created with either MOVZ, MOVN, or ORR.
12000 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
12001 // generate that fmov.
12002 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
12003 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
12004 // however the mov+fmov sequence is always better because of the reduced
12005 // cache pressure. The timings are still the same if you consider
12006 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
12007 // movw+movk is fused). So we limit the expansion to at most 2 instructions.
12008 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
12009 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
12010 assert(Insn.size() <= 4 &&
12011 "Should be able to build any value with at most 4 moves");
12012 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
12013 IsLegal = Insn.size() <= Limit;
12014 }
12015
12016 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
12017 << " imm value: "; Imm.dump(););
12018 return IsLegal;
12019}
12020
12021//===----------------------------------------------------------------------===//
12022// AArch64 Optimization Hooks
12023//===----------------------------------------------------------------------===//
12024
12025static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
12026 SDValue Operand, SelectionDAG &DAG,
12027 int &ExtraSteps) {
12028 EVT VT = Operand.getValueType();
12029 if ((ST->hasNEON() &&
12030 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
12031 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
12032 VT == MVT::v4f32)) ||
12033 (ST->hasSVE() &&
12034 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
12035 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
12036 // For the reciprocal estimates, convergence is quadratic, so the number
12037 // of digits is doubled after each iteration. In ARMv8, the accuracy of
12038 // the initial estimate is 2^-8. Thus the number of extra steps to refine
12039 // the result for float (23 mantissa bits) is 2 and for double (52
12040 // mantissa bits) is 3.
12041 constexpr unsigned AccurateBits = 8;
12042 unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
12043 ExtraSteps = DesiredBits <= AccurateBits
12044 ? 0
12045 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
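// Worked numbers for the formula above (annotation, not upstream code):
// with 8 accurate bits in the initial estimate, f32 (24-bit precision)
// needs ceil(log2(24)) - ceil(log2(8)) = 5 - 3 = 2 refinement steps and
// f64 (53-bit precision) needs 6 - 3 = 3, matching the comment above.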
12046 }
12047
12048 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
12049 }
12050
12051 return SDValue();
12052}
12053
12054SDValue
12055AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
12056 const DenormalMode &Mode) const {
12057 SDLoc DL(Op);
12058 EVT VT = Op.getValueType();
12059 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
12060 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
12061 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
12062}
12063
12064SDValue
12065AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
12066 SelectionDAG &DAG) const {
12067 return Op;
12068}
12069
12070SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
12071 SelectionDAG &DAG, int Enabled,
12072 int &ExtraSteps,
12073 bool &UseOneConst,
12074 bool Reciprocal) const {
12075 if (Enabled == ReciprocalEstimate::Enabled ||
12076 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
12077 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
12078 DAG, ExtraSteps)) {
12079 SDLoc DL(Operand);
12080 EVT VT = Operand.getValueType();
12081
12083
12084 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
12085 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
12086 for (int i = ExtraSteps; i > 0; --i) {
12087 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
12088 Flags);
12089 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
12090 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12091 }
12092 if (!Reciprocal)
12093 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
12094
12095 ExtraSteps = 0;
12096 return Estimate;
12097 }
12098
12099 return SDValue();
12100}
12101
12102SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
12103 SelectionDAG &DAG, int Enabled,
12104 int &ExtraSteps) const {
12106 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
12107 DAG, ExtraSteps)) {
12108 SDLoc DL(Operand);
12109 EVT VT = Operand.getValueType();
12110
12112
12113 // Newton reciprocal iteration: E * (2 - X * E)
12114 // AArch64 reciprocal iteration instruction: (2 - M * N)
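// Connecting the two (sketch): with M = Operand and N = Estimate, FRECPS
// produces (2 - Operand * Estimate), and the FMUL by Estimate reproduces
// the Newton step above.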
12115 for (int i = ExtraSteps; i > 0; --i) {
12116 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
12117 Estimate, Flags);
12118 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12119 }
12120
12121 ExtraSteps = 0;
12122 return Estimate;
12123 }
12124
12125 return SDValue();
12126}
12127
12128//===----------------------------------------------------------------------===//
12129// AArch64 Inline Assembly Support
12130//===----------------------------------------------------------------------===//
12131
12132// Table of Constraints
12133// TODO: This is the current set of constraints supported by ARM for the
12134 // compiler; not all of them may make sense.
12135//
12136// r - A general register
12137// w - An FP/SIMD register of some size in the range v0-v31
12138// x - An FP/SIMD register of some size in the range v0-v15
12139// I - Constant that can be used with an ADD instruction
12140// J - Constant that can be used with a SUB instruction
12141// K - Constant that can be used with a 32-bit logical instruction
12142// L - Constant that can be used with a 64-bit logical instruction
12143// M - Constant that can be used as a 32-bit MOV immediate
12144// N - Constant that can be used as a 64-bit MOV immediate
12145// Q - A memory reference with base register and no offset
12146// S - A symbolic address
12147// Y - Floating point constant zero
12148// Z - Integer constant zero
12149//
12150// Note that general register operands will be output using their 64-bit x
12151// register name, whatever the size of the variable, unless the asm operand
12152// is prefixed by the %w modifier. Floating-point and SIMD register operands
12153// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
12154// %q modifier.
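// Illustrative use from inline asm (names and values are placeholders, not
// taken from this file):
//   int Res;
//   asm("add %w0, %w1, %2" : "=r"(Res) : "r"(A), "I"(42));
// Here "r" selects a general register (printed as w0/w1 because of the %w
// modifier) and "I" requires an immediate acceptable to an ADD instruction.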
12155const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
12156 // At this point, we have to lower this constraint to something else, so we
12157 // lower it to an "r" or "w". However, by doing this we will force the result
12158 // to be in a register, while the X constraint is much more permissive.
12159 //
12160 // Although we are correct (we are free to emit anything, without
12161 // constraints), we might break use cases that would expect us to be more
12162 // efficient and emit something else.
12163 if (!Subtarget->hasFPARMv8())
12164 return "r";
12165
12166 if (ConstraintVT.isFloatingPoint())
12167 return "w";
12168
12169 if (ConstraintVT.isVector() &&
12170 (ConstraintVT.getSizeInBits() == 64 ||
12171 ConstraintVT.getSizeInBits() == 128))
12172 return "w";
12173
12174 return "r";
12175}
12176
12178
12179// Returns a {Reg, RegisterClass} tuple if the constraint is
12180// a specific predicate register.
12181//
12182 // For a constraint like "{pn3}", the default path in
12183// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
12184// suitable register class for this register is "PPRorPNR", after which it
12185// determines that nxv16i1 is an appropriate type for the constraint, which is
12186// not what we want. The code here pre-empts this by matching the register
12187// explicitly.
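// Illustrative results of the matching below: "{pn3}" maps to register
// AArch64::PN0 + 3 in PNRRegClass and "{p3}" to AArch64::P0 + 3 in
// PPRRegClass; any other spelling returns std::nullopt and falls back to
// the default handling.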
12188static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
12190 if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
12191 Constraint[1] != 'p')
12192 return std::nullopt;
12193
12194 Constraint = Constraint.substr(2, Constraint.size() - 3);
12195 bool IsPredicateAsCount = Constraint.starts_with("n");
12196 if (IsPredicateAsCount)
12197 Constraint = Constraint.drop_front(1);
12198
12199 unsigned V;
12200 if (Constraint.getAsInteger(10, V) || V > 31)
12201 return std::nullopt;
12202
12203 if (IsPredicateAsCount)
12204 return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
12205 else
12206 return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
12207}
12208
12209static std::optional<PredicateConstraint>
12212 .Case("Uph", PredicateConstraint::Uph)
12213 .Case("Upl", PredicateConstraint::Upl)
12214 .Case("Upa", PredicateConstraint::Upa)
12215 .Default(std::nullopt);
12216}
12217
12218static const TargetRegisterClass *
12220 if (VT != MVT::aarch64svcount &&
12221 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
12222 return nullptr;
12223
12224 switch (Constraint) {
12225 case PredicateConstraint::Uph:
12226 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
12227 : &AArch64::PPR_p8to15RegClass;
12228 case PredicateConstraint::Upl:
12229 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
12230 : &AArch64::PPR_3bRegClass;
12231 case PredicateConstraint::Upa:
12232 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
12233 : &AArch64::PPRRegClass;
12234 }
12235
12236 llvm_unreachable("Missing PredicateConstraint!");
12237}
12238
12240
12241static std::optional<ReducedGprConstraint>
12244 .Case("Uci", ReducedGprConstraint::Uci)
12245 .Case("Ucj", ReducedGprConstraint::Ucj)
12246 .Default(std::nullopt);
12247}
12248
12249static const TargetRegisterClass *
12251 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
12252 return nullptr;
12253
12254 switch (Constraint) {
12255 case ReducedGprConstraint::Uci:
12256 return &AArch64::MatrixIndexGPR32_8_11RegClass;
12257 case ReducedGprConstraint::Ucj:
12258 return &AArch64::MatrixIndexGPR32_12_15RegClass;
12259 }
12260
12261 llvm_unreachable("Missing ReducedGprConstraint!");
12262}
12263
12264 // The set of cc codes supported is from
12265// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
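// Illustrative use (assumed front-end spelling, names are placeholders):
//   asm("cmp %1, %2" : "=@cceq"(Eq) : "r"(A), "r"(B));
// The flag output reaches this lowering as the constraint "{@cceq}", is
// parsed here to AArch64CC::EQ, and is later materialized with a CSINC in
// getSETCC below.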
12268 .Case("{@cchi}", AArch64CC::HI)
12269 .Case("{@cccs}", AArch64CC::HS)
12270 .Case("{@cclo}", AArch64CC::LO)
12271 .Case("{@ccls}", AArch64CC::LS)
12272 .Case("{@cccc}", AArch64CC::LO)
12273 .Case("{@cceq}", AArch64CC::EQ)
12274 .Case("{@ccgt}", AArch64CC::GT)
12275 .Case("{@ccge}", AArch64CC::GE)
12276 .Case("{@cclt}", AArch64CC::LT)
12277 .Case("{@ccle}", AArch64CC::LE)
12278 .Case("{@cchs}", AArch64CC::HS)
12279 .Case("{@ccne}", AArch64CC::NE)
12280 .Case("{@ccvc}", AArch64CC::VC)
12281 .Case("{@ccpl}", AArch64CC::PL)
12282 .Case("{@ccvs}", AArch64CC::VS)
12283 .Case("{@ccmi}", AArch64CC::MI)
12285 return Cond;
12286}
12287
12288/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
12289/// WZR, invert(<cond>)'.
12291 SelectionDAG &DAG) {
12292 return DAG.getNode(
12293 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
12294 DAG.getConstant(0, DL, MVT::i32),
12295 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
12296}
12297
12298// Lower @cc flag output via getSETCC.
12299SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
12300 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
12301 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
12302 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
12303 if (Cond == AArch64CC::Invalid)
12304 return SDValue();
12305 // The output variable should be a scalar integer.
12306 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
12307 OpInfo.ConstraintVT.getSizeInBits() < 8)
12308 report_fatal_error("Flag output operand is of invalid type");
12309
12310 // Get NZCV register. Only update chain when copyfrom is glued.
12311 if (Glue.getNode()) {
12312 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
12313 Chain = Glue.getValue(1);
12314 } else
12315 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
12316 // Extract CC code.
12317 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
12318
12320
12321 // Truncate or ZERO_EXTEND based on value types.
12322 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
12323 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
12324 else
12325 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
12326
12327 return Result;
12328}
12329
12330/// getConstraintType - Given a constraint letter, return the type of
12331/// constraint it is for this target.
12333AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
12334 if (Constraint.size() == 1) {
12335 switch (Constraint[0]) {
12336 default:
12337 break;
12338 case 'x':
12339 case 'w':
12340 case 'y':
12341 return C_RegisterClass;
12342 // An address with a single base register. Due to the way we
12343 // currently handle addresses it is the same as 'r'.
12344 case 'Q':
12345 return C_Memory;
12346 case 'I':
12347 case 'J':
12348 case 'K':
12349 case 'L':
12350 case 'M':
12351 case 'N':
12352 case 'Y':
12353 case 'Z':
12354 return C_Immediate;
12355 case 'z':
12356 case 'S': // A symbol or label reference with a constant offset
12357 return C_Other;
12358 }
12359 } else if (parsePredicateConstraint(Constraint))
12360 return C_RegisterClass;
12361 else if (parseReducedGprConstraint(Constraint))
12362 return C_RegisterClass;
12363 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
12364 return C_Other;
12365 return TargetLowering::getConstraintType(Constraint);
12366}
12367
12368/// Examine constraint type and operand type and determine a weight value.
12369/// This object must already have been set up with the operand type
12370/// and the current alternative constraint selected.
12372AArch64TargetLowering::getSingleConstraintMatchWeight(
12373 AsmOperandInfo &info, const char *constraint) const {
12375 Value *CallOperandVal = info.CallOperandVal;
12376 // If we don't have a value, we can't do a match,
12377 // but allow it at the lowest weight.
12378 if (!CallOperandVal)
12379 return CW_Default;
12380 Type *type = CallOperandVal->getType();
12381 // Look at the constraint type.
12382 switch (*constraint) {
12383 default:
12385 break;
12386 case 'x':
12387 case 'w':
12388 case 'y':
12389 if (type->isFloatingPointTy() || type->isVectorTy())
12390 weight = CW_Register;
12391 break;
12392 case 'z':
12393 weight = CW_Constant;
12394 break;
12395 case 'U':
12396 if (parsePredicateConstraint(constraint) ||
12397 parseReducedGprConstraint(constraint))
12398 weight = CW_Register;
12399 break;
12400 }
12401 return weight;
12402}
12403
12404std::pair<unsigned, const TargetRegisterClass *>
12405AArch64TargetLowering::getRegForInlineAsmConstraint(
12406 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
12407 if (Constraint.size() == 1) {
12408 switch (Constraint[0]) {
12409 case 'r':
12410 if (VT.isScalableVector())
12411 return std::make_pair(0U, nullptr);
12412 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
12413 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
12414 if (VT.getFixedSizeInBits() == 64)
12415 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
12416 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
12417 case 'w': {
12418 if (!Subtarget->hasFPARMv8())
12419 break;
12420 if (VT.isScalableVector()) {
12421 if (VT.getVectorElementType() != MVT::i1)
12422 return std::make_pair(0U, &AArch64::ZPRRegClass);
12423 return std::make_pair(0U, nullptr);
12424 }
12425 if (VT == MVT::Other)
12426 break;
12427 uint64_t VTSize = VT.getFixedSizeInBits();
12428 if (VTSize == 16)
12429 return std::make_pair(0U, &AArch64::FPR16RegClass);
12430 if (VTSize == 32)
12431 return std::make_pair(0U, &AArch64::FPR32RegClass);
12432 if (VTSize == 64)
12433 return std::make_pair(0U, &AArch64::FPR64RegClass);
12434 if (VTSize == 128)
12435 return std::make_pair(0U, &AArch64::FPR128RegClass);
12436 break;
12437 }
12438 // The instructions that this constraint is designed for can
12439 // only take 128-bit registers so just use that regclass.
12440 case 'x':
12441 if (!Subtarget->hasFPARMv8())
12442 break;
12443 if (VT.isScalableVector())
12444 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
12445 if (VT.getSizeInBits() == 128)
12446 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
12447 break;
12448 case 'y':
12449 if (!Subtarget->hasFPARMv8())
12450 break;
12451 if (VT.isScalableVector())
12452 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
12453 break;
12454 }
12455 } else {
12456 if (const auto P = parsePredicateRegAsConstraint(Constraint))
12457 return *P;
12458 if (const auto PC = parsePredicateConstraint(Constraint))
12459 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
12460 return std::make_pair(0U, RegClass);
12461
12462 if (const auto RGC = parseReducedGprConstraint(Constraint))
12463 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
12464 return std::make_pair(0U, RegClass);
12465 }
12466 if (StringRef("{cc}").equals_insensitive(Constraint) ||
12468 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
12469
12470 if (Constraint == "{za}") {
12471 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
12472 }
12473
12474 if (Constraint == "{zt0}") {
12475 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
12476 }
12477
12478 // Use the default implementation in TargetLowering to convert the register
12479 // constraint into a member of a register class.
12480 std::pair<unsigned, const TargetRegisterClass *> Res;
12482
12483 // Not found as a standard register?
12484 if (!Res.second) {
12485 unsigned Size = Constraint.size();
12486 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
12487 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
12488 int RegNo;
12489 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
12490 if (!Failed && RegNo >= 0 && RegNo <= 31) {
12491 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
12492 // By default we'll emit v0-v31 for this unless there's a modifier where
12493 // we'll emit the correct register as well.
12494 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
12495 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
12496 Res.second = &AArch64::FPR64RegClass;
12497 } else {
12498 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
12499 Res.second = &AArch64::FPR128RegClass;
12500 }
12501 }
12502 }
12503 }
12504
12505 if (Res.second && !Subtarget->hasFPARMv8() &&
12506 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
12507 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
12508 return std::make_pair(0U, nullptr);
12509
12510 return Res;
12511}
12512
12514 llvm::Type *Ty,
12515 bool AllowUnknown) const {
12516 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
12517 return EVT(MVT::i64x8);
12518
12519 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
12520}
12521
12522/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
12523/// vector. If it is invalid, don't add anything to Ops.
12524void AArch64TargetLowering::LowerAsmOperandForConstraint(
12525 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
12526 SelectionDAG &DAG) const {
12527 SDValue Result;
12528
12529 // Currently only support length 1 constraints.
12530 if (Constraint.size() != 1)
12531 return;
12532
12533 char ConstraintLetter = Constraint[0];
12534 switch (ConstraintLetter) {
12535 default:
12536 break;
12537
12538 // This set of constraints deals with valid constants for various instructions.
12539 // Validate and return a target constant for them if we can.
12540 case 'z': {
12541 // 'z' maps to xzr or wzr so it needs an input of 0.
12542 if (!isNullConstant(Op))
12543 return;
12544
12545 if (Op.getValueType() == MVT::i64)
12546 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
12547 else
12548 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
12549 break;
12550 }
12551 case 'S':
12552 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
12553 // supported for PIC while "s" isn't, making "s" less useful. We implement
12554 // "S" but not "s".
12556 break;
12557
12558 case 'I':
12559 case 'J':
12560 case 'K':
12561 case 'L':
12562 case 'M':
12563 case 'N':
12564 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
12565 if (!C)
12566 return;
12567
12568 // Grab the value and do some validation.
12569 uint64_t CVal = C->getZExtValue();
12570 switch (ConstraintLetter) {
12571 // The I constraint applies only to simple ADD or SUB immediate operands:
12572 // i.e. 0 to 4095 with optional shift by 12
12573 // The J constraint applies only to ADD or SUB immediates that would be
12574 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
12575 // instruction [or vice versa], in other words -1 to -4095 with optional
12576 // left shift by 12.
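    // For example (illustrative), "I" accepts 42 and 0xabc000 (0xabc << 12),
    // while "J" accepts -42, whose negation fits the same 12-bit encoding.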
12577 case 'I':
12578 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
12579 break;
12580 return;
12581 case 'J': {
12582 uint64_t NVal = -C->getSExtValue();
12583 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
12584 CVal = C->getSExtValue();
12585 break;
12586 }
12587 return;
12588 }
12589 // The K and L constraints apply *only* to logical immediates, including
12590 // what used to be the MOVI alias for ORR (though the MOVI alias has now
12591 // been removed and MOV should be used). So these constraints have to
12592 // distinguish between bit patterns that are valid 32-bit or 64-bit
12593 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
12594 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
12595 // versa.
12596 case 'K':
12597 if (AArch64_AM::isLogicalImmediate(CVal, 32))
12598 break;
12599 return;
12600 case 'L':
12601 if (AArch64_AM::isLogicalImmediate(CVal, 64))
12602 break;
12603 return;
12604 // The M and N constraints are a superset of K and L respectively, for use
12605 // with the MOV (immediate) alias. As well as the logical immediates they
12606 // also match 32 or 64-bit immediates that can be loaded either using a
12607 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
12608 // (M) or 64-bit 0x1234000000000000 (N) etc.
12609 // As a note some of this code is liberally stolen from the asm parser.
12610 case 'M': {
12611 if (!isUInt<32>(CVal))
12612 return;
12613 if (AArch64_AM::isLogicalImmediate(CVal, 32))
12614 break;
12615 if ((CVal & 0xFFFF) == CVal)
12616 break;
12617 if ((CVal & 0xFFFF0000ULL) == CVal)
12618 break;
12619 uint64_t NCVal = ~(uint32_t)CVal;
12620 if ((NCVal & 0xFFFFULL) == NCVal)
12621 break;
12622 if ((NCVal & 0xFFFF0000ULL) == NCVal)
12623 break;
12624 return;
12625 }
12626 case 'N': {
12627 if (AArch64_AM::isLogicalImmediate(CVal, 64))
12628 break;
12629 if ((CVal & 0xFFFFULL) == CVal)
12630 break;
12631 if ((CVal & 0xFFFF0000ULL) == CVal)
12632 break;
12633 if ((CVal & 0xFFFF00000000ULL) == CVal)
12634 break;
12635 if ((CVal & 0xFFFF000000000000ULL) == CVal)
12636 break;
12637 uint64_t NCVal = ~CVal;
12638 if ((NCVal & 0xFFFFULL) == NCVal)
12639 break;
12640 if ((NCVal & 0xFFFF0000ULL) == NCVal)
12641 break;
12642 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
12643 break;
12644 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
12645 break;
12646 return;
12647 }
12648 default:
12649 return;
12650 }
12651
12652 // All assembler immediates are 64-bit integers.
12653 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
12654 break;
12655 }
12656
12657 if (Result.getNode()) {
12658 Ops.push_back(Result);
12659 return;
12660 }
12661
12662 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
12663}
12664
12665//===----------------------------------------------------------------------===//
12666// AArch64 Advanced SIMD Support
12667//===----------------------------------------------------------------------===//
12668
12669/// WidenVector - Given a value in the V64 register class, produce the
12670/// equivalent value in the V128 register class.
12672 EVT VT = V64Reg.getValueType();
12673 unsigned NarrowSize = VT.getVectorNumElements();
12674 MVT EltTy = VT.getVectorElementType().getSimpleVT();
12675 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
12676 SDLoc DL(V64Reg);
12677
12678 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
12679 V64Reg, DAG.getConstant(0, DL, MVT::i64));
12680}
12681
12682/// getExtFactor - Determine the adjustment factor for the position when
12683/// generating an "extract from vector registers" instruction.
12684static unsigned getExtFactor(SDValue &V) {
12685 EVT EltType = V.getValueType().getVectorElementType();
12686 return EltType.getSizeInBits() / 8;
12687}
12688
12689// Check if a vector is built from one vector via extracted elements of
12690// another together with an AND mask, ensuring that all elements fit
12691// within range. This can be reconstructed using AND and NEON's TBL1.
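// Illustrative shape of the match (names are placeholders): a v8i8
// BUILD_VECTOR whose element i is
//   (extract_elt Src, (and (extract_elt MaskVec, i), C_i))
// becomes tbl1(Src, (and MaskVec, build_vector(C_0..C_7))), provided all
// elements use the same Src and MaskVec.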
12693 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
12694 SDLoc dl(Op);
12695 EVT VT = Op.getValueType();
12696 assert(!VT.isScalableVector() &&
12697 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
12698
12699 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
12700 // directly to TBL1.
12701 if (VT != MVT::v16i8 && VT != MVT::v8i8)
12702 return SDValue();
12703
12704 unsigned NumElts = VT.getVectorNumElements();
12705 assert((NumElts == 8 || NumElts == 16) &&
12706 "Need to have exactly 8 or 16 elements in vector.");
12707
12708 SDValue SourceVec;
12709 SDValue MaskSourceVec;
12710 SmallVector<SDValue, 16> AndMaskConstants;
12711
12712 for (unsigned i = 0; i < NumElts; ++i) {
12713 SDValue V = Op.getOperand(i);
12714 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12715 return SDValue();
12716
12717 SDValue OperandSourceVec = V.getOperand(0);
12718 if (!SourceVec)
12719 SourceVec = OperandSourceVec;
12720 else if (SourceVec != OperandSourceVec)
12721 return SDValue();
12722
12723 // This only looks at shuffles with elements that are
12724 // a) truncated by a constant AND mask extracted from a mask vector, or
12725 // b) extracted directly from a mask vector.
12726 SDValue MaskSource = V.getOperand(1);
12727 if (MaskSource.getOpcode() == ISD::AND) {
12728 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
12729 return SDValue();
12730
12731 AndMaskConstants.push_back(MaskSource.getOperand(1));
12732 MaskSource = MaskSource->getOperand(0);
12733 } else if (!AndMaskConstants.empty()) {
12734 // Either all or no operands should have an AND mask.
12735 return SDValue();
12736 }
12737
12738 // An ANY_EXTEND may be inserted between the AND and the source vector
12739 // extraction. We don't care about that, so we can just skip it.
12740 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
12741 MaskSource = MaskSource.getOperand(0);
12742
12743 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12744 return SDValue();
12745
12746 SDValue MaskIdx = MaskSource.getOperand(1);
12747 if (!isa<ConstantSDNode>(MaskIdx) ||
12748 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
12749 return SDValue();
12750
12751 // We only apply this if all elements come from the same vector with the
12752 // same vector type.
12753 if (!MaskSourceVec) {
12754 MaskSourceVec = MaskSource->getOperand(0);
12755 if (MaskSourceVec.getValueType() != VT)
12756 return SDValue();
12757 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
12758 return SDValue();
12759 }
12760 }
12761
12762 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
12763 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
12764 // insert, we know that the index in the mask must be smaller than the number
12765 // of elements in the source, or we would have an out-of-bounds access.
12766 if (NumElts == 8)
12767 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
12768 DAG.getUNDEF(VT));
12769
12770 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
12771 if (!AndMaskConstants.empty())
12772 MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
12773 DAG.getBuildVector(VT, dl, AndMaskConstants));
12774
12775 return DAG.getNode(
12777 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
12778 MaskSourceVec);
12779}
12780
12781// Gather data to see if the operation can be modelled as a
12782// shuffle in combination with VEXTs.
12784 SelectionDAG &DAG) const {
12785 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
12786 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
12787 SDLoc dl(Op);
12788 EVT VT = Op.getValueType();
12789 assert(!VT.isScalableVector() &&
12790 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
12791 unsigned NumElts = VT.getVectorNumElements();
12792
12793 struct ShuffleSourceInfo {
12794 SDValue Vec;
12795 unsigned MinElt;
12796 unsigned MaxElt;
12797
12798 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
12799 // be compatible with the shuffle we intend to construct. As a result
12800 // ShuffleVec will be some sliding window into the original Vec.
12801 SDValue ShuffleVec;
12802
12803 // Code should guarantee that element i in Vec starts at element "WindowBase
12804 // + i * WindowScale in ShuffleVec".
12805 int WindowBase;
12806 int WindowScale;
12807
12808 ShuffleSourceInfo(SDValue Vec)
12809 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
12810 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
12811
12812 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
12813 };
12814
12815 // First gather all vectors used as an immediate source for this BUILD_VECTOR
12816 // node.
12818 for (unsigned i = 0; i < NumElts; ++i) {
12819 SDValue V = Op.getOperand(i);
12820 if (V.isUndef())
12821 continue;
12822 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12823 !isa<ConstantSDNode>(V.getOperand(1)) ||
12824 V.getOperand(0).getValueType().isScalableVector()) {
12825 LLVM_DEBUG(
12826 dbgs() << "Reshuffle failed: "
12827 "a shuffle can only come from building a vector from "
12828 "various elements of other fixed-width vectors, provided "
12829 "their indices are constant\n");
12830 return SDValue();
12831 }
12832
12833 // Add this element source to the list if it's not already there.
12834 SDValue SourceVec = V.getOperand(0);
12835 auto Source = find(Sources, SourceVec);
12836 if (Source == Sources.end())
12837 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
12838
12839 // Update the minimum and maximum lane number seen.
12840 unsigned EltNo = V.getConstantOperandVal(1);
12841 Source->MinElt = std::min(Source->MinElt, EltNo);
12842 Source->MaxElt = std::max(Source->MaxElt, EltNo);
12843 }
12844
12845 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
12846 // better than moving to/from gpr registers for larger vectors.
12847 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
12848 // Construct a mask for the tbl. We may need to adjust the index for types
12849 // larger than i8.
12851 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
12852 for (unsigned I = 0; I < NumElts; ++I) {
12853 SDValue V = Op.getOperand(I);
12854 if (V.isUndef()) {
12855 for (unsigned OF = 0; OF < OutputFactor; OF++)
12856 Mask.push_back(-1);
12857 continue;
12858 }
12859 // Set the Mask lanes adjusted for the size of the input and output
12860 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
12861 // output element, adjusted in their positions per input and output types.
12862 unsigned Lane = V.getConstantOperandVal(1);
12863 for (unsigned S = 0; S < Sources.size(); S++) {
12864 if (V.getOperand(0) == Sources[S].Vec) {
12865 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
12866 unsigned InputBase = 16 * S + Lane * InputSize / 8;
12867 for (unsigned OF = 0; OF < OutputFactor; OF++)
12868 Mask.push_back(InputBase + OF);
12869 break;
12870 }
12871 }
12872 }
12873
12874 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
12875 // v16i8, and the TBLMask
12876 SmallVector<SDValue, 16> TBLOperands;
12877 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
12878 ? Intrinsic::aarch64_neon_tbl3
12879 : Intrinsic::aarch64_neon_tbl4,
12880 dl, MVT::i32));
12881 for (unsigned i = 0; i < Sources.size(); i++) {
12882 SDValue Src = Sources[i].Vec;
12883 EVT SrcVT = Src.getValueType();
12884 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
12885 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
12886 "Expected a legally typed vector");
12887 if (SrcVT.is64BitVector())
12888 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
12889 DAG.getUNDEF(MVT::v8i8));
12890 TBLOperands.push_back(Src);
12891 }
12892
12894 for (unsigned i = 0; i < Mask.size(); i++)
12895 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
12896 assert((Mask.size() == 8 || Mask.size() == 16) &&
12897 "Expected a v8i8 or v16i8 Mask");
12898 TBLOperands.push_back(
12899 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
12900
12901 SDValue Shuffle =
12903 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
12904 return DAG.getBitcast(VT, Shuffle);
12905 }
12906
12907 if (Sources.size() > 2) {
12908 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
12909 << "sensible when at most two source vectors are "
12910 << "involved\n");
12911 return SDValue();
12912 }
12913
12914 // Find out the smallest element size among result and two sources, and use
12915 // it as element size to build the shuffle_vector.
12916 EVT SmallestEltTy = VT.getVectorElementType();
12917 for (auto &Source : Sources) {
12918 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
12919 if (SrcEltTy.bitsLT(SmallestEltTy)) {
12920 SmallestEltTy = SrcEltTy;
12921 }
12922 }
12923 unsigned ResMultiplier =
12924 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
12925 uint64_t VTSize = VT.getFixedSizeInBits();
12926 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
12927 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
12928
12929 // If the source vector is too wide or too narrow, we may nevertheless be able
12930 // to construct a compatible shuffle either by concatenating it with UNDEF or
12931 // extracting a suitable range of elements.
12932 for (auto &Src : Sources) {
12933 EVT SrcVT = Src.ShuffleVec.getValueType();
12934
12935 TypeSize SrcVTSize = SrcVT.getSizeInBits();
12936 if (SrcVTSize == TypeSize::getFixed(VTSize))
12937 continue;
12938
12939 // This stage of the search produces a source with the same element type as
12940 // the original, but with a total width matching the BUILD_VECTOR output.
12941 EVT EltVT = SrcVT.getVectorElementType();
12942 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
12943 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
12944
12945 if (SrcVTSize.getFixedValue() < VTSize) {
12946 assert(2 * SrcVTSize == VTSize);
12947 // We can pad out the smaller vector for free, so if it's part of a
12948 // shuffle...
12949 Src.ShuffleVec =
12950 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
12951 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
12952 continue;
12953 }
12954
12955 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
12956 LLVM_DEBUG(
12957 dbgs() << "Reshuffle failed: result vector too small to extract\n");
12958 return SDValue();
12959 }
12960
12961 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
12962 LLVM_DEBUG(
12963 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
12964 return SDValue();
12965 }
12966
12967 if (Src.MinElt >= NumSrcElts) {
12968 // The extraction can just take the second half
12969 Src.ShuffleVec =
12970 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12971 DAG.getConstant(NumSrcElts, dl, MVT::i64));
12972 Src.WindowBase = -NumSrcElts;
12973 } else if (Src.MaxElt < NumSrcElts) {
12974 // The extraction can just take the first half
12975 Src.ShuffleVec =
12976 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12977 DAG.getConstant(0, dl, MVT::i64));
12978 } else {
12979 // An actual VEXT is needed
12980 SDValue VEXTSrc1 =
12981 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12982 DAG.getConstant(0, dl, MVT::i64));
12983 SDValue VEXTSrc2 =
12984 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12985 DAG.getConstant(NumSrcElts, dl, MVT::i64));
12986 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
12987
12988 if (!SrcVT.is64BitVector()) {
12989 LLVM_DEBUG(
12990 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
12991 "for SVE vectors.");
12992 return SDValue();
12993 }
12994
12995 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
12996 VEXTSrc2,
12997 DAG.getConstant(Imm, dl, MVT::i32));
12998 Src.WindowBase = -Src.MinElt;
12999 }
13000 }
13001
13002 // Another possible incompatibility occurs from the vector element types. We
13003 // can fix this by bitcasting the source vectors to the same type we intend
13004 // for the shuffle.
13005 for (auto &Src : Sources) {
13006 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
13007 if (SrcEltTy == SmallestEltTy)
13008 continue;
13009 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
13010 if (DAG.getDataLayout().isBigEndian()) {
13011 Src.ShuffleVec =
13012 DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
13013 } else {
13014 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
13015 }
13016 Src.WindowScale =
13017 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13018 Src.WindowBase *= Src.WindowScale;
13019 }
13020
13021 // Final check before we try to actually produce a shuffle.
13022 LLVM_DEBUG({
13023 for (auto Src : Sources)
13024 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
13025 });
13026
13027 // The stars all align, our next step is to produce the mask for the shuffle.
13028 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
13029 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
13030 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
13031 SDValue Entry = Op.getOperand(i);
13032 if (Entry.isUndef())
13033 continue;
13034
13035 auto Src = find(Sources, Entry.getOperand(0));
13036 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
13037
13038 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
13039 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
13040 // segment.
13041 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
13042 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
13043 VT.getScalarSizeInBits());
13044 int LanesDefined = BitsDefined / BitsPerShuffleLane;
13045
13046 // This source is expected to fill ResMultiplier lanes of the final shuffle,
13047 // starting at the appropriate offset.
13048 int *LaneMask = &Mask[i * ResMultiplier];
13049
13050 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
13051 ExtractBase += NumElts * (Src - Sources.begin());
13052 for (int j = 0; j < LanesDefined; ++j)
13053 LaneMask[j] = ExtractBase + j;
13054 }
13055
13056 // Final check before we try to produce nonsense...
13057 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
13058 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
13059 return SDValue();
13060 }
13061
13062 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
13063 for (unsigned i = 0; i < Sources.size(); ++i)
13064 ShuffleOps[i] = Sources[i].ShuffleVec;
13065
13066 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
13067 ShuffleOps[1], Mask);
13068 SDValue V;
13069 if (DAG.getDataLayout().isBigEndian()) {
13070 V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
13071 } else {
13072 V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
13073 }
13074
13075 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
13076 dbgs() << "Reshuffle, creating node: "; V.dump(););
13077
13078 return V;
13079}
13080
13081// check if an EXT instruction can handle the shuffle mask when the
13082// vector sources of the shuffle are the same.
13083static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
13084 unsigned NumElts = VT.getVectorNumElements();
13085
13086 // Assume that the first shuffle index is not UNDEF. Fail if it is.
13087 if (M[0] < 0)
13088 return false;
13089
13090 Imm = M[0];
13091
13092 // If this is a VEXT shuffle, the immediate value is the index of the first
13093 // element. The other shuffle indices must be the successive elements after
13094 // the first one.
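  // For example (illustrative), the single-source v8i8 mask
  // <3, 4, 5, 6, 7, 0, 1, 2> is accepted here with Imm = 3.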
13095 unsigned ExpectedElt = Imm;
13096 for (unsigned i = 1; i < NumElts; ++i) {
13097 // Increment the expected index. If it wraps around, just follow it
13098 // back to index zero and keep going.
13099 ++ExpectedElt;
13100 if (ExpectedElt == NumElts)
13101 ExpectedElt = 0;
13102
13103 if (M[i] < 0)
13104 continue; // ignore UNDEF indices
13105 if (ExpectedElt != static_cast<unsigned>(M[i]))
13106 return false;
13107 }
13108
13109 return true;
13110}
13111
13112// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13113// v4i32s. This is really a truncate, which we can construct out of (legal)
13114// concats and truncate nodes.
13116 if (V.getValueType() != MVT::v16i8)
13117 return SDValue();
13118 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
13119
13120 for (unsigned X = 0; X < 4; X++) {
13121 // Check the first item in each group is an extract from lane 0 of a v4i32
13122 // or v4i16.
13123 SDValue BaseExt = V.getOperand(X * 4);
13124 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13125 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
13126 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
13127 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
13128 BaseExt.getConstantOperandVal(1) != 0)
13129 return SDValue();
13130 SDValue Base = BaseExt.getOperand(0);
13131 // And check the other items are extracts from the same vector.
13132 for (unsigned Y = 1; Y < 4; Y++) {
13133 SDValue Ext = V.getOperand(X * 4 + Y);
13134 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13135 Ext.getOperand(0) != Base ||
13136 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
13137 Ext.getConstantOperandVal(1) != Y)
13138 return SDValue();
13139 }
13140 }
13141
13142 // Turn the buildvector into a series of truncates and concats, which will
13143 // become UZP1s. Any v4i32s we found get truncated to v4i16, which are
13144 // concatenated together to produce 2 v8i16s. These are both truncated and
13145 // concatenated together.
13146 SDLoc DL(V);
13147 SDValue Trunc[4] = {
13148 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
13149 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
13150 for (SDValue &V : Trunc)
13151 if (V.getValueType() == MVT::v4i32)
13152 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
13153 SDValue Concat0 =
13154 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
13155 SDValue Concat1 =
13156 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
13157 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
13158 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
13159 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
13160}
13161
13162 /// Check if a vector shuffle corresponds to a DUP instruction with a larger
13163 /// element width than the vector lane type. If that is the case the function
13164 /// returns true and writes the value of the DUP instruction lane operand into
13165 /// DupLaneOp.
13166static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
13167 unsigned &DupLaneOp) {
13168 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
13169 "Only possible block sizes for wide DUP are: 16, 32, 64");
13170
13171 if (BlockSize <= VT.getScalarSizeInBits())
13172 return false;
13173 if (BlockSize % VT.getScalarSizeInBits() != 0)
13174 return false;
13175 if (VT.getSizeInBits() % BlockSize != 0)
13176 return false;
13177
13178 size_t SingleVecNumElements = VT.getVectorNumElements();
13179 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
13180 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
13181
13182 // We are looking for masks like
13183 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
13184 // might be replaced by 'undefined'. BlockElts will eventually contain
13185 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
13186 // for the above examples)
13187 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
13188 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
13189 for (size_t I = 0; I < NumEltsPerBlock; I++) {
13190 int Elt = M[BlockIndex * NumEltsPerBlock + I];
13191 if (Elt < 0)
13192 continue;
13193 // For now we don't support shuffles that use the second operand
13194 if ((unsigned)Elt >= SingleVecNumElements)
13195 return false;
13196 if (BlockElts[I] < 0)
13197 BlockElts[I] = Elt;
13198 else if (BlockElts[I] != Elt)
13199 return false;
13200 }
13201
13202 // We found a candidate block (possibly with some undefs). It must be a
13203 // sequence of consecutive integers starting with a value divisible by
13204 // NumEltsPerBlock with some values possibly replaced by undef-s.
13205
13206 // Find first non-undef element
13207 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
13208 assert(FirstRealEltIter != BlockElts.end() &&
13209 "Shuffle with all-undefs must have been caught by previous cases, "
13210 "e.g. isSplat()");
13211 if (FirstRealEltIter == BlockElts.end()) {
13212 DupLaneOp = 0;
13213 return true;
13214 }
13215
13216 // Index of FirstRealElt in BlockElts
13217 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
13218
13219 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
13220 return false;
13221 // BlockElts[0] must have the following value if it isn't undef:
13222 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
13223
13224 // Check the first element
13225 if (Elt0 % NumEltsPerBlock != 0)
13226 return false;
13227 // Check that the sequence indeed consists of consecutive integers (modulo
13228 // undefs)
13229 for (size_t I = 0; I < NumEltsPerBlock; I++)
13230 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
13231 return false;
13232
13233 DupLaneOp = Elt0 / NumEltsPerBlock;
13234 return true;
13235}
13236
13237// check if an EXT instruction can handle the shuffle mask when the
13238// vector sources of the shuffle are different.
13239static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
13240 unsigned &Imm) {
13241 // Look for the first non-undef element.
13242 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
13243
13244 // Benefit from APInt to handle overflow when calculating the expected element.
13245 unsigned NumElts = VT.getVectorNumElements();
13246 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
13247 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
13248 /*implicitTrunc=*/true);
13249 // The following shuffle indices must be the successive elements after the
13250 // first real element.
13251 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
13252 return Elt != ExpectedElt++ && Elt != -1;
13253 });
13254 if (FoundWrongElt)
13255 return false;
13256
13257 // The index of an EXT is the first element if it is not UNDEF.
13258 // Watch out for the beginning UNDEFs. The EXT index should be the expected
13259 // value of the first element. E.g.
13260 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
13261 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
13262 // ExpectedElt is the last mask index plus 1.
13263 Imm = ExpectedElt.getZExtValue();
13264
13265 // There are two different cases that require reversing the input vectors.
13266 // For example, for vector <4 x i32> we have the following cases,
13267 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
13268 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
13269 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
13270 // to reverse two input vectors.
13271 if (Imm < NumElts)
13272 ReverseEXT = true;
13273 else
13274 Imm -= NumElts;
13275
13276 return true;
13277}
13278
13279/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
13280/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13281/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
13282static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13283 unsigned NumElts = VT.getVectorNumElements();
13284 if (NumElts % 2 != 0)
13285 return false;
13286 WhichResult = (M[0] == 0 ? 0 : 1);
13287 unsigned Idx = WhichResult * NumElts / 2;
13288 for (unsigned i = 0; i != NumElts; i += 2) {
13289 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
13290 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
13291 return false;
13292 Idx += 1;
13293 }
13294
13295 return true;
13296}
13297
13298/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
13299/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13300 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
13301static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13302 unsigned Half = VT.getVectorNumElements() / 2;
13303 WhichResult = (M[0] == 0 ? 0 : 1);
13304 for (unsigned j = 0; j != 2; ++j) {
13305 unsigned Idx = WhichResult;
13306 for (unsigned i = 0; i != Half; ++i) {
13307 int MIdx = M[i + j * Half];
13308 if (MIdx >= 0 && (unsigned)MIdx != Idx)
13309 return false;
13310 Idx += 2;
13311 }
13312 }
13313
13314 return true;
13315}
13316
13317/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
13318/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13319/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
13320static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13321 unsigned NumElts = VT.getVectorNumElements();
13322 if (NumElts % 2 != 0)
13323 return false;
13324 WhichResult = (M[0] == 0 ? 0 : 1);
13325 for (unsigned i = 0; i < NumElts; i += 2) {
13326 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
13327 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
13328 return false;
13329 }
13330 return true;
13331}
13332
13333static bool isINSMask(ArrayRef<int> M, int NumInputElements,
13334 bool &DstIsLeft, int &Anomaly) {
13335 if (M.size() != static_cast<size_t>(NumInputElements))
13336 return false;
13337
13338 int NumLHSMatch = 0, NumRHSMatch = 0;
13339 int LastLHSMismatch = -1, LastRHSMismatch = -1;
13340
13341 for (int i = 0; i < NumInputElements; ++i) {
13342 if (M[i] == -1) {
13343 ++NumLHSMatch;
13344 ++NumRHSMatch;
13345 continue;
13346 }
13347
13348 if (M[i] == i)
13349 ++NumLHSMatch;
13350 else
13351 LastLHSMismatch = i;
13352
13353 if (M[i] == i + NumInputElements)
13354 ++NumRHSMatch;
13355 else
13356 LastRHSMismatch = i;
13357 }
13358
13359 if (NumLHSMatch == NumInputElements - 1) {
13360 DstIsLeft = true;
13361 Anomaly = LastLHSMismatch;
13362 return true;
13363 } else if (NumRHSMatch == NumInputElements - 1) {
13364 DstIsLeft = false;
13365 Anomaly = LastRHSMismatch;
13366 return true;
13367 }
13368
13369 return false;
13370}
13371
13372static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
13373 if (VT.getSizeInBits() != 128)
13374 return false;
13375
13376 unsigned NumElts = VT.getVectorNumElements();
13377
13378 for (int I = 0, E = NumElts / 2; I != E; I++) {
13379 if (Mask[I] != I)
13380 return false;
13381 }
13382
13383 int Offset = NumElts / 2;
13384 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
13385 if (Mask[I] != I + SplitLHS * Offset)
13386 return false;
13387 }
13388
13389 return true;
13390}
13391
13393 SDLoc DL(Op);
13394 EVT VT = Op.getValueType();
13395 SDValue V0 = Op.getOperand(0);
13396 SDValue V1 = Op.getOperand(1);
13397 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
13398
13401 return SDValue();
13402
13403 bool SplitV0 = V0.getValueSizeInBits() == 128;
13404
13405 if (!isConcatMask(Mask, VT, SplitV0))
13406 return SDValue();
13407
13408 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13409 if (SplitV0) {
13410 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
13411 DAG.getConstant(0, DL, MVT::i64));
13412 }
13413 if (V1.getValueSizeInBits() == 128) {
13414 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
13415 DAG.getConstant(0, DL, MVT::i64));
13416 }
13417 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
13418}
13419
13420/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
13421/// the specified operations to build the shuffle. ID is the perfect-shuffle
13422 /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
13423 /// table entry and LHS/RHS are the immediate inputs for this stage of the
13424 /// shuffle.
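/// Sketch of the encoding, as used by the bit extraction and getPFIDLane
/// below: bits [29:26] of PFEntry hold the opcode, bits [25:13] the LHS ID
/// and bits [12:0] the RHS ID; each ID packs a 4-lane mask as base-9 digits,
/// with digit value 8 meaning an undef lane.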
13426 SDValue V2, unsigned PFEntry, SDValue LHS,
13427 SDValue RHS, SelectionDAG &DAG,
13428 const SDLoc &dl) {
13429 unsigned OpNum = (PFEntry >> 26) & 0x0F;
13430 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
13431 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
13432
13433 enum {
13434 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
13435 OP_VREV,
13436 OP_VDUP0,
13437 OP_VDUP1,
13438 OP_VDUP2,
13439 OP_VDUP3,
13440 OP_VEXT1,
13441 OP_VEXT2,
13442 OP_VEXT3,
13443 OP_VUZPL, // VUZP, left result
13444 OP_VUZPR, // VUZP, right result
13445 OP_VZIPL, // VZIP, left result
13446 OP_VZIPR, // VZIP, right result
13447 OP_VTRNL, // VTRN, left result
13448 OP_VTRNR, // VTRN, right result
13449 OP_MOVLANE // Move lane. RHSID is the lane to move into
13450 };
13451
13452 if (OpNum == OP_COPY) {
13453 if (LHSID == (1 * 9 + 2) * 9 + 3)
13454 return LHS;
13455 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
13456 return RHS;
13457 }
13458
13459 if (OpNum == OP_MOVLANE) {
13460 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
13461 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
13462 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
13463 Elt = 3 - Elt;
13464 while (Elt > 0) {
13465 ID /= 9;
13466 Elt--;
13467 }
13468 return (ID % 9 == 8) ? -1 : ID % 9;
13469 };
13470
13471 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
13472 // get the lane to move from the PFID, which is always from the
13473 // original vectors (V1 or V2).
13475 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
13476 EVT VT = OpLHS.getValueType();
13477 assert(RHSID < 8 && "Expected a lane index for RHSID!");
13478 unsigned ExtLane = 0;
13479 SDValue Input;
13480
13481 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
13482 // convert into a higher type.
13483 if (RHSID & 0x4) {
13484 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
13485 if (MaskElt == -1)
13486 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
13487 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
13488 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
13489 Input = MaskElt < 2 ? V1 : V2;
13490 if (VT.getScalarSizeInBits() == 16) {
13491 Input = DAG.getBitcast(MVT::v2f32, Input);
13492 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
13493 } else {
13494 assert(VT.getScalarSizeInBits() == 32 &&
13495 "Expected 16 or 32 bit shuffle elemements");
13496 Input = DAG.getBitcast(MVT::v2f64, Input);
13497 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
13498 }
13499 } else {
13500 int MaskElt = getPFIDLane(ID, RHSID);
13501 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
13502 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
13503 Input = MaskElt < 4 ? V1 : V2;
13504 // Be careful about creating illegal types. Use f16 instead of i16.
13505 if (VT == MVT::v4i16) {
13506 Input = DAG.getBitcast(MVT::v4f16, Input);
13507 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
13508 }
13509 }
13512 Input, DAG.getVectorIdxConstant(ExtLane, dl));
13513 SDValue Ins =
13514 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
13515 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
13516 return DAG.getBitcast(VT, Ins);
13517 }
13518
13519 SDValue OpLHS, OpRHS;
13520 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
13521 RHS, DAG, dl);
13522 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
13523 RHS, DAG, dl);
13524 EVT VT = OpLHS.getValueType();
13525
13526 switch (OpNum) {
13527 default:
13528 llvm_unreachable("Unknown shuffle opcode!");
13529 case OP_VREV:
13530 // VREV divides the vector in half and swaps within the half.
13531 if (VT.getVectorElementType() == MVT::i32 ||
13532 VT.getVectorElementType() == MVT::f32)
13533 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
13534 // vrev <4 x i16> -> REV32
13535 if (VT.getVectorElementType() == MVT::i16 ||
13536 VT.getVectorElementType() == MVT::f16 ||
13537 VT.getVectorElementType() == MVT::bf16)
13538 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
13539 // vrev <4 x i8> -> REV16
13540 assert(VT.getVectorElementType() == MVT::i8);
13541 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
13542 case OP_VDUP0:
13543 case OP_VDUP1:
13544 case OP_VDUP2:
13545 case OP_VDUP3: {
13546 EVT EltTy = VT.getVectorElementType();
13547 unsigned Opcode;
13548 if (EltTy == MVT::i8)
13549 Opcode = AArch64ISD::DUPLANE8;
13550 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
13551 Opcode = AArch64ISD::DUPLANE16;
13552 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
13553 Opcode = AArch64ISD::DUPLANE32;
13554 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
13555 Opcode = AArch64ISD::DUPLANE64;
13556 else
13557 llvm_unreachable("Invalid vector element type?");
13558
13559 if (VT.getSizeInBits() == 64)
13560 OpLHS = WidenVector(OpLHS, DAG);
13561 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
13562 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
13563 }
13564 case OP_VEXT1:
13565 case OP_VEXT2:
13566 case OP_VEXT3: {
13567 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
13568 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
13569 DAG.getConstant(Imm, dl, MVT::i32));
13570 }
13571 case OP_VUZPL:
13572 return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
13573 case OP_VUZPR:
13574 return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
13575 case OP_VZIPL:
13576 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
13577 case OP_VZIPR:
13578 return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
13579 case OP_VTRNL:
13580 return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
13581 case OP_VTRNR:
13582 return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
13583 }
13584}
13585
13587 SelectionDAG &DAG) {
13588 // Check to see if we can use the TBL instruction.
13589 SDValue V1 = Op.getOperand(0);
13590 SDValue V2 = Op.getOperand(1);
13591 SDLoc DL(Op);
13592
13593 EVT EltVT = Op.getValueType().getVectorElementType();
13594 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
13595
13596 bool Swap = false;
13597 if (V1.isUndef() || isZerosVector(V1.getNode())) {
13598 std::swap(V1, V2);
13599 Swap = true;
13600 }
13601
13602 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
13603 // out of range values with 0s. We do need to make sure that any out-of-range
13604 // values are really out-of-range for a v16i8 vector.
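  // For example (illustrative), a v4i16 shuffle <0, 5, 2, 7> with an undef
  // V2 expands each lane into two byte indices; lanes 5 and 7 map to bytes
  // >= IndexLen and are rewritten to 255, so TBL writes zeros there.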
13605 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
13606 MVT IndexVT = MVT::v8i8;
13607 unsigned IndexLen = 8;
13608 if (Op.getValueSizeInBits() == 128) {
13609 IndexVT = MVT::v16i8;
13610 IndexLen = 16;
13611 }
13612
13614 for (int Val : ShuffleMask) {
13615 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
13616 unsigned Offset = Byte + Val * BytesPerElt;
13617 if (Swap)
13618 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
13619 if (IsUndefOrZero && Offset >= IndexLen)
13620 Offset = 255;
13621 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
13622 }
13623 }
13624
13625 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
13626 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
13627
13628 SDValue Shuffle;
13629 if (IsUndefOrZero) {
13630 if (IndexLen == 8)
13631 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
13632 Shuffle = DAG.getNode(
13633 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
13634 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
13635 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
13636 } else {
13637 if (IndexLen == 8) {
13638 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
13639 Shuffle = DAG.getNode(
13640 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
13641 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
13642 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
13643 } else {
13644 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
13645 // cannot currently represent the register constraints on the input
13646 // table registers.
13647 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
13648 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
13649 // IndexLen));
13650 Shuffle = DAG.getNode(
13651 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
13652 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
13653 V2Cst,
13654 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
13655 }
13656 }
13657 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
13658}
13659
13660static unsigned getDUPLANEOp(EVT EltType) {
13661 if (EltType == MVT::i8)
13662 return AArch64ISD::DUPLANE8;
13663 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
13664 return AArch64ISD::DUPLANE16;
13665 if (EltType == MVT::i32 || EltType == MVT::f32)
13666 return AArch64ISD::DUPLANE32;
13667 if (EltType == MVT::i64 || EltType == MVT::f64)
13668 return AArch64ISD::DUPLANE64;
13669
13670 llvm_unreachable("Invalid vector element type?");
13671}
13672
13673static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
13674 unsigned Opcode, SelectionDAG &DAG) {
13675 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
13676 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
13677 // Match: dup (bitcast (extract_subv X, C)), LaneC
13678 if (BitCast.getOpcode() != ISD::BITCAST ||
13679 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
13680 return false;
13681
13682 // The extract index must align in the destination type. That may not
13683 // happen if the bitcast is from narrow to wide type.
13684 SDValue Extract = BitCast.getOperand(0);
13685 unsigned ExtIdx = Extract.getConstantOperandVal(1);
13686 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
13687 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
13688 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
13689 if (ExtIdxInBits % CastedEltBitWidth != 0)
13690 return false;
13691
13692 // Can't handle cases where vector size is not 128-bit
13693 if (!Extract.getOperand(0).getValueType().is128BitVector())
13694 return false;
13695
13696 // Update the lane value by offsetting with the scaled extract index.
13697 LaneC += ExtIdxInBits / CastedEltBitWidth;
13698
13699 // Determine the casted vector type of the wide vector input.
13700 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
13701 // Examples:
13702 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
13703 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
13704 unsigned SrcVecNumElts =
13705 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
13706 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getVectorElementType(),
13707 SrcVecNumElts);
13708 return true;
13709 };
13710 MVT CastVT;
13711 if (getScaledOffsetDup(V, Lane, CastVT)) {
13712 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
13713 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13714 V.getOperand(0).getValueType().is128BitVector()) {
13715 // The lane is incremented by the index of the extract.
13716 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
13717 Lane += V.getConstantOperandVal(1);
13718 V = V.getOperand(0);
13719 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
13720 // The lane is decremented if we are splatting from the 2nd operand.
13721 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
13722 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
13723 Lane -= Idx * VT.getVectorNumElements() / 2;
13724 V = WidenVector(V.getOperand(Idx), DAG);
13725 } else if (VT.getSizeInBits() == 64) {
13726 // Widen the operand to 128-bit register with undef.
13727 V = WidenVector(V, DAG);
13728 }
13729 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
13730}
13731
13732// Try to widen element type to get a new mask value for a better permutation
13733// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
13734// UZP1/2, TRN1/2, REV, INS, etc.
13735// For example:
13736// shufflevector <4 x i32> %a, <4 x i32> %b,
13737// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
13738// is equivalent to:
13739// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
13740// Finally, we can get:
13741// mov v0.d[0], v1.d[1]
13742static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
13743 SDLoc DL(Op);
13744 EVT VT = Op.getValueType();
13745 EVT ScalarVT = VT.getVectorElementType();
13746 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
13747 SDValue V0 = Op.getOperand(0);
13748 SDValue V1 = Op.getOperand(1);
13749 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
13750
13751 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
13752 // We need to make sure the wider element type is legal. Thus, ElementSize
13753 // should be not larger than 32 bits, and i1 type should also be excluded.
13754 if (ElementSize > 32 || ElementSize == 1)
13755 return SDValue();
13756
13757 SmallVector<int, 8> NewMask;
13758 if (widenShuffleMaskElts(Mask, NewMask)) {
13759 MVT NewEltVT = VT.isFloatingPoint()
13760 ? MVT::getFloatingPointVT(ElementSize * 2)
13761 : MVT::getIntegerVT(ElementSize * 2);
13762 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13763 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13764 V0 = DAG.getBitcast(NewVT, V0);
13765 V1 = DAG.getBitcast(NewVT, V1);
13766 return DAG.getBitcast(VT,
13767 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
13768 }
13769 }
13770
13771 return SDValue();
13772}
13773
13774// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
13775static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
13776 ArrayRef<int> ShuffleMask,
13777 SelectionDAG &DAG) {
13778 SDValue Tbl1 = Op->getOperand(0);
13779 SDValue Tbl2 = Op->getOperand(1);
13780 SDLoc dl(Op);
13781 SDValue Tbl2ID =
13782 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
13783
13784 EVT VT = Op.getValueType();
13785 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
13786 Tbl1->getOperand(0) != Tbl2ID ||
13787 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
13788 Tbl2->getOperand(0) != Tbl2ID)
13789 return SDValue();
13790
13791 if (Tbl1->getValueType(0) != MVT::v16i8 ||
13792 Tbl2->getValueType(0) != MVT::v16i8)
13793 return SDValue();
13794
13795 SDValue Mask1 = Tbl1->getOperand(3);
13796 SDValue Mask2 = Tbl2->getOperand(3);
13797 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
13798 for (unsigned I = 0; I < 16; I++) {
13799 if (ShuffleMask[I] < 16)
13800 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
13801 else {
13802 auto *C =
13803 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
13804 if (!C)
13805 return SDValue();
13806 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
13807 }
13808 }
13809
13810 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
13811 SDValue ID =
13812 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
13813
13814 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
13815 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
13816 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
13817}
13818
13819// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
13820// but we don't have an appropriate instruction,
13821// so custom-lower it as ZIP1-with-zeros.
13822SDValue
13823AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
13824 SelectionDAG &DAG) const {
13825 SDLoc dl(Op);
13826 EVT VT = Op.getValueType();
13827 SDValue SrcOp = Op.getOperand(0);
13828 EVT SrcVT = SrcOp.getValueType();
13829 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
13830 "Unexpected extension factor.");
13831 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
13832 // FIXME: support multi-step zipping?
13833 if (Scale != 2)
13834 return SDValue();
13835 SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
13836 return DAG.getBitcast(VT,
13837 DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
13838}
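// Sketch of the lowering above, assuming a v8i16 zero_extend_vector_inreg of
// a v16i8 source (Scale == 2): ZIP1 interleaves the low eight source bytes
// with zero bytes, and the bitcast back to v8i16 then reads each {byte, 0}
// pair as the zero-extended lane value.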
13839
13840SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
13841 SelectionDAG &DAG) const {
13842 SDLoc dl(Op);
13843 EVT VT = Op.getValueType();
13844
13845 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
13846
13847 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13848 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
13849
13850 // Convert shuffles that are directly supported on NEON to target-specific
13851 // DAG nodes, instead of keeping them as shuffles and matching them again
13852 // during code selection. This is more efficient and avoids the possibility
13853 // of inconsistencies between legalization and selection.
13854 ArrayRef<int> ShuffleMask = SVN->getMask();
13855
13856 SDValue V1 = Op.getOperand(0);
13857 SDValue V2 = Op.getOperand(1);
13858
13859 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
13860 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
13861 "Unexpected VECTOR_SHUFFLE mask size!");
13862
13863 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
13864 return Res;
13865
13866 if (SVN->isSplat()) {
13867 int Lane = SVN->getSplatIndex();
13868 // If this is an undef splat, generate it via "just" vdup, if possible.
13869 if (Lane == -1)
13870 Lane = 0;
13871
13872 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
13873 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
13874 V1.getOperand(0));
13875 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
13876 // constant. If so, we can just reference the lane's definition directly.
13877 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
13878 !isa<ConstantSDNode>(V1.getOperand(Lane)))
13879 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
13880
13881 // Otherwise, duplicate from the lane of the input vector.
13882 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
13883 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
13884 }
13885
13886 // Check if the mask matches a DUP for a wider element
13887 for (unsigned LaneSize : {64U, 32U, 16U}) {
13888 unsigned Lane = 0;
13889 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
13890 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
13891 : LaneSize == 32 ? AArch64ISD::DUPLANE32
13892 : AArch64ISD::DUPLANE16;
13893 // Cast V1 to an integer vector with required lane size
13894 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
13895 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
13896 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
13897 V1 = DAG.getBitcast(NewVecTy, V1);
13898 // Construct the DUP instruction
13899 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
13900 // Cast back to the original type
13901 return DAG.getBitcast(VT, V1);
13902 }
13903 }
13904
13905 unsigned NumElts = VT.getVectorNumElements();
13906 unsigned EltSize = VT.getScalarSizeInBits();
13907 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
13908 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1);
13909 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
13910 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1);
13911 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
13912 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1);
13913
13914 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
13915 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
13916 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
13917 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
13918 DAG.getConstant(8, dl, MVT::i32));
13919 }
13920
13921 bool ReverseEXT = false;
13922 unsigned Imm;
13923 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
13924 if (ReverseEXT)
13925 std::swap(V1, V2);
13926 Imm *= getExtFactor(V1);
13927 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
13928 DAG.getConstant(Imm, dl, MVT::i32));
13929 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
13930 Imm *= getExtFactor(V1);
13931 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
13932 DAG.getConstant(Imm, dl, MVT::i32));
13933 }
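// Worked example for the EXT path (assumed mask, not from the code above): a
// v8i16 mask <3,4,5,6,7,8,9,10> is an EXT starting at lane 3; getExtFactor
// scales the lane index by the 2-byte element size, so the node is emitted
// with the byte immediate #6.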
13934
13935 unsigned WhichResult;
13936 if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
13937 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
13938 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13939 }
13940 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
13941 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
13942 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13943 }
13944 if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
13945 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
13946 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13947 }
13948
13949 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13950 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
13951 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13952 }
13953 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13954 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
13955 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13956 }
13957 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13958 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
13959 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13960 }
13961
13962 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
13963 return Concat;
13964
13965 bool DstIsLeft;
13966 int Anomaly;
13967 int NumInputElements = V1.getValueType().getVectorNumElements();
13968 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
13969 SDValue DstVec = DstIsLeft ? V1 : V2;
13970 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
13971
13972 SDValue SrcVec = V1;
13973 int SrcLane = ShuffleMask[Anomaly];
13974 if (SrcLane >= NumInputElements) {
13975 SrcVec = V2;
13976 SrcLane -= NumElts;
13977 }
13978 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
13979
13980 EVT ScalarVT = VT.getVectorElementType();
13981
13982 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
13983 ScalarVT = MVT::i32;
13984
13985 return DAG.getNode(
13986 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
13987 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
13988 DstLaneV);
13989 }
13990
13991 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
13992 return NewSD;
13993
13994 // If the shuffle is not directly supported and it has 4 elements, use
13995 // the PerfectShuffle-generated table to synthesize it from other shuffles.
13996 if (NumElts == 4) {
13997 unsigned PFIndexes[4];
13998 for (unsigned i = 0; i != 4; ++i) {
13999 if (ShuffleMask[i] < 0)
14000 PFIndexes[i] = 8;
14001 else
14002 PFIndexes[i] = ShuffleMask[i];
14003 }
14004
14005 // Compute the index in the perfect shuffle table.
14006 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
14007 PFIndexes[2] * 9 + PFIndexes[3];
14008 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
14009 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
14010 dl);
14011 }
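// For illustration (assumed mask): a 4-element mask <0,4,1,5> gives PFIndexes
// {0,4,1,5} and PFTableIndex = 0*729 + 4*81 + 1*9 + 5 = 338. Undef lanes use
// the sentinel value 8, which is why the table is indexed in base 9.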
14012
14013 // Check for a "select shuffle", generating a BSL to pick between lanes in
14014 // V1/V2.
14015 if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
14016 assert(VT.getScalarSizeInBits() <= 32 &&
14017 "Expected larger vector element sizes to be handled already");
14018 SmallVector<SDValue> MaskElts;
14019 for (int M : ShuffleMask)
14020 MaskElts.push_back(DAG.getConstant(
14021 M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, dl, MVT::i32));
14022 EVT IVT = VT.changeVectorElementTypeToInteger();
14023 SDValue MaskConst = DAG.getBuildVector(IVT, dl, MaskElts);
14024 return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, dl, IVT, MaskConst,
14025 DAG.getBitcast(IVT, V1),
14026 DAG.getBitcast(IVT, V2)));
14027 }
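// For illustration (assumed mask): a v4i32 select mask <0,5,2,7> builds the
// constant <0xffffffff, 0, 0xffffffff, 0>, so the BSP picks lanes 0 and 2
// from V1 and lanes 1 and 3 from V2.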
14028
14029 // Fall back to generating a TBL
14030 return GenerateTBL(Op, ShuffleMask, DAG);
14031}
14032
14033SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
14034 SelectionDAG &DAG) const {
14035 EVT VT = Op.getValueType();
14036
14037 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14038 return LowerToScalableOp(Op, DAG);
14039
14040 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
14041 "Unexpected vector type!");
14042
14043 // We can handle the constant cases during isel.
14044 if (isa<ConstantSDNode>(Op.getOperand(0)))
14045 return Op;
14046
14047 // There isn't a natural way to handle the general i1 case, so we use some
14048 // trickery with whilelo.
14049 SDLoc DL(Op);
14050 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
14051 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
14052 DAG.getValueType(MVT::i1));
14053 SDValue ID =
14054 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
14055 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14056 if (VT == MVT::nxv1i1)
14057 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
14058 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
14059 Zero, SplatVal),
14060 Zero);
14061 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
14062}
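// Sketch of why the whilelo trick above works: the i1 operand is
// sign-extended to i64, giving 0 for false and all-ones for true. whilelo(0,
// 0) yields an all-false predicate, while whilelo(0, 0xffffffffffffffff)
// yields an all-true predicate, which is exactly the splat of the i1 value.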
14063
14064SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
14065 SelectionDAG &DAG) const {
14066 SDLoc DL(Op);
14067
14068 EVT VT = Op.getValueType();
14069 if (!isTypeLegal(VT) || !VT.isScalableVector())
14070 return SDValue();
14071
14072 // Current lowering only supports the SVE-ACLE types.
14073 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
14074 return SDValue();
14075
14076 // The DUPQ operation is independent of element type so normalise to i64s.
14077 SDValue Idx128 = Op.getOperand(2);
14078
14079 // DUPQ can be used when idx is in range.
14080 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
14081 if (CIdx && (CIdx->getZExtValue() <= 3)) {
14082 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
14083 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
14084 }
14085
14086 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
14087
14088 // The ACLE says this must produce the same result as:
14089 // svtbl(data, svadd_x(svptrue_b64(),
14090 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
14091 // index * 2))
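// For illustration, assuming the index is not a constant in [0, 3] and has
// runtime value 4: the masked step vector is <0,1,0,1,...>, Idx64 == 8, and
// the TBL mask becomes <8,9,8,9,...>, i.e. the two i64 halves of 128-bit
// lane 4 are replicated across the result.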
14092 SDValue One = DAG.getConstant(1, DL, MVT::i64);
14093 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
14094
14095 // create the vector 0,1,0,1,...
14096 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
14097 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
14098
14099 // create the vector idx64,idx64+1,idx64,idx64+1,...
14100 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
14101 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
14102 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
14103
14104 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
14105 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
14106 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
14107}
14108
14109
14110static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
14111 APInt &UndefBits) {
14112 EVT VT = BVN->getValueType(0);
14113 APInt SplatBits, SplatUndef;
14114 unsigned SplatBitSize;
14115 bool HasAnyUndefs;
14116 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14117 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
14118
14119 for (unsigned i = 0; i < NumSplats; ++i) {
14120 CnstBits <<= SplatBitSize;
14121 UndefBits <<= SplatBitSize;
14122 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
14123 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
14124 }
14125
14126 return true;
14127 }
14128
14129 return false;
14130}
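// For illustration (assumed splat): a 64-bit v4i16 splat of 0x1234 reports
// SplatBitSize == 16, so the loop above runs four times and accumulates
// CnstBits == 0x1234123412341234, with any undefined bits tracked separately
// in UndefBits.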
14131
14132// Try 64-bit splatted SIMD immediate.
14133static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14134 const APInt &Bits) {
14135 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14136 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14137 EVT VT = Op.getValueType();
14138 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
14139
14140 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
14141 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
14142
14143 SDLoc dl(Op);
14144 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14145 DAG.getConstant(Value, dl, MVT::i32));
14146 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14147 }
14148 }
14149
14150 return SDValue();
14151}
14152
14153// Try 32-bit splatted SIMD immediate.
14154static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14155 const APInt &Bits,
14156 const SDValue *LHS = nullptr) {
14157 EVT VT = Op.getValueType();
14158 if (VT.isFixedLengthVector() &&
14159 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14160 return SDValue();
14161
14162 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14163 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14164 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14165 bool isAdvSIMDModImm = false;
14166 uint64_t Shift;
14167
14168 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
14169 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
14170 Shift = 0;
14171 }
14172 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
14173 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
14174 Shift = 8;
14175 }
14176 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
14177 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
14178 Shift = 16;
14179 }
14180 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
14181 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
14182 Shift = 24;
14183 }
14184
14185 if (isAdvSIMDModImm) {
14186 SDLoc dl(Op);
14187 SDValue Mov;
14188
14189 if (LHS)
14190 Mov = DAG.getNode(NewOp, dl, MovTy,
14191 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
14192 DAG.getConstant(Value, dl, MVT::i32),
14193 DAG.getConstant(Shift, dl, MVT::i32));
14194 else
14195 Mov = DAG.getNode(NewOp, dl, MovTy,
14196 DAG.getConstant(Value, dl, MVT::i32),
14197 DAG.getConstant(Shift, dl, MVT::i32));
14198
14199 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14200 }
14201 }
14202
14203 return SDValue();
14204}
14205
14206// Try 16-bit splatted SIMD immediate.
14207static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14208 const APInt &Bits,
14209 const SDValue *LHS = nullptr) {
14210 EVT VT = Op.getValueType();
14211 if (VT.isFixedLengthVector() &&
14212 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14213 return SDValue();
14214
14215 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14216 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14217 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
14218 bool isAdvSIMDModImm = false;
14219 uint64_t Shift;
14220
14221 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
14222 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
14223 Shift = 0;
14224 }
14225 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
14226 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
14227 Shift = 8;
14228 }
14229
14230 if (isAdvSIMDModImm) {
14231 SDLoc dl(Op);
14232 SDValue Mov;
14233
14234 if (LHS)
14235 Mov = DAG.getNode(NewOp, dl, MovTy,
14236 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
14237 DAG.getConstant(Value, dl, MVT::i32),
14238 DAG.getConstant(Shift, dl, MVT::i32));
14239 else
14240 Mov = DAG.getNode(NewOp, dl, MovTy,
14241 DAG.getConstant(Value, dl, MVT::i32),
14242 DAG.getConstant(Shift, dl, MVT::i32));
14243
14244 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14245 }
14246 }
14247
14248 return SDValue();
14249}
14250
14251// Try 32-bit splatted SIMD immediate with shifted ones.
14252static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
14253 SelectionDAG &DAG, const APInt &Bits) {
14254 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14255 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14256 EVT VT = Op.getValueType();
14257 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14258 bool isAdvSIMDModImm = false;
14259 uint64_t Shift;
14260
14261 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
14262 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
14263 Shift = 264;
14264 }
14265 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
14266 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
14267 Shift = 272;
14268 }
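// Note: 264 and 272 are believed to pack the MSL shift type together with the
// amount, i.e. they select the "shifted ones" forms MSL #8 and MSL #16 that
// correspond to the Type7 (imm8:0xFF) and Type8 (imm8:0xFFFF) per-element
// patterns tested above.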
14269
14270 if (isAdvSIMDModImm) {
14271 SDLoc dl(Op);
14272 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14273 DAG.getConstant(Value, dl, MVT::i32),
14274 DAG.getConstant(Shift, dl, MVT::i32));
14275 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14276 }
14277 }
14278
14279 return SDValue();
14280}
14281
14282// Try 8-bit splatted SIMD immediate.
14283static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14284 const APInt &Bits) {
14285 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14286 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14287 EVT VT = Op.getValueType();
14288 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
14289
14290 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
14291 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
14292
14293 SDLoc dl(Op);
14294 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14295 DAG.getConstant(Value, dl, MVT::i32));
14296 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14297 }
14298 }
14299
14300 return SDValue();
14301}
14302
14303// Try FP splatted SIMD immediate.
14304static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14305 const APInt &Bits) {
14306 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14307 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14308 EVT VT = Op.getValueType();
14309 bool isWide = (VT.getSizeInBits() == 128);
14310 MVT MovTy;
14311 bool isAdvSIMDModImm = false;
14312
14313 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
14314 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
14315 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
14316 }
14317 else if (isWide &&
14318 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
14319 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
14320 MovTy = MVT::v2f64;
14321 }
14322
14323 if (isAdvSIMDModImm) {
14324 SDLoc dl(Op);
14325 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14326 DAG.getConstant(Value, dl, MVT::i32));
14327 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14328 }
14329 }
14330
14331 return SDValue();
14332}
14333
14334// Specialized code to quickly find if PotentialBVec is a BuildVector that
14335// consists of only the same constant int value, returned in reference arg
14336// ConstVal
14337static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
14338 uint64_t &ConstVal) {
14339 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
14340 if (!Bvec)
14341 return false;
14342 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
14343 if (!FirstElt)
14344 return false;
14345 EVT VT = Bvec->getValueType(0);
14346 unsigned NumElts = VT.getVectorNumElements();
14347 for (unsigned i = 1; i < NumElts; ++i)
14348 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
14349 return false;
14350 ConstVal = FirstElt->getZExtValue();
14351 return true;
14352}
14353
14354static bool isAllInactivePredicate(SDValue N) {
14355 // Look through cast.
14356 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
14357 N = N.getOperand(0);
14358
14359 return ISD::isConstantSplatVectorAllZeros(N.getNode());
14360}
14361
14362static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
14363 unsigned NumElts = N.getValueType().getVectorMinNumElements();
14364
14365 // Look through cast.
14366 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
14367 N = N.getOperand(0);
14368 // When reinterpreting from a type with fewer elements the "new" elements
14369 // are not active, so bail if they're likely to be used.
14370 if (N.getValueType().getVectorMinNumElements() < NumElts)
14371 return false;
14372 }
14373
14374 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
14375 return true;
14376
14377 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
14378 // or smaller than the implicit element type represented by N.
14379 // NOTE: A larger element count implies a smaller element type.
14380 if (N.getOpcode() == AArch64ISD::PTRUE &&
14381 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
14382 return N.getValueType().getVectorMinNumElements() >= NumElts;
14383
14384 // If we're compiling for a specific vector-length, we can check if the
14385 // pattern's VL equals that of the scalable vector at runtime.
14386 if (N.getOpcode() == AArch64ISD::PTRUE) {
14387 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14388 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
14389 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
14390 if (MaxSVESize && MinSVESize == MaxSVESize) {
14391 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
14392 unsigned PatNumElts =
14393 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
14394 return PatNumElts == (NumElts * VScale);
14395 }
14396 }
14397
14398 return false;
14399}
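// For illustration, assuming -msve-vector-bits=256 (MinSVESize == MaxSVESize
// == 256, so VScale == 2): a ptrue with pattern vl8 used as an nxv4i1
// predicate (NumElts == 4) has PatNumElts == 8 == 4 * VScale and is therefore
// treated as all active.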
14400
14401// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
14402// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
14403// BUILD_VECTORs with constant element C1, C2 is a constant, and:
14404// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
14405// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
14406// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
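// Worked example (assumed values): for v8i16 and a left shift with C2 == 3,
// the required C1 is the low-bits mask 0x0007, so
// (or (and X, <0x0007,...>), (shl Y, 3)) --> (SLI X, Y, #3),
// which keeps the low 3 bits of each X lane and inserts Y shifted left by 3.
// The SRI form is the mirror image with a high-bits mask (0xE000 here).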
14407static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
14408 EVT VT = N->getValueType(0);
14409
14410 if (!VT.isVector())
14411 return SDValue();
14412
14413 SDLoc DL(N);
14414
14415 SDValue And;
14416 SDValue Shift;
14417
14418 SDValue FirstOp = N->getOperand(0);
14419 unsigned FirstOpc = FirstOp.getOpcode();
14420 SDValue SecondOp = N->getOperand(1);
14421 unsigned SecondOpc = SecondOp.getOpcode();
14422
14423 // Is one of the operands an AND or a BICi? The AND may have been optimised to
14424 // a BICi in order to use an immediate instead of a register.
14425 // Is the other operand a shl or lshr? This will have been turned into:
14426 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
14427 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
14428 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
14429 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
14430 SecondOpc == AArch64ISD::SHL_PRED ||
14431 SecondOpc == AArch64ISD::SRL_PRED)) {
14432 And = FirstOp;
14433 Shift = SecondOp;
14434
14435 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
14436 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
14437 FirstOpc == AArch64ISD::SHL_PRED ||
14438 FirstOpc == AArch64ISD::SRL_PRED)) {
14439 And = SecondOp;
14440 Shift = FirstOp;
14441 } else
14442 return SDValue();
14443
14444 bool IsAnd = And.getOpcode() == ISD::AND;
14445 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
14446 Shift.getOpcode() == AArch64ISD::SRL_PRED;
14447 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
14448 Shift.getOpcode() == AArch64ISD::SRL_PRED;
14449
14450 // Is the shift amount constant and are all lanes active?
14451 uint64_t C2;
14452 if (ShiftHasPredOp) {
14453 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
14454 return SDValue();
14455 APInt C;
14456 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
14457 return SDValue();
14458 C2 = C.getZExtValue();
14459 } else if (ConstantSDNode *C2node =
14460 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
14461 C2 = C2node->getZExtValue();
14462 else
14463 return SDValue();
14464
14465 APInt C1AsAPInt;
14466 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
14467 if (IsAnd) {
14468 // Is the and mask vector all constant?
14469 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
14470 return SDValue();
14471 } else {
14472 // Reconstruct the corresponding AND immediate from the two BICi immediates.
14473 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
14474 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
14475 assert(C1nodeImm && C1nodeShift);
14476 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
14477 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
14478 }
14479
14480 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
14481 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
14482 // how much one can shift elements of a particular size?
14483 if (C2 > ElemSizeInBits)
14484 return SDValue();
14485
14486 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
14487 : APInt::getLowBitsSet(ElemSizeInBits, C2);
14488 if (C1AsAPInt != RequiredC1)
14489 return SDValue();
14490
14491 SDValue X = And.getOperand(0);
14492 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
14493 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
14494 : Shift.getOperand(1);
14495
14496 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
14497 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
14498
14499 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
14500 LLVM_DEBUG(N->dump(&DAG));
14501 LLVM_DEBUG(dbgs() << "into: \n");
14502 LLVM_DEBUG(ResultSLI->dump(&DAG));
14503
14504 ++NumShiftInserts;
14505 return ResultSLI;
14506}
14507
14508SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
14509 SelectionDAG &DAG) const {
14510 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
14511 !Subtarget->isNeonAvailable()))
14512 return LowerToScalableOp(Op, DAG);
14513
14514 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
14515 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
14516 return Res;
14517
14518 EVT VT = Op.getValueType();
14519 if (VT.isScalableVector())
14520 return Op;
14521
14522 SDValue LHS = Op.getOperand(0);
14523 BuildVectorSDNode *BVN =
14524 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
14525 if (!BVN) {
14526 // OR commutes, so try swapping the operands.
14527 LHS = Op.getOperand(1);
14528 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
14529 }
14530 if (!BVN)
14531 return Op;
14532
14533 APInt DefBits(VT.getSizeInBits(), 0);
14534 APInt UndefBits(VT.getSizeInBits(), 0);
14535 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
14536 SDValue NewOp;
14537
14538 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
14539 DefBits, &LHS)) ||
14540 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
14541 DefBits, &LHS)))
14542 return NewOp;
14543
14544 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
14545 UndefBits, &LHS)) ||
14546 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
14547 UndefBits, &LHS)))
14548 return NewOp;
14549 }
14550
14551 // We can always fall back to a non-immediate OR.
14552 return Op;
14553}
14554
14555// Normalize the operands of BUILD_VECTOR. The value of constant operands will
14556// be truncated to fit element width.
14557static SDValue NormalizeBuildVector(SDValue Op,
14558 SelectionDAG &DAG) {
14559 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
14560 SDLoc dl(Op);
14561 EVT VT = Op.getValueType();
14562 EVT EltTy = VT.getVectorElementType();
14563
14564 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
14565 return Op;
14566
14567 SmallVector<SDValue, 16> Ops;
14568 for (SDValue Lane : Op->ops()) {
14569 // For integer vectors, type legalization would have promoted the
14570 // operands already. Otherwise, if Op is a floating-point splat
14571 // (with operands cast to integers), then the only possibilities
14572 // are constants and UNDEFs.
14573 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
14574 Lane = DAG.getConstant(
14575 CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
14576 dl, MVT::i32);
14577 } else if (Lane.getNode()->isUndef()) {
14578 Lane = DAG.getUNDEF(MVT::i32);
14579 } else {
14580 assert(Lane.getValueType() == MVT::i32 &&
14581 "Unexpected BUILD_VECTOR operand type");
14582 }
14583 Ops.push_back(Lane);
14584 }
14585 return DAG.getBuildVector(VT, dl, Ops);
14586}
14587
14588static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
14589 const AArch64Subtarget *ST) {
14590 EVT VT = Op.getValueType();
14591 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
14592 "Expected a legal NEON vector");
14593
14594 APInt DefBits(VT.getSizeInBits(), 0);
14595 APInt UndefBits(VT.getSizeInBits(), 0);
14596 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
14597 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
14598 auto TryMOVIWithBits = [&](APInt DefBits) {
14599 SDValue NewOp;
14600 if ((NewOp =
14601 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
14602 (NewOp =
14603 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
14604 (NewOp =
14605 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
14606 (NewOp =
14607 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
14608 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
14609 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
14610 return NewOp;
14611
14612 APInt NotDefBits = ~DefBits;
14613 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
14614 NotDefBits)) ||
14615 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
14616 NotDefBits)) ||
14617 (NewOp =
14618 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
14619 return NewOp;
14620 return SDValue();
14621 };
14622 if (SDValue R = TryMOVIWithBits(DefBits))
14623 return R;
14624 if (SDValue R = TryMOVIWithBits(UndefBits))
14625 return R;
14626
14627 // See if a fneg of the constant can be materialized with a MOVI, etc
14628 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
14629 // FNegate each sub-element of the constant
14630 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
14631 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
14632 .zext(VT.getSizeInBits());
14633 APInt NegBits(VT.getSizeInBits(), 0);
14634 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
14635 for (unsigned i = 0; i < NumElts; i++)
14636 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
14637 NegBits = DefBits ^ NegBits;
14638
14639 // Try to create the new constants with MOVI, and if so generate a fneg
14640 // for it.
14641 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
14642 SDLoc DL(Op);
14643 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
14644 return DAG.getNode(
14645 AArch64ISD::NVCAST, DL, VT,
14646 DAG.getNode(ISD::FNEG, DL, VFVT,
14647 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
14648 }
14649 return SDValue();
14650 };
14651 SDValue R;
14652 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
14653 (R = TryWithFNeg(DefBits, MVT::f64)) ||
14654 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
14655 return R;
14656 }
14657
14658 return SDValue();
14659}
14660
14661SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
14662 SDValue Op, SelectionDAG &DAG) const {
14663 EVT VT = Op.getValueType();
14664 SDLoc DL(Op);
14665 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
14666 auto *BVN = cast<BuildVectorSDNode>(Op);
14667
14668 if (auto SeqInfo = BVN->isConstantSequence()) {
14669 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
14670 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
14671 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
14672 return convertFromScalableVector(DAG, VT, Seq);
14673 }
14674
14675 unsigned NumElems = VT.getVectorNumElements();
14676 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
14677 NumElems <= 1 || BVN->isConstant())
14678 return SDValue();
14679
14680 auto IsExtractElt = [](SDValue Op) {
14681 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
14682 };
14683
14684 // For integer types that are not already in vectors, limit to at most four
14685 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
14686 if (VT.getScalarType().isInteger() &&
14687 NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
14688 return SDValue();
14689
14690 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
14691 SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
14692 SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
14693 Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
14694 return Op.isUndef() ? Undef
14695 : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
14696 ContainerVT, Undef, Op, ZeroI64);
14697 });
14698
14699 ElementCount ZipEC = ContainerVT.getVectorElementCount();
14700 while (Intermediates.size() > 1) {
14701 EVT ZipVT = getPackedSVEVectorVT(ZipEC);
14702
14703 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
14704 SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
14705 SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
14706 Intermediates[I / 2] =
14707 Op1.isUndef() ? Op0
14708 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
14709 }
14710
14711 Intermediates.resize(Intermediates.size() / 2);
14712 ZipEC = ZipEC.divideCoefficientBy(2);
14713 }
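// Sketch of the zipping rounds above for a four-element build vector
// <a,b,c,d>: each operand first lands in lane 0 of its own register; round
// one zips pairs at the original element width to form {a,b} and {c,d}, and
// round two re-zips those at twice the element width to form {a,b,c,d}.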
14714
14715 assert(Intermediates.size() == 1);
14716 SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
14717 return convertFromScalableVector(DAG, VT, Vec);
14718}
14719
14720SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
14721 SelectionDAG &DAG) const {
14722 EVT VT = Op.getValueType();
14723
14724 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14725 cast<BuildVectorSDNode>(Op)->isConstantSequence();
14726 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
14727 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
14728
14729 // Try to build a simple constant vector.
14730 Op = NormalizeBuildVector(Op, DAG);
14731 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
14732 // abort.
14733 if (Op.getOpcode() != ISD::BUILD_VECTOR)
14734 return SDValue();
14735
14736 // Certain vector constants, used to express things like logical NOT and
14737 // arithmetic NEG, are passed through unmodified. This allows special
14738 // patterns for these operations to match, which will lower these constants
14739 // to whatever is proven necessary.
14740 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
14741 if (BVN->isConstant()) {
14742 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
14743 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
14744 APInt Val(BitSize,
14745 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
14746 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
14747 return Op;
14748 }
14749 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
14750 if (Const->isZero() && !Const->isNegative())
14751 return Op;
14752 }
14753
14754 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
14755 return V;
14756
14757 // Scan through the operands to find some interesting properties we can
14758 // exploit:
14759 // 1) If only one value is used, we can use a DUP, or
14760 // 2) if only the low element is not undef, we can just insert that, or
14761 // 3) if only one constant value is used (w/ some non-constant lanes),
14762 // we can splat the constant value into the whole vector then fill
14763 // in the non-constant lanes.
14764 // 4) FIXME: If different constant values are used, but we can intelligently
14765 // select the values we'll be overwriting for the non-constant
14766 // lanes such that we can directly materialize the vector
14767 // some other way (MOVI, e.g.), we can be sneaky.
14768 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
14769 SDLoc dl(Op);
14770 unsigned NumElts = VT.getVectorNumElements();
14771 bool isOnlyLowElement = true;
14772 bool usesOnlyOneValue = true;
14773 bool usesOnlyOneConstantValue = true;
14774 bool isConstant = true;
14775 bool AllLanesExtractElt = true;
14776 unsigned NumConstantLanes = 0;
14777 unsigned NumDifferentLanes = 0;
14778 unsigned NumUndefLanes = 0;
14779 SDValue Value;
14780 SDValue ConstantValue;
14781 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
14782 unsigned ConsecutiveValCount = 0;
14783 SDValue PrevVal;
14784 for (unsigned i = 0; i < NumElts; ++i) {
14785 SDValue V = Op.getOperand(i);
14786 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14787 AllLanesExtractElt = false;
14788 if (V.isUndef()) {
14789 ++NumUndefLanes;
14790 continue;
14791 }
14792 if (i > 0)
14793 isOnlyLowElement = false;
14794 if (!isIntOrFPConstant(V))
14795 isConstant = false;
14796
14797 if (isIntOrFPConstant(V)) {
14798 ++NumConstantLanes;
14799 if (!ConstantValue.getNode())
14800 ConstantValue = V;
14801 else if (ConstantValue != V)
14802 usesOnlyOneConstantValue = false;
14803 }
14804
14805 if (!Value.getNode())
14806 Value = V;
14807 else if (V != Value) {
14808 usesOnlyOneValue = false;
14809 ++NumDifferentLanes;
14810 }
14811
14812 if (PrevVal != V) {
14813 ConsecutiveValCount = 0;
14814 PrevVal = V;
14815 }
14816
14817 // Keep each different value and its last consecutive count. For example,
14818 //
14819 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
14820 // t24, t24, t24, t24, t24, t24, t24, t24
14821 // t23 = consecutive count 8
14822 // t24 = consecutive count 8
14823 // ------------------------------------------------------------------
14824 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
14825 // t24, t24, t24, t24, t24, t24, t24, t24
14826 // t23 = consecutive count 5
14827 // t24 = consecutive count 9
14828 DifferentValueMap[V] = ++ConsecutiveValCount;
14829 }
14830
14831 if (!Value.getNode()) {
14832 LLVM_DEBUG(
14833 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
14834 return DAG.getUNDEF(VT);
14835 }
14836
14837 // Convert BUILD_VECTOR where all elements but the lowest are undef into
14838 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
14839 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
14840 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
14841 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
14842 "SCALAR_TO_VECTOR node\n");
14843 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
14844 }
14845
14846 if (AllLanesExtractElt) {
14847 SDNode *Vector = nullptr;
14848 bool Even = false;
14849 bool Odd = false;
14850 // Check whether the extract elements match the Even pattern <0,2,4,...> or
14851 // the Odd pattern <1,3,5,...>.
14852 for (unsigned i = 0; i < NumElts; ++i) {
14853 SDValue V = Op.getOperand(i);
14854 const SDNode *N = V.getNode();
14855 if (!isa<ConstantSDNode>(N->getOperand(1))) {
14856 Even = false;
14857 Odd = false;
14858 break;
14859 }
14860 SDValue N0 = N->getOperand(0);
14861
14862 // All elements are extracted from the same vector.
14863 if (!Vector) {
14864 Vector = N0.getNode();
14865 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
14866 // BUILD_VECTOR.
14867 if (VT.getVectorElementType() !=
14868 N0.getValueType().getVectorElementType())
14869 break;
14870 } else if (Vector != N0.getNode()) {
14871 Odd = false;
14872 Even = false;
14873 break;
14874 }
14875
14876 // Extracted values are either at Even indices <0,2,4,...> or at Odd
14877 // indices <1,3,5,...>.
14878 uint64_t Val = N->getConstantOperandVal(1);
14879 if (Val == 2 * i) {
14880 Even = true;
14881 continue;
14882 }
14883 if (Val - 1 == 2 * i) {
14884 Odd = true;
14885 continue;
14886 }
14887
14888 // Something does not match: abort.
14889 Odd = false;
14890 Even = false;
14891 break;
14892 }
14893 if (Even || Odd) {
14894 SDValue LHS =
14895 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
14896 DAG.getConstant(0, dl, MVT::i64));
14897 SDValue RHS =
14898 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
14899 DAG.getConstant(NumElts, dl, MVT::i64));
14900
14901 if (Even && !Odd)
14902 return DAG.getNode(AArch64ISD::UZP1, dl, VT, LHS, RHS);
14903 if (Odd && !Even)
14904 return DAG.getNode(AArch64ISD::UZP2, dl, VT, LHS, RHS);
14905 }
14906 }
14907
14908 // Use DUP for non-constant splats. For f32 constant splats, reduce to
14909 // i32 and try again.
14910 if (usesOnlyOneValue) {
14911 if (!isConstant) {
14912 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14913 Value.getValueType() != VT) {
14914 LLVM_DEBUG(
14915 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
14916 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
14917 }
14918
14919 // This is actually a DUPLANExx operation, which keeps everything vectory.
14920
14921 SDValue Lane = Value.getOperand(1);
14922 Value = Value.getOperand(0);
14923 if (Value.getValueSizeInBits() == 64) {
14924 LLVM_DEBUG(
14925 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
14926 "widening it\n");
14927 Value = WidenVector(Value, DAG);
14928 }
14929
14930 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
14931 return DAG.getNode(Opcode, dl, VT, Value, Lane);
14932 }
14933
14934 if (VT.getVectorElementType().isFloatingPoint()) {
14935 SmallVector<SDValue, 8> Ops;
14936 EVT EltTy = VT.getVectorElementType();
14937 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
14938 EltTy == MVT::f64) && "Unsupported floating-point vector type");
14939 LLVM_DEBUG(
14940 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
14941 "BITCASTS, and try again\n");
14942 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
14943 for (unsigned i = 0; i < NumElts; ++i)
14944 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
14945 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
14946 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
14947 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
14948 Val.dump(););
14949 Val = LowerBUILD_VECTOR(Val, DAG);
14950 if (Val.getNode())
14951 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
14952 }
14953 }
14954
14955 // If we need to insert a small number of different non-constant elements and
14956 // the vector width is sufficiently large, prefer using DUP with the common
14957 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
14958 // skip the constant lane handling below.
14959 bool PreferDUPAndInsert =
14960 !isConstant && NumDifferentLanes >= 1 &&
14961 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
14962 NumDifferentLanes >= NumConstantLanes;
14963
14964 // If there was only one constant value used and for more than one lane,
14965 // start by splatting that value, then replace the non-constant lanes. This
14966 // is better than the default, which will perform a separate initialization
14967 // for each lane.
14968 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
14969 // Firstly, try to materialize the splat constant.
14970 SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue);
14971 unsigned BitSize = VT.getScalarSizeInBits();
14972 APInt ConstantValueAPInt(1, 0);
14973 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
14974 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
14975 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
14976 !ConstantValueAPInt.isAllOnes()) {
14977 Val = ConstantBuildVector(Val, DAG, Subtarget);
14978 if (!Val)
14979 // Otherwise, materialize the constant and splat it.
14980 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
14981 }
14982
14983 // Now insert the non-constant lanes.
14984 for (unsigned i = 0; i < NumElts; ++i) {
14985 SDValue V = Op.getOperand(i);
14986 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
14987 if (!isIntOrFPConstant(V))
14988 // Note that type legalization likely mucked about with the VT of the
14989 // source operand, so we may have to convert it here before inserting.
14990 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
14991 }
14992 return Val;
14993 }
14994
14995 // This will generate a load from the constant pool.
14996 if (isConstant) {
14997 LLVM_DEBUG(
14998 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
14999 "expansion\n");
15000 return SDValue();
15001 }
15002
15003 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
15004 // v4i32s. This is really a truncate, which we can construct out of (legal)
15005 // concats and truncate nodes.
15006 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
15007 return M;
15008
15009 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
15010 if (NumElts >= 4) {
15011 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
15012 return Shuffle;
15013
15014 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
15015 return Shuffle;
15016 }
15017
15018 if (PreferDUPAndInsert) {
15019 // First, build a constant vector with the common element.
15020 SmallVector<SDValue, 8> Ops(NumElts, Value);
15021 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
15022 // Next, insert the elements that do not match the common value.
15023 for (unsigned I = 0; I < NumElts; ++I)
15024 if (Op.getOperand(I) != Value)
15025 NewVector =
15026 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
15027 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
15028
15029 return NewVector;
15030 }
15031
15032 // If vector consists of two different values, try to generate two DUPs and
15033 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
15034 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
15035 SmallVector<SDValue, 2> Vals;
15036 // Check whether the consecutive count of each value is half the number of vector
15037 // elements. In this case, we can use CONCAT_VECTORS. For example,
15038 //
15039 // canUseVECTOR_CONCAT = true;
15040 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15041 // t24, t24, t24, t24, t24, t24, t24, t24
15042 //
15043 // canUseVECTOR_CONCAT = false;
15044 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
15045 // t24, t24, t24, t24, t24, t24, t24, t24
15046 bool canUseVECTOR_CONCAT = true;
15047 for (auto Pair : DifferentValueMap) {
15048 // Check different values have same length which is NumElts / 2.
15049 if (Pair.second != NumElts / 2)
15050 canUseVECTOR_CONCAT = false;
15051 Vals.push_back(Pair.first);
15052 }
15053
15054 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
15055 // CONCAT_VECTORs. For example,
15056 //
15057 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
15058 // t24, t24, t24, t24, t24, t24, t24, t24
15059 // ==>
15060 // t26: v8i8 = AArch64ISD::DUP t23
15061 // t28: v8i8 = AArch64ISD::DUP t24
15062 // t29: v16i8 = concat_vectors t26, t28
15063 if (canUseVECTOR_CONCAT) {
15064 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15065 if (isTypeLegal(SubVT) && SubVT.isVector() &&
15066 SubVT.getVectorNumElements() >= 2) {
15067 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
15068 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
15069 SDValue DUP1 =
15070 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
15071 SDValue DUP2 =
15072 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
15073 SDValue CONCAT_VECTORS =
15074 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
15075 return CONCAT_VECTORS;
15076 }
15077 }
15078
15079 // Let's try to generate VECTOR_SHUFFLE. For example,
15080 //
15081 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
15082 // ==>
15083 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
15084 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
15085 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
15086 if (NumElts >= 8) {
15087 SmallVector<int, 16> MaskVec;
15088 // Build the mask for VECTOR_SHUFFLE.
15089 SDValue FirstLaneVal = Op.getOperand(0);
15090 for (unsigned i = 0; i < NumElts; ++i) {
15091 SDValue Val = Op.getOperand(i);
15092 if (FirstLaneVal == Val)
15093 MaskVec.push_back(i);
15094 else
15095 MaskVec.push_back(i + NumElts);
15096 }
15097
15098 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
15099 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
15100 SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1);
15101 SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2);
15102 SDValue VECTOR_SHUFFLE =
15103 DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec);
15104 return VECTOR_SHUFFLE;
15105 }
15106 }
15107
15108 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
15109 // know the default expansion would otherwise fall back on something even
15110 // worse. For a vector with one or two non-undef values, that's
15111 // scalar_to_vector for the elements followed by a shuffle (provided the
15112 // shuffle is valid for the target) and materialization element by element
15113 // on the stack followed by a load for everything else.
15114 if (!isConstant && !usesOnlyOneValue) {
15115 LLVM_DEBUG(
15116 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
15117 "of INSERT_VECTOR_ELT\n");
15118
15119 SDValue Vec = DAG.getUNDEF(VT);
15120 SDValue Op0 = Op.getOperand(0);
15121 unsigned i = 0;
15122
15123 // Use SCALAR_TO_VECTOR for lane zero to
15124 // a) Avoid a RMW dependency on the full vector register, and
15125 // b) Allow the register coalescer to fold away the copy if the
15126 // value is already in an S or D register, and we're forced to emit an
15127 // INSERT_SUBREG that we can't fold anywhere.
15128 //
15129 // We also allow types like i8 and i16 which are illegal scalar but legal
15130 // vector element types. After type-legalization the inserted value is
15131 // extended (i32) and it is safe to cast it to the vector type by ignoring
15132 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
15133 if (!Op0.isUndef()) {
15134 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
15135 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
15136 ++i;
15137 }
15138 LLVM_DEBUG({
15139 if (i < NumElts)
15140 dbgs() << "Creating nodes for the other vector elements:\n";
15141 });
15142 for (; i < NumElts; ++i) {
15143 SDValue V = Op.getOperand(i);
15144 if (V.isUndef())
15145 continue;
15146 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
15147 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
15148 }
15149 return Vec;
15150 }
15151
15152 LLVM_DEBUG(
15153 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
15154 "better alternative\n");
15155 return SDValue();
15156}
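// Illustrative sketch (editorial addition, not part of the upstream file).
// The VECTOR_SHUFFLE path in LowerBUILD_VECTOR above keeps lane i when operand i
// matches the first-lane value and otherwise points at lane i + NumElts of the
// second splat. A minimal standalone model of that mask-building loop
// (hypothetical helper; plain ints stand in for the SDValue lanes):
#include <vector>
static std::vector<int> buildTwoValueShuffleMask(const std::vector<int> &Lanes) {
  std::vector<int> Mask;
  const unsigned NumElts = Lanes.size();
  for (unsigned i = 0; i < NumElts; ++i)
    Mask.push_back(Lanes[i] == Lanes[0] ? int(i) : int(i + NumElts));
  return Mask; // e.g. {A,A,A,A,B,B,B,B} -> {0,1,2,3,12,13,14,15}
}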
15157
15158SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
15159 SelectionDAG &DAG) const {
15160 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15161 !Subtarget->isNeonAvailable()))
15162 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
15163
15164 assert(Op.getValueType().isScalableVector() &&
15165 isTypeLegal(Op.getValueType()) &&
15166 "Expected legal scalable vector type!");
15167
15168 if (isTypeLegal(Op.getOperand(0).getValueType())) {
15169 unsigned NumOperands = Op->getNumOperands();
15170 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
15171 "Unexpected number of operands in CONCAT_VECTORS");
15172
15173 if (NumOperands == 2)
15174 return Op;
15175
15176 // Concat each pair of subvectors and pack into the lower half of the array.
15177 SmallVector<SDValue> ConcatOps(Op->ops());
15178 while (ConcatOps.size() > 1) {
15179 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
15180 SDValue V1 = ConcatOps[I];
15181 SDValue V2 = ConcatOps[I + 1];
15182 EVT SubVT = V1.getValueType();
15183 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
15184 ConcatOps[I / 2] =
15185 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
15186 }
15187 ConcatOps.resize(ConcatOps.size() / 2);
15188 }
15189 return ConcatOps[0];
15190 }
15191
15192 return SDValue();
15193}
15194
15195SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15196 SelectionDAG &DAG) const {
15197 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
15198
15199 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15200 !Subtarget->isNeonAvailable()))
15201 return LowerFixedLengthInsertVectorElt(Op, DAG);
15202
15203 EVT VT = Op.getOperand(0).getValueType();
15204
15205 if (VT.getScalarType() == MVT::i1) {
15206 EVT VectorVT = getPromotedVTForPredicate(VT);
15207 SDLoc DL(Op);
15208 SDValue ExtendedVector =
15209 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
15210 SDValue ExtendedValue =
15211 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
15212 VectorVT.getScalarType().getSizeInBits() < 32
15213 ? MVT::i32
15214 : VectorVT.getScalarType());
15215 ExtendedVector =
15216 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
15217 ExtendedValue, Op.getOperand(2));
15218 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
15219 }
15220
15221 // Check for non-constant or out of range lane.
15222 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
15223 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15224 return SDValue();
15225
15226 return Op;
15227}
15228
15229SDValue
15230AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
15231 SelectionDAG &DAG) const {
15232 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
15233 EVT VT = Op.getOperand(0).getValueType();
15234
15235 if (VT.getScalarType() == MVT::i1) {
15236 // We can't directly extract from an SVE predicate; extend it first.
15237 // (This isn't the only possible lowering, but it's straightforward.)
15238 EVT VectorVT = getPromotedVTForPredicate(VT);
15239 SDLoc DL(Op);
15240 SDValue Extend =
15241 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
15242 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
15243 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
15244 Extend, Op.getOperand(1));
15245 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
15246 }
15247
15248 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15249 return LowerFixedLengthExtractVectorElt(Op, DAG);
15250
15251 // Check for non-constant or out of range lane.
15252 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
15253 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15254 return SDValue();
15255
15256 // Insertion/extraction are legal for V128 types.
15257 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
15258 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
15259 VT == MVT::v8f16 || VT == MVT::v8bf16)
15260 return Op;
15261
15262 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
15263 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
15264 VT != MVT::v4bf16)
15265 return SDValue();
15266
15267 // For V64 types, we perform extraction by expanding the value
15268 // to a V128 type and perform the extraction on that.
15269 SDLoc DL(Op);
15270 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
15271 EVT WideTy = WideVec.getValueType();
15272
15273 EVT ExtrTy = WideTy.getVectorElementType();
15274 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
15275 ExtrTy = MVT::i32;
15276
15277 // For extractions, we just return the result directly.
15278 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
15279 Op.getOperand(1));
15280}
15281
15282SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
15283 SelectionDAG &DAG) const {
15284 EVT VT = Op.getValueType();
15285 assert(VT.isFixedLengthVector() &&
15286 "Only cases that extract a fixed length vector are supported!");
15287 EVT InVT = Op.getOperand(0).getValueType();
15288
15289 // If we don't have legal types yet, do nothing
15290 if (!isTypeLegal(InVT))
15291 return SDValue();
15292
15293 if (InVT.is128BitVector()) {
15294 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
15295 unsigned Idx = Op.getConstantOperandVal(1);
15296
15297 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
15298 if (Idx == 0)
15299 return Op;
15300
15301 // If this is extracting the upper 64-bits of a 128-bit vector, we match
15302 // that directly.
15303 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
15304 return Op;
15305 }
15306
15307 if (InVT.isScalableVector() ||
15308 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
15309 SDLoc DL(Op);
15310 SDValue Vec = Op.getOperand(0);
15311 SDValue Idx = Op.getOperand(1);
15312
15313 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
15314 if (PackedVT != InVT) {
15315 // Pack input into the bottom part of an SVE register and try again.
15316 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
15317 DAG.getUNDEF(PackedVT), Vec,
15318 DAG.getVectorIdxConstant(0, DL));
15319 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
15320 }
15321
15322 // This will get matched by custom code during ISelDAGToDAG.
15323 if (isNullConstant(Idx))
15324 return Op;
15325
15326 assert(InVT.isScalableVector() && "Unexpected vector type!");
15327 // Move requested subvector to the start of the vector and try again.
15328 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
15329 return convertFromScalableVector(DAG, VT, Splice);
15330 }
15331
15332 return SDValue();
15333}
15334
15335SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
15336 SelectionDAG &DAG) const {
15337 assert(Op.getValueType().isScalableVector() &&
15338 "Only expect to lower inserts into scalable vectors!");
15339
15340 EVT InVT = Op.getOperand(1).getValueType();
15341 unsigned Idx = Op.getConstantOperandVal(2);
15342
15343 SDValue Vec0 = Op.getOperand(0);
15344 SDValue Vec1 = Op.getOperand(1);
15345 SDLoc DL(Op);
15346 EVT VT = Op.getValueType();
15347
15348 if (InVT.isScalableVector()) {
15349 if (!isTypeLegal(VT))
15350 return SDValue();
15351
15352 // Break down insert_subvector into simpler parts.
15353 if (VT.getVectorElementType() == MVT::i1) {
15354 unsigned NumElts = VT.getVectorMinNumElements();
15355 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15356
15357 SDValue Lo, Hi;
15358 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
15359 DAG.getVectorIdxConstant(0, DL));
15360 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
15361 DAG.getVectorIdxConstant(NumElts / 2, DL));
15362 if (Idx < (NumElts / 2))
15363 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
15364 DAG.getVectorIdxConstant(Idx, DL));
15365 else
15366 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
15367 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
15368
15369 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15370 }
15371
15372 // We can select these directly.
15373 if (isTypeLegal(InVT) && Vec0.isUndef())
15374 return Op;
15375
15376 // Ensure the subvector is half the size of the main vector.
15377 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
15378 return SDValue();
15379
15380 // Here narrow and wide refers to the vector element types. After "casting"
15381 // both vectors must have the same bit length and so because the subvector
15382 // has fewer elements, those elements need to be bigger.
15383 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
15384 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
15385
15386 // NOP cast operands to the largest legal vector of the same element count.
15387 if (VT.isFloatingPoint()) {
15388 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
15389 Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
15390 } else {
15391 // Legal integer vectors are already their largest so Vec0 is fine as is.
15392 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
15393 Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
15394 }
15395
15396 // To replace the top/bottom half of vector V with vector SubV we widen the
15397 // preserved half of V, concatenate this to SubV (the order depending on the
15398 // half being replaced) and then narrow the result.
15399 SDValue Narrow;
15400 if (Idx == 0) {
15401 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
15402 HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
15403 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
15404 } else {
15405 assert(Idx == InVT.getVectorMinNumElements() &&
15406 "Invalid subvector index!");
15407 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
15408 LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
15409 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
15410 }
15411
15412 return getSVESafeBitCast(VT, Narrow, DAG);
15413 }
15414
15415 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
15416 // This will be matched by custom code during ISelDAGToDAG.
15417 if (Vec0.isUndef())
15418 return Op;
15419
15420 std::optional<unsigned> PredPattern =
15421 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
15422 auto PredTy = VT.changeVectorElementType(MVT::i1);
15423 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
15424 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
15425 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
15426 }
15427
15428 return SDValue();
15429}
15430
15431static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
15432 if (Op.getOpcode() != AArch64ISD::DUP &&
15433 Op.getOpcode() != ISD::SPLAT_VECTOR &&
15434 Op.getOpcode() != ISD::BUILD_VECTOR)
15435 return false;
15436
15437 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
15438 !isAllConstantBuildVector(Op, SplatVal))
15439 return false;
15440
15441 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
15442 !isa<ConstantSDNode>(Op->getOperand(0)))
15443 return false;
15444
15445 SplatVal = Op->getConstantOperandVal(0);
15446 if (Op.getValueType().getVectorElementType() != MVT::i64)
15447 SplatVal = (int32_t)SplatVal;
15448
15449 Negated = false;
15450 if (isPowerOf2_64(SplatVal))
15451 return true;
15452
15453 Negated = true;
15454 if (isPowerOf2_64(-SplatVal)) {
15455 SplatVal = -SplatVal;
15456 return true;
15457 }
15458
15459 return false;
15460}
15461
15462SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
15463 EVT VT = Op.getValueType();
15464 SDLoc dl(Op);
15465
15466 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
15467 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
15468
15469 assert(VT.isScalableVector() && "Expected a scalable vector.");
15470
15471 bool Signed = Op.getOpcode() == ISD::SDIV;
15472 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
15473
15474 bool Negated;
15475 uint64_t SplatVal;
15476 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
15477 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
15478 SDValue Res =
15479 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
15480 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
15481 if (Negated)
15482 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
15483
15484 return Res;
15485 }
15486
15487 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
15488 return LowerToPredicatedOp(Op, DAG, PredOpcode);
15489
15490 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
15491 // operations, and truncate the result.
15492 EVT WidenedVT;
15493 if (VT == MVT::nxv16i8)
15494 WidenedVT = MVT::nxv8i16;
15495 else if (VT == MVT::nxv8i16)
15496 WidenedVT = MVT::nxv4i32;
15497 else
15498 llvm_unreachable("Unexpected Custom DIV operation");
15499
15500 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
15501 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
15502 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
15503 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
15504 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
15505 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
15506 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
15507 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
15508 SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, dl, VT, ResultLo);
15509 SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, dl, VT, ResultHi);
15510 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLoCast, ResultHiCast);
15511}
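// Illustrative sketch (editorial addition, not part of the upstream file).
// The power-of-two path in LowerDIV above relies on two facts: SVE's ASRD
// (SRAD_MERGE_OP1) performs a signed divide by 2^K that rounds toward zero, and
// X / -(2^K) == -(X / 2^K), hence the extra SUB when Negated is set. A scalar
// model of the rounding divide, assuming 32-bit lanes and 0 <= K <= 30:
#include <cstdint>
static int32_t sdivPow2RoundToZero(int32_t X, unsigned K) {
  // Bias negative inputs by 2^K - 1 so the arithmetic shift rounds toward zero,
  // matching C's '/' for a power-of-two divisor.
  int32_t Bias = (X >> 31) & ((int32_t(1) << K) - 1);
  return (X + Bias) >> K; // == X / (1 << K)
}
static int32_t sdivNegPow2(int32_t X, unsigned K) {
  return -sdivPow2RoundToZero(X, K); // == X / -(1 << K)
}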
15512
15513bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
15514 EVT VT, unsigned DefinedValues) const {
15515 if (!Subtarget->isNeonAvailable())
15516 return false;
15517 return DefinedValues < 3;
15518}
15519
15520bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
15521 // Currently no fixed length shuffles that require SVE are legal.
15522 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15523 return false;
15524
15525 if (VT.getVectorNumElements() == 4 &&
15526 (VT.is128BitVector() || VT.is64BitVector())) {
15527 unsigned Cost = getPerfectShuffleCost(M);
15528 if (Cost <= 1)
15529 return true;
15530 }
15531
15532 bool DummyBool;
15533 int DummyInt;
15534 unsigned DummyUnsigned;
15535
15536 unsigned EltSize = VT.getScalarSizeInBits();
15537 unsigned NumElts = VT.getVectorNumElements();
15538 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
15539 isREVMask(M, EltSize, NumElts, 64) ||
15540 isREVMask(M, EltSize, NumElts, 32) ||
15541 isREVMask(M, EltSize, NumElts, 16) ||
15542 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
15543 isTRNMask(M, NumElts, DummyUnsigned) ||
15544 isUZPMask(M, NumElts, DummyUnsigned) ||
15545 isZIPMask(M, NumElts, DummyUnsigned) ||
15546 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
15547 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
15548 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
15549 isINSMask(M, NumElts, DummyBool, DummyInt) ||
15550 isConcatMask(M, VT, VT.getSizeInBits() == 128));
15551}
15552
15553bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
15554 EVT VT) const {
15555 // Just delegate to the generic legality, clear masks aren't special.
15556 return isShuffleMaskLegal(M, VT);
15557}
15558
15559/// getVShiftImm - Check if this is a valid build_vector for the immediate
15560/// operand of a vector shift operation, where all the elements of the
15561/// build_vector must have the same constant integer value.
15562static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
15563 // Ignore bit_converts.
15564 while (Op.getOpcode() == ISD::BITCAST)
15565 Op = Op.getOperand(0);
15566 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
15567 APInt SplatBits, SplatUndef;
15568 unsigned SplatBitSize;
15569 bool HasAnyUndefs;
15570 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
15571 HasAnyUndefs, ElementBits) ||
15572 SplatBitSize > ElementBits)
15573 return false;
15574 Cnt = SplatBits.getSExtValue();
15575 return true;
15576}
15577
15578/// isVShiftLImm - Check if this is a valid build_vector for the immediate
15579/// operand of a vector shift left operation. That value must be in the range:
15580/// 0 <= Value < ElementBits for a left shift; or
15581/// 0 <= Value <= ElementBits for a long left shift.
15582static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
15583 assert(VT.isVector() && "vector shift count is not a vector type");
15584 int64_t ElementBits = VT.getScalarSizeInBits();
15585 if (!getVShiftImm(Op, ElementBits, Cnt))
15586 return false;
15587 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
15588}
15589
15590/// isVShiftRImm - Check if this is a valid build_vector for the immediate
15591/// operand of a vector shift right operation. The value must be in the range:
15592/// 1 <= Value <= ElementBits for a right shift; or 1 <= Value <= ElementBits/2 for a narrowing right shift.
15593static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
15594 assert(VT.isVector() && "vector shift count is not a vector type");
15595 int64_t ElementBits = VT.getScalarSizeInBits();
15596 if (!getVShiftImm(Op, ElementBits, Cnt))
15597 return false;
15598 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
15599}
15600
15601SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
15602 SelectionDAG &DAG) const {
15603 EVT VT = Op.getValueType();
15604
15605 if (VT.getScalarType() == MVT::i1) {
15606 // Lower i1 truncate to `(x & 1) != 0`.
15607 SDLoc dl(Op);
15608 EVT OpVT = Op.getOperand(0).getValueType();
15609 SDValue Zero = DAG.getConstant(0, dl, OpVT);
15610 SDValue One = DAG.getConstant(1, dl, OpVT);
15611 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
15612 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
15613 }
15614
15615 if (!VT.isVector() || VT.isScalableVector())
15616 return SDValue();
15617
15618 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
15619 !Subtarget->isNeonAvailable()))
15620 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
15621
15622 return SDValue();
15623}
15624
15625// Check if we can lower this SRL to a rounding shift instruction. ResVT is
15626// possibly a truncated type; it tells how many bits of the value are to be
15627// used.
15628static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
15629 SelectionDAG &DAG,
15630 unsigned &ShiftValue,
15631 SDValue &RShOperand) {
15632 if (Shift->getOpcode() != ISD::SRL)
15633 return false;
15634
15635 EVT VT = Shift.getValueType();
15636 assert(VT.isScalableVT());
15637
15638 auto ShiftOp1 =
15639 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
15640 if (!ShiftOp1)
15641 return false;
15642
15643 ShiftValue = ShiftOp1->getZExtValue();
15644 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
15645 return false;
15646
15647 SDValue Add = Shift->getOperand(0);
15648 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
15649 return false;
15650
15651 assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
15652 "ResVT must be truncated or same type as the shift.");
15653 // Check if an overflow can lead to incorrect results.
15654 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
15655 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
15656 return false;
15657
15658 auto AddOp1 =
15659 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
15660 if (!AddOp1)
15661 return false;
15662 uint64_t AddValue = AddOp1->getZExtValue();
15663 if (AddValue != 1ULL << (ShiftValue - 1))
15664 return false;
15665
15666 RShOperand = Add->getOperand(0);
15667 return true;
15668}
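// Illustrative sketch (editorial addition, not part of the upstream file).
// The pattern accepted above is the "round to nearest" right shift: URSHR
// computes (X + (1 << (S - 1))) >> S with enough internal headroom, so it can
// only replace srl(add(X, 1 << (S - 1)), S) when the add provably does not wrap
// in the narrower type (the NoUnsignedWrap / ExtraBits check). Scalar model,
// assuming unsigned 32-bit lanes and 1 <= S <= 32:
#include <cstdint>
static uint32_t roundingShiftRight(uint32_t X, unsigned S) {
  uint64_t Widened = uint64_t(X) + (uint64_t(1) << (S - 1)); // cannot wrap in 64 bits
  return uint32_t(Widened >> S);
}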
15669
15670SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
15671 SelectionDAG &DAG) const {
15672 EVT VT = Op.getValueType();
15673 SDLoc DL(Op);
15674 int64_t Cnt;
15675
15676 if (!Op.getOperand(1).getValueType().isVector())
15677 return Op;
15678 unsigned EltSize = VT.getScalarSizeInBits();
15679
15680 switch (Op.getOpcode()) {
15681 case ISD::SHL:
15682 if (VT.isScalableVector() ||
15683 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15684 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
15685
15686 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
15687 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
15688 DAG.getConstant(Cnt, DL, MVT::i32));
15689 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
15690 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
15691 MVT::i32),
15692 Op.getOperand(0), Op.getOperand(1));
15693 case ISD::SRA:
15694 case ISD::SRL:
15695 if (VT.isScalableVector() &&
15696 (Subtarget->hasSVE2() ||
15697 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
15698 SDValue RShOperand;
15699 unsigned ShiftValue;
15700 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
15701 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
15702 getPredicateForVector(DAG, DL, VT), RShOperand,
15703 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
15704 }
15705
15706 if (VT.isScalableVector() ||
15707 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
15708 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
15709 : AArch64ISD::SRL_PRED;
15710 return LowerToPredicatedOp(Op, DAG, Opc);
15711 }
15712
15713 // Right shift immediate
15714 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
15715 unsigned Opc =
15716 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
15717 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
15718 DAG.getConstant(Cnt, DL, MVT::i32), Op->getFlags());
15719 }
15720
15721 // Right shift register. Note, there is not a shift right register
15722 // instruction, but the shift left register instruction takes a signed
15723 // value, where negative numbers specify a right shift.
15724 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
15725 : Intrinsic::aarch64_neon_ushl;
15726 // negate the shift amount
15727 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
15728 Op.getOperand(1));
15729 SDValue NegShiftLeft =
15730 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
15731 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
15732 NegShift);
15733 return NegShiftLeft;
15734 }
15735
15736 llvm_unreachable("unexpected shift opcode");
15737}
15738
15739static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
15740 AArch64CC::CondCode CC, bool NoNans, EVT VT,
15741 const SDLoc &dl, SelectionDAG &DAG) {
15742 EVT SrcVT = LHS.getValueType();
15743 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
15744 "function only supposed to emit natural comparisons");
15745
15746 APInt SplatValue;
15747 APInt SplatUndef;
15748 unsigned SplatBitSize = 0;
15749 bool HasAnyUndefs;
15750
15751 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
15752 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
15753 SplatBitSize, HasAnyUndefs);
15754
15755 bool IsZero = IsCnst && SplatValue == 0;
15756 bool IsOne =
15757 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
15758 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
15759
15760 if (SrcVT.getVectorElementType().isFloatingPoint()) {
15761 switch (CC) {
15762 default:
15763 return SDValue();
15764 case AArch64CC::NE: {
15765 SDValue Fcmeq;
15766 if (IsZero)
15767 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
15768 else
15769 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
15770 return DAG.getNOT(dl, Fcmeq, VT);
15771 }
15772 case AArch64CC::EQ:
15773 if (IsZero)
15774 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
15775 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
15776 case AArch64CC::GE:
15777 if (IsZero)
15778 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
15779 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
15780 case AArch64CC::GT:
15781 if (IsZero)
15782 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
15783 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
15784 case AArch64CC::LE:
15785 if (!NoNans)
15786 return SDValue();
15787 // If we ignore NaNs then we can use the LS implementation.
15788 [[fallthrough]];
15789 case AArch64CC::LS:
15790 if (IsZero)
15791 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
15792 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
15793 case AArch64CC::LT:
15794 if (!NoNans)
15795 return SDValue();
15796 // If we ignore NaNs then we can use the MI implementation.
15797 [[fallthrough]];
15798 case AArch64CC::MI:
15799 if (IsZero)
15800 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
15801 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
15802 }
15803 }
15804
15805 switch (CC) {
15806 default:
15807 return SDValue();
15808 case AArch64CC::NE: {
15809 SDValue Cmeq;
15810 if (IsZero)
15811 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
15812 else
15813 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
15814 return DAG.getNOT(dl, Cmeq, VT);
15815 }
15816 case AArch64CC::EQ:
15817 if (IsZero)
15818 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
15819 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
15820 case AArch64CC::GE:
15821 if (IsZero)
15822 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
15823 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
15824 case AArch64CC::GT:
15825 if (IsZero)
15826 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
15827 if (IsMinusOne)
15828 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
15829 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
15830 case AArch64CC::LE:
15831 if (IsZero)
15832 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
15833 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
15834 case AArch64CC::LS:
15835 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
15836 case AArch64CC::LO:
15837 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
15838 case AArch64CC::LT:
15839 if (IsZero)
15840 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
15841 if (IsOne)
15842 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
15843 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
15844 case AArch64CC::HI:
15845 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
15846 case AArch64CC::HS:
15847 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
15848 }
15849}
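// Illustrative sketch (editorial addition, not part of the upstream file).
// The IsMinusOne / IsOne folds above rest on two signed-integer identities,
// which is why a compare against a splat of -1 or +1 can reuse the
// compare-against-zero forms (CMGEz / CMLEz):
#include <cassert>
static void checkCompareFolds(int X) {
  assert((X > -1) == (X >= 0)); // GT vs. -1  ==>  CMGEz
  assert((X < 1) == (X <= 0));  // LT vs. +1  ==>  CMLEz
}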
15850
15851SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
15852 SelectionDAG &DAG) const {
15853 if (Op.getValueType().isScalableVector())
15854 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
15855
15856 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
15857 !Subtarget->isNeonAvailable()))
15858 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
15859
15860 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15861 SDValue LHS = Op.getOperand(0);
15862 SDValue RHS = Op.getOperand(1);
15863 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
15864 SDLoc dl(Op);
15865
15866 if (LHS.getValueType().getVectorElementType().isInteger()) {
15867 assert(LHS.getValueType() == RHS.getValueType());
15868 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
15869 SDValue Cmp =
15870 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
15871 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
15872 }
15873
15874 // Lower isnan(x) | isnan(never-nan) to x != x.
15875 // Lower !isnan(x) & !isnan(never-nan) to x == x.
15876 if (CC == ISD::SETUO || CC == ISD::SETO) {
15877 bool OneNaN = false;
15878 if (LHS == RHS) {
15879 OneNaN = true;
15880 } else if (DAG.isKnownNeverNaN(RHS)) {
15881 OneNaN = true;
15882 RHS = LHS;
15883 } else if (DAG.isKnownNeverNaN(LHS)) {
15884 OneNaN = true;
15885 LHS = RHS;
15886 }
15887 if (OneNaN) {
15888 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
15889 }
15890 }
15891
15892 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
15893
15894 // Make v4f16 (only) fcmp operations utilise vector instructions
15895 // v8f16 support will be a little more complicated
15896 if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
15897 LHS.getValueType().getVectorElementType() == MVT::bf16) {
15898 if (LHS.getValueType().getVectorNumElements() == 4) {
15899 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
15900 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
15901 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
15902 DAG.ReplaceAllUsesWith(Op, NewSetcc);
15903 CmpVT = MVT::v4i32;
15904 } else
15905 return SDValue();
15906 }
15907
15908 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
15909 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
15910 LHS.getValueType().getVectorElementType() != MVT::f128);
15911
15912 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
15913 // clean. Some of them require two branches to implement.
15914 AArch64CC::CondCode CC1, CC2;
15915 bool ShouldInvert;
15916 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
15917
15918 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
15919 SDValue Cmp =
15920 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
15921 if (!Cmp.getNode())
15922 return SDValue();
15923
15924 if (CC2 != AArch64CC::AL) {
15925 SDValue Cmp2 =
15926 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
15927 if (!Cmp2.getNode())
15928 return SDValue();
15929
15930 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
15931 }
15932
15933 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
15934
15935 if (ShouldInvert)
15936 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
15937
15938 return Cmp;
15939}
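// Illustrative sketch (editorial addition, not part of the upstream file).
// The SETUO/SETO fold above uses the fact that, once one operand is known never
// to be NaN, an ordered or unordered compare collapses to a self-compare:
// isnan(x) == (x != x) and !isnan(x) == (x == x).
#include <cmath>
static bool isNaNViaSelfCompare(float X) { return X != X; }     // same as std::isnan(X)
static bool isOrderedViaSelfCompare(float X) { return X == X; } // same as !std::isnan(X)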
15940
15941static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
15942 SelectionDAG &DAG) {
15943 SDValue VecOp = ScalarOp.getOperand(0);
15944 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
15945 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
15946 DAG.getConstant(0, DL, MVT::i64));
15947}
15948
15949static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
15950 SDLoc DL, SelectionDAG &DAG) {
15951 unsigned ScalarOpcode;
15952 switch (Opcode) {
15953 case ISD::VECREDUCE_AND:
15954 ScalarOpcode = ISD::AND;
15955 break;
15956 case ISD::VECREDUCE_OR:
15957 ScalarOpcode = ISD::OR;
15958 break;
15959 case ISD::VECREDUCE_XOR:
15960 ScalarOpcode = ISD::XOR;
15961 break;
15962 default:
15963 llvm_unreachable("Expected bitwise vector reduction");
15964 return SDValue();
15965 }
15966
15967 EVT VecVT = Vec.getValueType();
15968 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
15969 "Expected power-of-2 length vector");
15970
15971 EVT ElemVT = VecVT.getVectorElementType();
15972
15973 SDValue Result;
15974 unsigned NumElems = VecVT.getVectorNumElements();
15975
15976 // Special case for boolean reductions
15977 if (ElemVT == MVT::i1) {
15978 // Split large vectors into smaller ones
15979 if (NumElems > 16) {
15980 SDValue Lo, Hi;
15981 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
15982 EVT HalfVT = Lo.getValueType();
15983 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
15984 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
15985 }
15986
15987 // Results of setcc operations get widened to 128 bits if their input
15988 // operands are 128 bits wide, otherwise vectors that are less than 64 bits
15989 // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
15990 // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
15991 // size leads to the best codegen, since e.g. setcc results might need to be
15992 // truncated otherwise.
15993 unsigned ExtendedWidth = 64;
15994 if (Vec.getOpcode() == ISD::SETCC &&
15995 Vec.getOperand(0).getValueSizeInBits() >= 128) {
15996 ExtendedWidth = 128;
15997 }
15998 EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
15999
16000 // any_ext doesn't work with umin/umax, so only use it for uadd.
16001 unsigned ExtendOp =
16002 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
16003 SDValue Extended = DAG.getNode(
16004 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
16005 // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
16006 // in that case we bitcast the sign extended values from v2i64 to v4i32
16007 // before reduction for optimal code generation.
16008 if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
16009 NumElems == 2 && ExtendedWidth == 128) {
16010 Extended = DAG.getBitcast(MVT::v4i32, Extended);
16011 ExtendedVT = MVT::i32;
16012 }
16013 switch (ScalarOpcode) {
16014 case ISD::AND:
16015 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
16016 break;
16017 case ISD::OR:
16018 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
16019 break;
16020 case ISD::XOR:
16021 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
16022 break;
16023 default:
16024 llvm_unreachable("Unexpected Opcode");
16025 }
16026
16027 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
16028 } else {
16029 // Iteratively split the vector in half and combine using the bitwise
16030 // operation until it fits in a 64 bit register.
16031 while (VecVT.getSizeInBits() > 64) {
16032 SDValue Lo, Hi;
16033 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16034 VecVT = Lo.getValueType();
16035 NumElems = VecVT.getVectorNumElements();
16036 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
16037 }
16038
16039 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
16040
16041 // Do the remaining work on a scalar since it allows the code generator to
16042 // combine the shift and bitwise operation into one instruction and since
16043 // integer instructions can have higher throughput than vector instructions.
16044 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
16045
16046 // Iteratively combine the lower and upper halves of the scalar using the
16047 // bitwise operation, halving the relevant region of the scalar in each
16048 // iteration, until the relevant region is just one element of the original
16049 // vector.
16050 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
16051 SDValue ShiftAmount =
16052 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
16053 SDValue Shifted =
16054 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
16055 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
16056 }
16057
16058 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
16059 }
16060
16061 return DAG.getAnyExtOrTrunc(Result, DL, VT);
16062}
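// Illustrative sketch (editorial addition, not part of the upstream file).
// Once the vector fits in 64 bits, the non-i1 path above bitcasts it to a scalar
// and folds halves together with shifts. Model for XOR-reducing eight i8 lanes
// packed into a uint64_t:
#include <cstdint>
static uint8_t xorReduceV8i8(uint64_t Packed) {
  for (unsigned Shift = 4; Shift > 0; Shift /= 2) // lanes: 8 -> 4 -> 2 -> 1
    Packed ^= Packed >> (Shift * 8);              // fold the upper half into the lower
  return uint8_t(Packed);                         // lane 0 now holds the reduction
}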
16063
16064SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
16065 SelectionDAG &DAG) const {
16066 SDValue Src = Op.getOperand(0);
16067
16068 // Try to lower fixed length reductions to SVE.
16069 EVT SrcVT = Src.getValueType();
16070 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
16071 Op.getOpcode() == ISD::VECREDUCE_AND ||
16072 Op.getOpcode() == ISD::VECREDUCE_OR ||
16073 Op.getOpcode() == ISD::VECREDUCE_XOR ||
16074 Op.getOpcode() == ISD::VECREDUCE_FADD ||
16075 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
16076 SrcVT.getVectorElementType() == MVT::i64);
16077 if (SrcVT.isScalableVector() ||
16078 useSVEForFixedLengthVectorVT(
16079 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
16080
16081 if (SrcVT.getVectorElementType() == MVT::i1)
16082 return LowerPredReductionToSVE(Op, DAG);
16083
16084 switch (Op.getOpcode()) {
16085 case ISD::VECREDUCE_ADD:
16086 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
16087 case ISD::VECREDUCE_AND:
16088 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
16089 case ISD::VECREDUCE_OR:
16090 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
16091 case ISD::VECREDUCE_SMAX:
16092 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
16093 case ISD::VECREDUCE_SMIN:
16094 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
16095 case ISD::VECREDUCE_UMAX:
16096 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
16097 case ISD::VECREDUCE_UMIN:
16098 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
16099 case ISD::VECREDUCE_XOR:
16100 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
16101 case ISD::VECREDUCE_FADD:
16102 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
16103 case ISD::VECREDUCE_FMAX:
16104 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
16105 case ISD::VECREDUCE_FMIN:
16106 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
16107 case ISD::VECREDUCE_FMAXIMUM:
16108 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
16109 case ISD::VECREDUCE_FMINIMUM:
16110 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
16111 default:
16112 llvm_unreachable("Unhandled fixed length reduction");
16113 }
16114 }
16115
16116 // Lower NEON reductions.
16117 SDLoc dl(Op);
16118 switch (Op.getOpcode()) {
16119 case ISD::VECREDUCE_AND:
16120 case ISD::VECREDUCE_OR:
16121 case ISD::VECREDUCE_XOR:
16122 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
16123 Op.getValueType(), dl, DAG);
16124 case ISD::VECREDUCE_ADD:
16125 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
16126 case ISD::VECREDUCE_SMAX:
16127 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
16128 case ISD::VECREDUCE_SMIN:
16129 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
16130 case ISD::VECREDUCE_UMAX:
16131 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
16132 case ISD::VECREDUCE_UMIN:
16133 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
16134 default:
16135 llvm_unreachable("Unhandled reduction");
16136 }
16137}
16138
16139SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
16140 SelectionDAG &DAG) const {
16141 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
16142 // No point replacing if we don't have the relevant instruction/libcall anyway
16143 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
16144 return SDValue();
16145
16146 // LSE has an atomic load-clear instruction, but not a load-and.
16147 SDLoc dl(Op);
16148 MVT VT = Op.getSimpleValueType();
16149 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
16150 SDValue RHS = Op.getOperand(2);
16151 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
16152 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getAllOnesConstant(dl, VT), RHS);
16153 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
16154 Op.getOperand(0), Op.getOperand(1), RHS,
16155 AN->getMemOperand());
16156}
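// Illustrative sketch (editorial addition, not part of the upstream file).
// LSE provides LDCLR (atomic fetch-and-bic) but no fetch-and-AND, which is why
// the RHS is inverted above before emitting ATOMIC_LOAD_CLR:
// x & M == x & ~(~M) == bic(x, ~M).
#include <cstdint>
static uint64_t andViaClear(uint64_t X, uint64_t M) {
  uint64_t ClearMask = ~M;  // the operand actually handed to the load-clear
  return X & ~ClearMask;    // identical to X & M
}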
16157
16158SDValue
16159AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
16160 SelectionDAG &DAG) const {
16161
16162 SDLoc dl(Op);
16163 // Get the inputs.
16164 SDNode *Node = Op.getNode();
16165 SDValue Chain = Op.getOperand(0);
16166 SDValue Size = Op.getOperand(1);
16167 MaybeAlign Align =
16168 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16169 EVT VT = Node->getValueType(0);
16170
16171 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
16172 "no-stack-arg-probe")) {
16173 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
16174 Chain = SP.getValue(1);
16175 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
16176 if (Align)
16177 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16178 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
16179 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
16180 SDValue Ops[2] = {SP, Chain};
16181 return DAG.getMergeValues(Ops, dl);
16182 }
16183
16184 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
16185
16186 EVT PtrVT = getPointerTy(DAG.getDataLayout());
16187 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
16188 PtrVT, 0);
16189
16190 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
16191 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
16192 if (Subtarget->hasCustomCallingConv())
16193 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
16194
16195 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
16196 DAG.getConstant(4, dl, MVT::i64));
16197 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
16198 Chain =
16199 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
16200 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
16201 DAG.getRegisterMask(Mask), Chain.getValue(1));
16202 // To match the actual intent better, we should read the output from X15 here
16203 // again (instead of potentially spilling it to the stack), but rereading Size
16204 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
16205 // here.
16206
16207 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
16208 DAG.getConstant(4, dl, MVT::i64));
16209
16210 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
16211 Chain = SP.getValue(1);
16212 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
16213 if (Align)
16214 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16215 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
16216 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
16217
16218 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
16219
16220 SDValue Ops[2] = {SP, Chain};
16221 return DAG.getMergeValues(Ops, dl);
16222}
16223
16224SDValue
16225AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
16226 SelectionDAG &DAG) const {
16227 // Get the inputs.
16228 SDNode *Node = Op.getNode();
16229 SDValue Chain = Op.getOperand(0);
16230 SDValue Size = Op.getOperand(1);
16231
16232 MaybeAlign Align =
16233 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16234 SDLoc dl(Op);
16235 EVT VT = Node->getValueType(0);
16236
16237 // Construct the new SP value in a GPR.
16238 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
16239 Chain = SP.getValue(1);
16240 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
16241 if (Align)
16242 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16243 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
16244
16245 // Set the real SP to the new value with a probing loop.
16246 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
16247 SDValue Ops[2] = {SP, Chain};
16248 return DAG.getMergeValues(Ops, dl);
16249}
16250
16251SDValue
16252AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16253 SelectionDAG &DAG) const {
16254 MachineFunction &MF = DAG.getMachineFunction();
16255
16256 if (Subtarget->isTargetWindows())
16257 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
16258 else if (hasInlineStackProbe(MF))
16259 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
16260 else
16261 return SDValue();
16262}
16263
16264SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
16265 unsigned NewOp) const {
16266 if (Subtarget->hasSVE2())
16267 return LowerToPredicatedOp(Op, DAG, NewOp);
16268
16269 // Default to expand.
16270 return SDValue();
16271}
16272
16273SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
16274 SelectionDAG &DAG) const {
16275 EVT VT = Op.getValueType();
16276 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
16277
16278 SDLoc DL(Op);
16279 APInt MulImm = Op.getConstantOperandAPInt(0);
16280 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
16281 VT);
16282}
16283
16284/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
16285template <unsigned NumVecs>
16286static bool
16287 setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
16288 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
16289 Info.opc = ISD::INTRINSIC_VOID;
16290 // Retrieve EC from first vector argument.
16291 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
16292 ElementCount EC = VT.getVectorElementCount();
16293#ifndef NDEBUG
16294 // Check the assumption that all input vectors are the same type.
16295 for (unsigned I = 0; I < NumVecs; ++I)
16296 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
16297 "Invalid type.");
16298#endif
16299 // memVT is `NumVecs * VT`.
16300 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
16301 EC * NumVecs);
16302 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
16303 Info.offset = 0;
16304 Info.align.reset();
16305 Info.flags = MachineMemOperand::MOStore;
16306 return true;
16307}
16308
16309/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
16310/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
16311/// specified in the intrinsic calls.
16312bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
16313 const CallInst &I,
16314 MachineFunction &MF,
16315 unsigned Intrinsic) const {
16316 auto &DL = I.getDataLayout();
16317 switch (Intrinsic) {
16318 case Intrinsic::aarch64_sve_st2:
16319 return setInfoSVEStN<2>(*this, DL, Info, I);
16320 case Intrinsic::aarch64_sve_st3:
16321 return setInfoSVEStN<3>(*this, DL, Info, I);
16322 case Intrinsic::aarch64_sve_st4:
16323 return setInfoSVEStN<4>(*this, DL, Info, I);
16324 case Intrinsic::aarch64_neon_ld2:
16325 case Intrinsic::aarch64_neon_ld3:
16326 case Intrinsic::aarch64_neon_ld4:
16327 case Intrinsic::aarch64_neon_ld1x2:
16328 case Intrinsic::aarch64_neon_ld1x3:
16329 case Intrinsic::aarch64_neon_ld1x4: {
16330 Info.opc = ISD::INTRINSIC_W_CHAIN;
16331 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
16332 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16333 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16334 Info.offset = 0;
16335 Info.align.reset();
16336 // volatile loads with NEON intrinsics not supported
16337 Info.flags = MachineMemOperand::MOLoad;
16338 return true;
16339 }
16340 case Intrinsic::aarch64_neon_ld2lane:
16341 case Intrinsic::aarch64_neon_ld3lane:
16342 case Intrinsic::aarch64_neon_ld4lane:
16343 case Intrinsic::aarch64_neon_ld2r:
16344 case Intrinsic::aarch64_neon_ld3r:
16345 case Intrinsic::aarch64_neon_ld4r: {
16346 Info.opc = ISD::INTRINSIC_W_CHAIN;
16347 // ldx return struct with the same vec type
16348 Type *RetTy = I.getType();
16349 auto *StructTy = cast<StructType>(RetTy);
16350 unsigned NumElts = StructTy->getNumElements();
16351 Type *VecTy = StructTy->getElementType(0);
16352 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
16353 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
16354 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16355 Info.offset = 0;
16356 Info.align.reset();
16357 // volatile loads with NEON intrinsics not supported
16358 Info.flags = MachineMemOperand::MOLoad;
16359 return true;
16360 }
16361 case Intrinsic::aarch64_neon_st2:
16362 case Intrinsic::aarch64_neon_st3:
16363 case Intrinsic::aarch64_neon_st4:
16364 case Intrinsic::aarch64_neon_st1x2:
16365 case Intrinsic::aarch64_neon_st1x3:
16366 case Intrinsic::aarch64_neon_st1x4: {
16367 Info.opc = ISD::INTRINSIC_VOID;
16368 unsigned NumElts = 0;
16369 for (const Value *Arg : I.args()) {
16370 Type *ArgTy = Arg->getType();
16371 if (!ArgTy->isVectorTy())
16372 break;
16373 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
16374 }
16375 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16376 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16377 Info.offset = 0;
16378 Info.align.reset();
16379 // volatile stores with NEON intrinsics not supported
16380 Info.flags = MachineMemOperand::MOStore;
16381 return true;
16382 }
16383 case Intrinsic::aarch64_neon_st2lane:
16384 case Intrinsic::aarch64_neon_st3lane:
16385 case Intrinsic::aarch64_neon_st4lane: {
16386 Info.opc = ISD::INTRINSIC_VOID;
16387 unsigned NumElts = 0;
16388 // all the vector types are the same
16389 Type *VecTy = I.getArgOperand(0)->getType();
16390 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
16391
16392 for (const Value *Arg : I.args()) {
16393 Type *ArgTy = Arg->getType();
16394 if (!ArgTy->isVectorTy())
16395 break;
16396 NumElts += 1;
16397 }
16398
16399 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
16400 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16401 Info.offset = 0;
16402 Info.align.reset();
16403 // volatile stores with NEON intrinsics not supported
16404 Info.flags = MachineMemOperand::MOStore;
16405 return true;
16406 }
16407 case Intrinsic::aarch64_ldaxr:
16408 case Intrinsic::aarch64_ldxr: {
16409 Type *ValTy = I.getParamElementType(0);
16410 Info.opc = ISD::INTRINSIC_W_CHAIN;
16411 Info.memVT = MVT::getVT(ValTy);
16412 Info.ptrVal = I.getArgOperand(0);
16413 Info.offset = 0;
16414 Info.align = DL.getABITypeAlign(ValTy);
16415 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16416 return true;
16417 }
16418 case Intrinsic::aarch64_stlxr:
16419 case Intrinsic::aarch64_stxr: {
16420 Type *ValTy = I.getParamElementType(1);
16421 Info.opc = ISD::INTRINSIC_W_CHAIN;
16422 Info.memVT = MVT::getVT(ValTy);
16423 Info.ptrVal = I.getArgOperand(1);
16424 Info.offset = 0;
16425 Info.align = DL.getABITypeAlign(ValTy);
16426 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16427 return true;
16428 }
16429 case Intrinsic::aarch64_ldaxp:
16430 case Intrinsic::aarch64_ldxp:
16431 Info.opc = ISD::INTRINSIC_W_CHAIN;
16432 Info.memVT = MVT::i128;
16433 Info.ptrVal = I.getArgOperand(0);
16434 Info.offset = 0;
16435 Info.align = Align(16);
16436 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16437 return true;
16438 case Intrinsic::aarch64_stlxp:
16439 case Intrinsic::aarch64_stxp:
16440 Info.opc = ISD::INTRINSIC_W_CHAIN;
16441 Info.memVT = MVT::i128;
16442 Info.ptrVal = I.getArgOperand(2);
16443 Info.offset = 0;
16444 Info.align = Align(16);
16445 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16446 return true;
16447 case Intrinsic::aarch64_sve_ldnt1: {
16448 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
16449 Info.opc = ISD::INTRINSIC_W_CHAIN;
16450 Info.memVT = MVT::getVT(I.getType());
16451 Info.ptrVal = I.getArgOperand(1);
16452 Info.offset = 0;
16453 Info.align = DL.getABITypeAlign(ElTy);
16454 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
16455 return true;
16456 }
16457 case Intrinsic::aarch64_sve_stnt1: {
16458 Type *ElTy =
16459 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
16460 Info.opc = ISD::INTRINSIC_W_CHAIN;
16461 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
16462 Info.ptrVal = I.getArgOperand(2);
16463 Info.offset = 0;
16464 Info.align = DL.getABITypeAlign(ElTy);
16465 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
16466 return true;
16467 }
16468 case Intrinsic::aarch64_mops_memset_tag: {
16469 Value *Dst = I.getArgOperand(0);
16470 Value *Val = I.getArgOperand(1);
16471 Info.opc = ISD::INTRINSIC_W_CHAIN;
16472 Info.memVT = MVT::getVT(Val->getType());
16473 Info.ptrVal = Dst;
16474 Info.offset = 0;
16475 Info.align = I.getParamAlign(0).valueOrOne();
16476 Info.flags = MachineMemOperand::MOStore;
16477 // The size of the memory being operated on is unknown at this point
16478 Info.size = MemoryLocation::UnknownSize;
16479 return true;
16480 }
16481 default:
16482 break;
16483 }
16484
16485 return false;
16486}
16487
16488bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
16489 ISD::LoadExtType ExtTy,
16490 EVT NewVT) const {
16491 // TODO: This may be worth removing. Check regression tests for diffs.
16492 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
16493 return false;
16494
16495 // If we're reducing the load width in order to avoid having to use an extra
16496 // instruction to do extension then it's probably a good idea.
16497 if (ExtTy != ISD::NON_EXTLOAD)
16498 return true;
16499 // Don't reduce load width if it would prevent us from combining a shift into
16500 // the offset.
16501 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
16502 assert(Mem);
16503 const SDValue &Base = Mem->getBasePtr();
16504 if (Base.getOpcode() == ISD::ADD &&
16505 Base.getOperand(1).getOpcode() == ISD::SHL &&
16506 Base.getOperand(1).hasOneUse() &&
16507 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
16508 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
16509 if (Mem->getMemoryVT().isScalableVector())
16510 return false;
16511 // The shift can be combined if it matches the size of the value being
16512 // loaded (and so reducing the width would make it not match).
16513 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
16514 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
16515 if (ShiftAmount == Log2_32(LoadBytes))
16516 return false;
16517 }
16518 // We have no reason to disallow reducing the load width, so allow it.
16519 return true;
16520}
16521
16522// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
16523bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
16524 EVT VT = Extend.getValueType();
16525 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
16526 SDValue Extract = Extend.getOperand(0);
16527 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
16528 Extract = Extract.getOperand(0);
16529 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
16530 EVT VecVT = Extract.getOperand(0).getValueType();
16531 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
16532 return false;
16533 }
16534 }
16535 return true;
16536}
16537
16538// Truncations from 64-bit GPR to 32-bit GPR is free.
16539bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
16540 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
16541 return false;
16542 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
16543 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
16544 return NumBits1 > NumBits2;
16545}
16546bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
16547 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
16548 return false;
16549 uint64_t NumBits1 = VT1.getFixedSizeInBits();
16550 uint64_t NumBits2 = VT2.getFixedSizeInBits();
16551 return NumBits1 > NumBits2;
16552}
16553
16554/// Check if it is profitable to hoist instruction in then/else to if.
16555/// Not profitable if I and it's user can form a FMA instruction
16556/// because we prefer FMSUB/FMADD.
16557bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
16558 if (I->getOpcode() != Instruction::FMul)
16559 return true;
16560
16561 if (!I->hasOneUse())
16562 return true;
16563
16564 Instruction *User = I->user_back();
16565
16566 if (!(User->getOpcode() == Instruction::FSub ||
16567 User->getOpcode() == Instruction::FAdd))
16568 return true;
16569
16570 const TargetOptions &Options = getTargetMachine().Options;
16571 const Function *F = I->getFunction();
16572 const DataLayout &DL = F->getDataLayout();
16573 Type *Ty = User->getOperand(0)->getType();
16574
16575 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
16576 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
16577 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16578 Options.UnsafeFPMath));
16579}
16580
16581// All 32-bit GPR operations implicitly zero the high-half of the corresponding
16582// 64-bit GPR.
16583bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
16584 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
16585 return false;
16586 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
16587 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
16588 return NumBits1 == 32 && NumBits2 == 64;
16589}
16590bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
16591 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
16592 return false;
16593 unsigned NumBits1 = VT1.getSizeInBits();
16594 unsigned NumBits2 = VT2.getSizeInBits();
16595 return NumBits1 == 32 && NumBits2 == 64;
16596}
16597
16598bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
16599 EVT VT1 = Val.getValueType();
16600 if (isZExtFree(VT1, VT2)) {
16601 return true;
16602 }
16603
16604 if (Val.getOpcode() != ISD::LOAD)
16605 return false;
16606
16607 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
16608 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
16609 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
16610 VT1.getSizeInBits() <= 32);
16611}
16612
16613bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
16614 if (isa<FPExtInst>(Ext))
16615 return false;
16616
16617 // Vector types are not free.
16618 if (Ext->getType()->isVectorTy())
16619 return false;
16620
16621 for (const Use &U : Ext->uses()) {
16622 // The extension is free if we can fold it with a left shift in an
16623 // addressing mode or an arithmetic operation: add, sub, and cmp.
16624
16625 // Is there a shift?
16626 const Instruction *Instr = cast<Instruction>(U.getUser());
16627
16628 // Is this a constant shift?
16629 switch (Instr->getOpcode()) {
16630 case Instruction::Shl:
16631 if (!isa<ConstantInt>(Instr->getOperand(1)))
16632 return false;
16633 break;
16634 case Instruction::GetElementPtr: {
16635 gep_type_iterator GTI = gep_type_begin(Instr);
16636 auto &DL = Ext->getDataLayout();
16637 std::advance(GTI, U.getOperandNo()-1);
16638 Type *IdxTy = GTI.getIndexedType();
16639 // This extension will end up with a shift because of the scaling factor.
16640 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
16641 // Get the shift amount based on the scaling factor:
16642 // log2(sizeof(IdxTy)) - log2(8).
16643 if (IdxTy->isScalableTy())
16644 return false;
16645 uint64_t ShiftAmt =
16646 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
16647 3;
16648 // Is the constant foldable in the shift of the addressing mode?
16649 // I.e., shift amount is between 1 and 4 inclusive.
16650 if (ShiftAmt == 0 || ShiftAmt > 4)
16651 return false;
16652 break;
16653 }
16654 case Instruction::Trunc:
16655 // Check if this is a noop.
16656 // trunc(sext ty1 to ty2) to ty1.
16657 if (Instr->getType() == Ext->getOperand(0)->getType())
16658 continue;
16659 [[fallthrough]];
16660 default:
16661 return false;
16662 }
16663
16664 // At this point we can use the bfm family, so this extension is free
16665 // for that use.
16666 }
16667 return true;
16668}
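// Illustrative sketch (editorial addition, not part of the upstream file).
// The GetElementPtr case above treats the extension as free only when the index
// scaling can be folded into the addressing mode's shift, i.e. when
// log2(element store size in bytes) is between 1 and 4. Model, assuming a
// byte-multiple, non-scalable element size:
#include <bit>
#include <cstdint>
static bool gepScaleFoldsIntoAddrMode(uint64_t ElemStoreSizeInBits) {
  uint64_t ShiftAmt = std::countr_zero(ElemStoreSizeInBits) - 3; // log2(bytes)
  return ShiftAmt >= 1 && ShiftAmt <= 4; // e.g. ldr x0, [x1, w2, sxtw #3]
}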
16669
16670static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
16671 unsigned NumElts, bool IsLittleEndian,
16672 SmallVectorImpl<int> &Mask) {
16673 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
16674 return false;
16675
16676 assert(DstWidth % SrcWidth == 0 &&
16677 "TBL lowering is not supported for a conversion instruction with this "
16678 "source and destination element type.");
16679
16680 unsigned Factor = DstWidth / SrcWidth;
16681 unsigned MaskLen = NumElts * Factor;
16682
16683 Mask.clear();
16684 Mask.resize(MaskLen, NumElts);
16685
16686 unsigned SrcIndex = 0;
16687 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
16688 Mask[I] = SrcIndex++;
16689
16690 return true;
16691}
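// Illustrative sketch (editorial addition, not part of the upstream file).
// createTblShuffleMask places each source lane at one slot of its widened
// destination lane and points every remaining slot at the zero element the
// callers append at index NumElts, so the shuffle followed by a bitcast acts as
// the zero-extend. Standalone model of the little-endian case:
#include <vector>
static std::vector<int> tblZExtMask(unsigned SrcWidth, unsigned DstWidth,
                                    unsigned NumElts) {
  unsigned Factor = DstWidth / SrcWidth;
  std::vector<int> Mask(NumElts * Factor, int(NumElts)); // NumElts selects the zero lane
  unsigned SrcIndex = 0;
  for (unsigned I = 0; I < Mask.size(); I += Factor)     // little-endian layout
    Mask[I] = int(SrcIndex++);
  return Mask; // (8, 32, 4) -> {0,4,4,4, 1,4,4,4, 2,4,4,4, 3,4,4,4}
}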
16692
16693static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
16694 FixedVectorType *ZExtTy,
16695 FixedVectorType *DstTy,
16696 bool IsLittleEndian) {
16697 auto *SrcTy = cast<FixedVectorType>(Op->getType());
16698 unsigned NumElts = SrcTy->getNumElements();
16699 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16700 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16701
16702 SmallVector<int> Mask;
16703 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
16704 return nullptr;
16705
16706 auto *FirstEltZero = Builder.CreateInsertElement(
16707 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
16708 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
16709 Result = Builder.CreateBitCast(Result, DstTy);
16710 if (DstTy != ZExtTy)
16711 Result = Builder.CreateZExt(Result, ZExtTy);
16712 return Result;
16713}
16714
16715static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op,
16716 FixedVectorType *DstTy,
16717 bool IsLittleEndian) {
16718 auto *SrcTy = cast<FixedVectorType>(Op->getType());
16719 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16720 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16721
16722 SmallVector<int> Mask;
16723 if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
16724 !IsLittleEndian, Mask))
16725 return nullptr;
16726
16727 auto *FirstEltZero = Builder.CreateInsertElement(
16728 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
16729
16730 return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
16731}
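// Annotation (not in the original source): passing !IsLittleEndian above flips
// the mask layout so that, on a little-endian target, each source byte lands
// in the most significant byte of its destination lane; the sitofp caller
// below then bitcasts and uses an arithmetic shift right by 24 to complete the
// sign extension.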
16732
16733static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
16734 IRBuilder<> Builder(TI);
16735 SmallVector<Value *> Parts;
16736 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
16737 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
16738 auto *DstTy = cast<FixedVectorType>(TI->getType());
16739 assert(SrcTy->getElementType()->isIntegerTy() &&
16740 "Non-integer type source vector element is not supported");
16741 assert(DstTy->getElementType()->isIntegerTy(8) &&
16742 "Unsupported destination vector element type");
16743 unsigned SrcElemTySz =
16744 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16745 unsigned DstElemTySz =
16746 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16747 assert((SrcElemTySz % DstElemTySz == 0) &&
16748 "Cannot lower truncate to tbl instructions for a source element size "
16749 "that is not divisible by the destination element size");
16750 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
16751 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
16752 "Unsupported source vector element type size");
16753 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
16754
16755 // Create a mask to choose every nth byte from the source vector table of
16756 // bytes to create the truncated destination vector, where 'n' is the truncate
16757 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose bytes
16758 // 0, 8, 16, ..., (Y-1)*8 for the little-endian format.
16759 SmallVector<Constant *, 16> MaskConst;
16760 for (int Itr = 0; Itr < 16; Itr++) {
16761 if (Itr < NumElements)
16762 MaskConst.push_back(Builder.getInt8(
16763 IsLittleEndian ? Itr * TruncFactor
16764 : Itr * TruncFactor + (TruncFactor - 1)));
16765 else
16766 MaskConst.push_back(Builder.getInt8(255));
16767 }
16768
16769 int MaxTblSz = 128 * 4;
16770 int MaxSrcSz = SrcElemTySz * NumElements;
16771 int ElemsPerTbl =
16772 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
16773 assert(ElemsPerTbl <= 16 &&
16774 "Maximum elements selected using TBL instruction cannot exceed 16!");
16775
16776 int ShuffleCount = 128 / SrcElemTySz;
16777 SmallVector<int> ShuffleLanes;
16778 for (int i = 0; i < ShuffleCount; ++i)
16779 ShuffleLanes.push_back(i);
16780
16781 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
16782 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
16783 // call TBL & save the result in a vector of TBL results for combining later.
16784 SmallVector<Value *> Results;
16785 while (ShuffleLanes.back() < NumElements) {
16786 Parts.push_back(Builder.CreateBitCast(
16787 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
16788
16789 if (Parts.size() == 4) {
16790 Parts.push_back(ConstantVector::get(MaskConst));
16791 Results.push_back(
16792 Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
16793 Parts.clear();
16794 }
16795
16796 for (int i = 0; i < ShuffleCount; ++i)
16797 ShuffleLanes[i] += ShuffleCount;
16798 }
16799
16800 assert((Parts.empty() || Results.empty()) &&
16801 "Lowering trunc for vectors requiring different TBL instructions is "
16802 "not supported!");
16803 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
16804 // registers
16805 if (!Parts.empty()) {
16806 Intrinsic::ID TblID;
16807 switch (Parts.size()) {
16808 case 1:
16809 TblID = Intrinsic::aarch64_neon_tbl1;
16810 break;
16811 case 2:
16812 TblID = Intrinsic::aarch64_neon_tbl2;
16813 break;
16814 case 3:
16815 TblID = Intrinsic::aarch64_neon_tbl3;
16816 break;
16817 }
16818
16819 Parts.push_back(ConstantVector::get(MaskConst));
16820 Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
16821 }
16822
16823 // Extract the destination vector from TBL result(s) after combining them
16824 // where applicable. Currently, at most two TBLs are supported.
16825 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
16826 "more than 2 tbl instructions!");
16827 Value *FinalResult = Results[0];
16828 if (Results.size() == 1) {
16829 if (ElemsPerTbl < 16) {
16830 SmallVector<int> FinalMask(ElemsPerTbl);
16831 std::iota(FinalMask.begin(), FinalMask.end(), 0);
16832 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
16833 }
16834 } else {
16835 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
16836 if (ElemsPerTbl < 16) {
16837 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
16838 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
16839 } else {
16840 std::iota(FinalMask.begin(), FinalMask.end(), 0);
16841 }
16842 FinalResult =
16843 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
16844 }
16845
16846 TI->replaceAllUsesWith(FinalResult);
16847 TI->eraseFromParent();
16848}
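// A worked example (annotation, not part of the original source) for a
// little-endian trunc of <8 x i64> to <8 x i8>: TruncFactor is 8, so MaskConst
// selects bytes {0,8,16,24,32,40,48,56} and pads the remaining lanes with 255
// (out of range, so TBL writes zero there). The 512-bit source is split into
// four 128-bit shuffles of two i64 lanes each, which saturates TBL's
// four-register table and yields a single tbl4 call; a final shuffle then
// extracts the first 8 bytes as the truncated result.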
16849
16850bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
16851 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
16852 // shuffle_vector instructions are serialized when targeting SVE,
16853 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
16854 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
16855 return false;
16856
16857 // Try to optimize conversions using tbl. This requires materializing constant
16858 // index vectors, which can increase code size and add loads. Skip the
16859 // transform unless the conversion is in a loop block guaranteed to execute
16860 // and we are not optimizing for size.
16861 Function *F = I->getParent()->getParent();
16862 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
16863 F->hasOptSize())
16864 return false;
16865
16866 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
16867 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
16868 if (!SrcTy || !DstTy)
16869 return false;
16870
16871 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
16872 // lowered to tbl instructions to insert the original i8 elements
16873 // into i8x lanes. This is enabled for cases where it is beneficial.
16874 auto *ZExt = dyn_cast<ZExtInst>(I);
16875 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
16876 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
16877 if (DstWidth % 8 != 0)
16878 return false;
16879
16880 auto *TruncDstType =
16881 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
16882 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
16883 // the remaining ZExt folded into the user, don't use tbl lowering.
16884 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
16885 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
16888 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
16889 return false;
16890
16891 DstTy = TruncDstType;
16892 }
16893
16894 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
16895 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
16896 // most one extra extend step is needed and using tbl is not profitable.
16897 if (SrcWidth * 4 <= DstWidth && I->hasOneUser()) {
16898 auto *SingleUser = cast<Instruction>(*I->user_begin());
16899 if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
16900 return false;
16901 }
16902
16903 if (DstTy->getScalarSizeInBits() >= 64)
16904 return false;
16905
16906 IRBuilder<> Builder(ZExt);
16907 Value *Result = createTblShuffleForZExt(
16908 Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
16909 DstTy, Subtarget->isLittleEndian());
16910 if (!Result)
16911 return false;
16912 ZExt->replaceAllUsesWith(Result);
16913 ZExt->eraseFromParent();
16914 return true;
16915 }
16916
16917 auto *UIToFP = dyn_cast<UIToFPInst>(I);
16918 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
16919 DstTy->getElementType()->isFloatTy()) ||
16920 (SrcTy->getElementType()->isIntegerTy(16) &&
16921 DstTy->getElementType()->isDoubleTy()))) {
16922 IRBuilder<> Builder(I);
16923 Value *ZExt = createTblShuffleForZExt(
16924 Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
16925 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
16926 assert(ZExt && "Cannot fail for the i8 to float conversion");
16927 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
16928 I->replaceAllUsesWith(UI);
16929 I->eraseFromParent();
16930 return true;
16931 }
16932
16933 auto *SIToFP = dyn_cast<SIToFPInst>(I);
16934 if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
16935 DstTy->getElementType()->isFloatTy()) {
16936 IRBuilder<> Builder(I);
16937 auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
16938 FixedVectorType::getInteger(DstTy),
16939 Subtarget->isLittleEndian());
16940 assert(Shuffle && "Cannot fail for the i8 to float conversion");
16941 auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
16942 auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
16943 auto *SI = Builder.CreateSIToFP(AShr, DstTy);
16944 I->replaceAllUsesWith(SI);
16945 I->eraseFromParent();
16946 return true;
16947 }
16948
16949 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
16950 // followed by a truncate lowered using tbl.4.
16951 auto *FPToUI = dyn_cast<FPToUIInst>(I);
16952 if (FPToUI &&
16953 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
16954 SrcTy->getElementType()->isFloatTy() &&
16955 DstTy->getElementType()->isIntegerTy(8)) {
16956 IRBuilder<> Builder(I);
16957 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
16958 VectorType::getInteger(SrcTy));
16959 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
16960 I->replaceAllUsesWith(TruncI);
16961 I->eraseFromParent();
16962 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
16963 return true;
16964 }
16965
16966 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
16967 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
16968 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
16969 // registers
16970 auto *TI = dyn_cast<TruncInst>(I);
16971 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
16972 ((SrcTy->getElementType()->isIntegerTy(32) ||
16973 SrcTy->getElementType()->isIntegerTy(64)) &&
16974 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
16975 createTblForTrunc(TI, Subtarget->isLittleEndian());
16976 return true;
16977 }
16978
16979 return false;
16980}
16981
16982bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
16983 Align &RequiredAligment) const {
16984 if (!LoadedType.isSimple() ||
16985 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
16986 return false;
16987 // Cyclone supports unaligned accesses.
16988 RequiredAligment = Align(1);
16989 unsigned NumBits = LoadedType.getSizeInBits();
16990 return NumBits == 32 || NumBits == 64;
16991}
16992
16993/// A helper function for determining the number of interleaved accesses we
16994/// will generate when lowering accesses of the given type.
16995unsigned AArch64TargetLowering::getNumInterleavedAccesses(
16996 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
16997 unsigned VecSize = 128;
16998 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
16999 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
17000 if (UseScalable && isa<FixedVectorType>(VecTy))
17001 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17002 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
17003}
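// For example (annotation, not in the original source): with 128-bit NEON
// vectors, a <16 x i32> group needs (16 * 32 + 127) / 128 = 4 interleaved
// accesses, while a <4 x i16> group rounds up to a single access.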
17004
17005MachineMemOperand::Flags
17006AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
17007 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
17008 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
17009 return MOStridedAccess;
17010 return MachineMemOperand::MONone;
17011}
17012
17013bool AArch64TargetLowering::isLegalInterleavedAccessType(
17014 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
17015 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17016 auto EC = VecTy->getElementCount();
17017 unsigned MinElts = EC.getKnownMinValue();
17018
17019 UseScalable = false;
17020
17021 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
17022 (!Subtarget->useSVEForFixedLengthVectors() ||
17024 return false;
17025
17026 if (isa<ScalableVectorType>(VecTy) &&
17027 !Subtarget->isSVEorStreamingSVEAvailable())
17028 return false;
17029
17030 // Ensure the number of vector elements is greater than 1.
17031 if (MinElts < 2)
17032 return false;
17033
17034 // Ensure the element type is legal.
17035 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
17036 return false;
17037
17038 if (EC.isScalable()) {
17039 UseScalable = true;
17040 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
17041 }
17042
17043 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
17044 if (Subtarget->useSVEForFixedLengthVectors()) {
17045 unsigned MinSVEVectorSize =
17046 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17047 if (VecSize % MinSVEVectorSize == 0 ||
17048 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
17049 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
17050 UseScalable = true;
17051 return true;
17052 }
17053 }
17054
17055 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
17056 // 128 will be split into multiple interleaved accesses.
17057 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
17058}
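// For example (annotation, not in the original source): on a NEON-only target,
// a <2 x i32> (64-bit) or <4 x i32> (128-bit) group is accepted directly, an
// <8 x i64> (512-bit) group is accepted and later split into four accesses,
// and a <3 x i32> (96-bit) group is rejected.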
17059
17060static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
17061 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
17062 return ScalableVectorType::get(VTy->getElementType(), 2);
17063
17064 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
17065 return ScalableVectorType::get(VTy->getElementType(), 4);
17066
17067 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
17068 return ScalableVectorType::get(VTy->getElementType(), 8);
17069
17070 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
17071 return ScalableVectorType::get(VTy->getElementType(), 8);
17072
17073 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
17074 return ScalableVectorType::get(VTy->getElementType(), 2);
17075
17076 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
17077 return ScalableVectorType::get(VTy->getElementType(), 4);
17078
17079 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
17080 return ScalableVectorType::get(VTy->getElementType(), 8);
17081
17082 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
17083 return ScalableVectorType::get(VTy->getElementType(), 16);
17084
17085 llvm_unreachable("Cannot handle input vector type");
17086}
17087
17088static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
17089 bool Scalable, Type *LDVTy,
17090 Type *PtrTy) {
17091 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17092 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
17093 Intrinsic::aarch64_sve_ld3_sret,
17094 Intrinsic::aarch64_sve_ld4_sret};
17095 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
17096 Intrinsic::aarch64_neon_ld3,
17097 Intrinsic::aarch64_neon_ld4};
17098 if (Scalable)
17099 return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy});
17100
17101 return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
17102 {LDVTy, PtrTy});
17103}
17104
17105static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
17106 bool Scalable, Type *STVTy,
17107 Type *PtrTy) {
17108 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17109 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
17110 Intrinsic::aarch64_sve_st3,
17111 Intrinsic::aarch64_sve_st4};
17112 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
17113 Intrinsic::aarch64_neon_st3,
17114 Intrinsic::aarch64_neon_st4};
17115 if (Scalable)
17116 return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy});
17117
17118 return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
17119 {STVTy, PtrTy});
17120}
17121
17122/// Lower an interleaved load into a ldN intrinsic.
17123///
17124/// E.g. Lower an interleaved load (Factor = 2):
17125/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
17126/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
17127/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
17128///
17129/// Into:
17130/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
17131/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
17132/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
17133bool AArch64TargetLowering::lowerInterleavedLoad(
17134 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
17135 ArrayRef<unsigned> Indices, unsigned Factor) const {
17136 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17137 "Invalid interleave factor");
17138 assert(!Shuffles.empty() && "Empty shufflevector input");
17139 assert(Shuffles.size() == Indices.size() &&
17140 "Unmatched number of shufflevectors and indices");
17141
17142 const DataLayout &DL = LI->getDataLayout();
17143
17144 VectorType *VTy = Shuffles[0]->getType();
17145
17146 // Skip if we do not have NEON and skip illegal vector types. We can
17147 // "legalize" wide vector types into multiple interleaved accesses as long as
17148 // the vector types are divisible by 128.
17149 bool UseScalable;
17150 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17151 return false;
17152
17153 // Check if the interleave is a zext(shuffle) that can be better optimized
17154 // into shift / and masks. For the moment we do this just for uitofp (not
17155 // zext) to avoid issues with widening instructions.
17156 if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
17157 return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
17158 SI->getType()->getScalarSizeInBits() * 4 ==
17159 SI->user_back()->getType()->getScalarSizeInBits();
17160 }))
17161 return false;
17162
17163 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17164
17165 auto *FVTy = cast<FixedVectorType>(VTy);
17166
17167 // A pointer vector can not be the return type of the ldN intrinsics. Need to
17168 // load integer vectors first and then convert to pointer vectors.
17169 Type *EltTy = FVTy->getElementType();
17170 if (EltTy->isPointerTy())
17171 FVTy =
17172 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
17173
17174 // If we're going to generate more than one load, reset the sub-vector type
17175 // to something legal.
17176 FVTy = FixedVectorType::get(FVTy->getElementType(),
17177 FVTy->getNumElements() / NumLoads);
17178
17179 auto *LDVTy =
17180 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
17181
17182 IRBuilder<> Builder(LI);
17183
17184 // The base address of the load.
17185 Value *BaseAddr = LI->getPointerOperand();
17186
17187 Type *PtrTy = LI->getPointerOperandType();
17188 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
17189 LDVTy->getElementCount());
17190
17191 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
17192 UseScalable, LDVTy, PtrTy);
17193
17194 // Holds sub-vectors extracted from the load intrinsic return values. The
17195 // sub-vectors are associated with the shufflevector instructions they will
17196 // replace.
17197 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
17198
17199 Value *PTrue = nullptr;
17200 if (UseScalable) {
17201 std::optional<unsigned> PgPattern =
17202 getSVEPredPatternFromNumElements(FVTy->getNumElements());
17203 if (Subtarget->getMinSVEVectorSizeInBits() ==
17204 Subtarget->getMaxSVEVectorSizeInBits() &&
17205 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
17206 PgPattern = AArch64SVEPredPattern::all;
17207
17208 auto *PTruePat =
17209 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
17210 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17211 {PTruePat});
17212 }
17213
17214 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
17215
17216 // If we're generating more than one load, compute the base address of
17217 // subsequent loads as an offset from the previous.
17218 if (LoadCount > 0)
17219 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
17220 FVTy->getNumElements() * Factor);
17221
17222 CallInst *LdN;
17223 if (UseScalable)
17224 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
17225 else
17226 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17227
17228 // Extract and store the sub-vectors returned by the load intrinsic.
17229 for (unsigned i = 0; i < Shuffles.size(); i++) {
17230 ShuffleVectorInst *SVI = Shuffles[i];
17231 unsigned Index = Indices[i];
17232
17233 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
17234
17235 if (UseScalable)
17236 SubVec = Builder.CreateExtractVector(
17237 FVTy, SubVec,
17238 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
17239
17240 // Convert the integer vector to pointer vector if the element is pointer.
17241 if (EltTy->isPointerTy())
17242 SubVec = Builder.CreateIntToPtr(
17243 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
17244 FVTy->getNumElements()));
17245
17246 SubVecs[SVI].push_back(SubVec);
17247 }
17248 }
17249
17250 // Replace uses of the shufflevector instructions with the sub-vectors
17251 // returned by the load intrinsic. If a shufflevector instruction is
17252 // associated with more than one sub-vector, those sub-vectors will be
17253 // concatenated into a single wide vector.
17254 for (ShuffleVectorInst *SVI : Shuffles) {
17255 auto &SubVec = SubVecs[SVI];
17256 auto *WideVec =
17257 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
17258 SVI->replaceAllUsesWith(WideVec);
17259 }
17260
17261 return true;
17262}
17263
17264template <typename Iter>
17265bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
17266 int MaxLookupDist = 20;
17267 unsigned IdxWidth = DL.getIndexSizeInBits(0);
17268 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
17269 const Value *PtrA1 =
17270 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
17271
17272 while (++It != End) {
17273 if (It->isDebugOrPseudoInst())
17274 continue;
17275 if (MaxLookupDist-- == 0)
17276 break;
17277 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
17278 const Value *PtrB1 =
17279 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
17280 DL, OffsetB);
17281 if (PtrA1 == PtrB1 &&
17282 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
17283 .abs() == 16)
17284 return true;
17285 }
17286 }
17287
17288 return false;
17289}
17290
17291/// Lower an interleaved store into a stN intrinsic.
17292///
17293/// E.g. Lower an interleaved store (Factor = 3):
17294/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
17295/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
17296/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17297///
17298/// Into:
17299/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
17300/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
17301/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
17302/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17303///
17304/// Note that the new shufflevectors will be removed and we'll only generate one
17305/// st3 instruction in CodeGen.
17306///
17307/// Example for a more general valid mask (Factor 3). Lower:
17308/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
17309/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
17310/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17311///
17312/// Into:
17313/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
17314/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
17315/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
17316/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17317bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
17318 ShuffleVectorInst *SVI,
17319 unsigned Factor) const {
17320
17321 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17322 "Invalid interleave factor");
17323
17324 auto *VecTy = cast<FixedVectorType>(SVI->getType());
17325 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
17326
17327 unsigned LaneLen = VecTy->getNumElements() / Factor;
17328 Type *EltTy = VecTy->getElementType();
17329 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
17330
17331 const DataLayout &DL = SI->getDataLayout();
17332 bool UseScalable;
17333
17334 // Skip if we do not have NEON and skip illegal vector types. We can
17335 // "legalize" wide vector types into multiple interleaved accesses as long as
17336 // the vector types are divisible by 128.
17337 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
17338 return false;
17339
17340 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
17341
17342 Value *Op0 = SVI->getOperand(0);
17343 Value *Op1 = SVI->getOperand(1);
17344 IRBuilder<> Builder(SI);
17345
17346 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
17347 // vectors to integer vectors.
17348 if (EltTy->isPointerTy()) {
17349 Type *IntTy = DL.getIntPtrType(EltTy);
17350 unsigned NumOpElts =
17351 cast<FixedVectorType>(Op0->getType())->getNumElements();
17352
17353 // Convert to the corresponding integer vector.
17354 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
17355 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
17356 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
17357
17358 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
17359 }
17360
17361 // If we're going to generate more than one store, reset the lane length
17362 // and sub-vector type to something legal.
17363 LaneLen /= NumStores;
17364 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
17365
17366 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
17367 : SubVecTy;
17368
17369 // The base address of the store.
17370 Value *BaseAddr = SI->getPointerOperand();
17371
17372 auto Mask = SVI->getShuffleMask();
17373
17374 // Bail out if none of the mask indices are in range: if the whole mask is
17375 // `poison`, `Mask` will be a vector of -1s, and using it below would
17376 // lead to an out-of-bounds read.
17377 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
17378 return false;
17379 }
17380 // A 64-bit st2 which does not start at element 0 will involve adding extra
17381 // ext elements, making the st2 unprofitable. Also, if there is a nearby store
17382 // that points to BaseAddr+16 or BaseAddr-16, then it can be better left as a
17383 // zip;stp pair, which has higher throughput.
17384 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
17385 (Mask[0] != 0 ||
17386 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
17387 DL) ||
17388 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
17389 BaseAddr, DL)))
17390 return false;
17391
17392 Type *PtrTy = SI->getPointerOperandType();
17393 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
17394 STVTy->getElementCount());
17395
17396 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
17397 UseScalable, STVTy, PtrTy);
17398
17399 Value *PTrue = nullptr;
17400 if (UseScalable) {
17401 std::optional<unsigned> PgPattern =
17402 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
17403 if (Subtarget->getMinSVEVectorSizeInBits() ==
17404 Subtarget->getMaxSVEVectorSizeInBits() &&
17405 Subtarget->getMinSVEVectorSizeInBits() ==
17406 DL.getTypeSizeInBits(SubVecTy))
17407 PgPattern = AArch64SVEPredPattern::all;
17408
17409 auto *PTruePat =
17410 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
17411 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17412 {PTruePat});
17413 }
17414
17415 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
17416
17417 SmallVector<Value *, 5> Ops;
17418
17419 // Split the shufflevector operands into sub vectors for the new stN call.
17420 for (unsigned i = 0; i < Factor; i++) {
17421 Value *Shuffle;
17422 unsigned IdxI = StoreCount * LaneLen * Factor + i;
17423 if (Mask[IdxI] >= 0) {
17424 Shuffle = Builder.CreateShuffleVector(
17425 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
17426 } else {
17427 unsigned StartMask = 0;
17428 for (unsigned j = 1; j < LaneLen; j++) {
17429 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
17430 if (Mask[IdxJ] >= 0) {
17431 StartMask = Mask[IdxJ] - j;
17432 break;
17433 }
17434 }
17435 // Note: Filling undef gaps with random elements is ok, since
17436 // those elements were being written anyway (with undefs).
17437 // In the case of all undefs we default to using elements from 0.
17438 // Note: StartMask cannot be negative; it's checked in
17439 // isReInterleaveMask.
17440 Shuffle = Builder.CreateShuffleVector(
17441 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
17442 }
17443
17444 if (UseScalable)
17445 Shuffle = Builder.CreateInsertVector(
17446 STVTy, UndefValue::get(STVTy), Shuffle,
17447 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
17448
17449 Ops.push_back(Shuffle);
17450 }
17451
17452 if (UseScalable)
17453 Ops.push_back(PTrue);
17454
17455 // If we're generating more than one store, compute the base address of
17456 // subsequent stores as an offset from the previous.
17457 if (StoreCount > 0)
17458 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
17459 BaseAddr, LaneLen * Factor);
17460
17461 Ops.push_back(BaseAddr);
17462 Builder.CreateCall(StNFunc, Ops);
17463 }
17464 return true;
17465}
17466
17467static bool getDeinterleave2Values(
17468 Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues,
17469 SmallVectorImpl<Instruction *> &DeInterleaveDeadInsts) {
17470 if (!DI->hasNUses(2))
17471 return false;
17472 auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
17473 auto *Extr2 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
17474 if (!Extr1 || !Extr2)
17475 return false;
17476
17477 DeinterleavedValues.resize(2);
17478 // Place the values into the vector in the order of extraction:
17479 DeinterleavedValues[0x1 & (Extr1->getIndices()[0])] = Extr1;
17480 DeinterleavedValues[0x1 & (Extr2->getIndices()[0])] = Extr2;
17481 if (!DeinterleavedValues[0] || !DeinterleavedValues[1])
17482 return false;
17483
17484 // Make sure that the extracted values match the deinterleave tree pattern
17485 if (!match(DeinterleavedValues[0], m_ExtractValue<0>((m_Specific(DI)))) ||
17486 !match(DeinterleavedValues[1], m_ExtractValue<1>((m_Specific(DI))))) {
17487 LLVM_DEBUG(dbgs() << "matching deinterleave2 failed\n");
17488 return false;
17489 }
17490 // DeinterleavedValues will be replaced by the output of ld2
17491 DeInterleaveDeadInsts.insert(DeInterleaveDeadInsts.end(),
17492 DeinterleavedValues.begin(),
17493 DeinterleavedValues.end());
17494 return true;
17495}
17496
17497/*
17498DeinterleaveIntrinsic tree:
17499 [DI]
17500 / \
17501 [Extr<0>] [Extr<1>]
17502 | |
17503 [DI] [DI]
17504 / \ / \
17505 [Extr<0>][Extr<1>] [Extr<0>][Extr<1>]
17506 | | | |
17507roots: A C B D
17508roots in correct order of DI4 will be: A B C D.
17509Returns true if `DI` is the top of an IR tree that represents a theoretical
17510vector.deinterleave4 intrinsic. When true is returned, \p `DeinterleavedValues`
17511vector is populated with the results such an intrinsic would return: (i.e. {A,
17512B, C, D } = vector.deinterleave4(...))
17513*/
17514static bool getDeinterleave4Values(
17515 Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues,
17516 SmallVectorImpl<Instruction *> &DeInterleaveDeadInsts) {
17517 if (!DI->hasNUses(2))
17518 return false;
17519 auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
17520 auto *Extr2 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
17521 if (!Extr1 || !Extr2)
17522 return false;
17523
17524 if (!Extr1->hasOneUse() || !Extr2->hasOneUse())
17525 return false;
17526 auto *DI1 = *(Extr1->user_begin());
17527 auto *DI2 = *(Extr2->user_begin());
17528
17529 if (!DI1->hasNUses(2) || !DI2->hasNUses(2))
17530 return false;
17531 // Leaf nodes of the deinterleave tree:
17532 auto *A = dyn_cast<ExtractValueInst>(*(DI1->user_begin()));
17533 auto *C = dyn_cast<ExtractValueInst>(*(++DI1->user_begin()));
17534 auto *B = dyn_cast<ExtractValueInst>(*(DI2->user_begin()));
17535 auto *D = dyn_cast<ExtractValueInst>(*(++DI2->user_begin()));
17536 // Make sure that A, B, C and D are ExtractValue instructions before getting
17537 // the extract index.
17538 if (!A || !B || !C || !D)
17539 return false;
17540
17541 DeinterleavedValues.resize(4);
17542 // Place the values into the vector in the order of deinterleave4:
17543 DeinterleavedValues[0x3 &
17544 ((A->getIndices()[0] * 2) + Extr1->getIndices()[0])] = A;
17545 DeinterleavedValues[0x3 &
17546 ((B->getIndices()[0] * 2) + Extr2->getIndices()[0])] = B;
17547 DeinterleavedValues[0x3 &
17548 ((C->getIndices()[0] * 2) + Extr1->getIndices()[0])] = C;
17549 DeinterleavedValues[0x3 &
17550 ((D->getIndices()[0] * 2) + Extr2->getIndices()[0])] = D;
17551 if (!DeinterleavedValues[0] || !DeinterleavedValues[1] ||
17552 !DeinterleavedValues[2] || !DeinterleavedValues[3])
17553 return false;
17554
17555 // Make sure that A,B,C,D match the deinterleave tree pattern
17556 if (!match(DeinterleavedValues[0], m_ExtractValue<0>(m_Deinterleave2(
17557 m_ExtractValue<0>(m_Specific(DI))))) ||
17558 !match(DeinterleavedValues[1], m_ExtractValue<0>(m_Deinterleave2(
17559 m_ExtractValue<1>(m_Specific(DI))))) ||
17560 !match(DeinterleavedValues[2], m_ExtractValue<1>(m_Deinterleave2(
17561 m_ExtractValue<0>(m_Specific(DI))))) ||
17562 !match(DeinterleavedValues[3], m_ExtractValue<1>(m_Deinterleave2(
17563 m_ExtractValue<1>(m_Specific(DI)))))) {
17564 LLVM_DEBUG(dbgs() << "matching deinterleave4 failed\n");
17565 return false;
17566 }
17567
17568 // These values will not be used anymore;
17569 // DI4 will be created instead of the nested DI1 and DI2.
17570 DeInterleaveDeadInsts.insert(DeInterleaveDeadInsts.end(),
17571 DeinterleavedValues.begin(),
17572 DeinterleavedValues.end());
17573 DeInterleaveDeadInsts.push_back(cast<Instruction>(DI1));
17574 DeInterleaveDeadInsts.push_back(cast<Instruction>(Extr1));
17575 DeInterleaveDeadInsts.push_back(cast<Instruction>(DI2));
17576 DeInterleaveDeadInsts.push_back(cast<Instruction>(Extr2));
17577
17578 return true;
17579}
17580
17581static bool getDeinterleavedValues(
17582 Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues,
17583 SmallVectorImpl<Instruction *> &DeInterleaveDeadInsts) {
17584 if (getDeinterleave4Values(DI, DeinterleavedValues, DeInterleaveDeadInsts))
17585 return true;
17586 return getDeinterleave2Values(DI, DeinterleavedValues, DeInterleaveDeadInsts);
17587}
17588
17589bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
17590 IntrinsicInst *DI, LoadInst *LI,
17591 SmallVectorImpl<Instruction *> &DeadInsts) const {
17592 // Only deinterleave2 supported at present.
17593 if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
17594 return false;
17595
17596 SmallVector<Instruction *, 4> DeinterleavedValues;
17597 SmallVector<Instruction *, 8> DeInterleaveDeadInsts;
17598
17599 if (!getDeinterleavedValues(DI, DeinterleavedValues, DeInterleaveDeadInsts)) {
17600 LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
17601 return false;
17602 }
17603 unsigned Factor = DeinterleavedValues.size();
17604 assert((Factor == 2 || Factor == 4) &&
17605 "Currently supported Factor is 2 or 4 only");
17606 VectorType *VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
17607
17608 const DataLayout &DL = DI->getModule()->getDataLayout();
17609 bool UseScalable;
17610 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17611 return false;
17612
17613 // TODO: Add support for using SVE instructions with fixed types later, using
17614 // the code from lowerInterleavedLoad to obtain the correct container type.
17615 if (UseScalable && !VTy->isScalableTy())
17616 return false;
17617
17618 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17619 VectorType *LdTy =
17620 VectorType::get(VTy->getElementType(),
17621 VTy->getElementCount().divideCoefficientBy(NumLoads));
17622
17623 Type *PtrTy = LI->getPointerOperandType();
17624 Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor,
17625 UseScalable, LdTy, PtrTy);
17626
17627 IRBuilder<> Builder(LI);
17628 Value *Pred = nullptr;
17629 if (UseScalable)
17630 Pred =
17631 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
17632
17633 Value *BaseAddr = LI->getPointerOperand();
17634 if (NumLoads > 1) {
17635 // Create multiple legal small ldN.
17636 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
17637 for (unsigned I = 0; I < NumLoads; ++I) {
17638 Value *Offset = Builder.getInt64(I * Factor);
17639
17640 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
17641 Value *LdN = nullptr;
17642 if (UseScalable)
17643 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
17644 else
17645 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
17646 Value *Idx =
17647 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
17648 for (unsigned J = 0; J < Factor; ++J) {
17649 ExtractedLdValues[J] = Builder.CreateInsertVector(
17650 VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
17651 }
17652 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
17653 }
17654 // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
17655 for (unsigned J = 0; J < Factor; ++J)
17656 DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
17657 } else {
17658 Value *Result;
17659 if (UseScalable)
17660 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
17661 else
17662 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17663 // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
17664 for (unsigned I = 0; I < Factor; I++) {
17665 Value *NewExtract = Builder.CreateExtractValue(Result, I);
17666 DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
17667 }
17668 }
17669 DeadInsts.insert(DeadInsts.end(), DeInterleaveDeadInsts.begin(),
17670 DeInterleaveDeadInsts.end());
17671 return true;
17672}
17673
17674/*
17675InterleaveIntrinsic tree.
17676 A C B D
17677 \ / \ /
17678 [II] [II]
17679 \ /
17680 [II]
17681
17682values in correct order of interleave4: A B C D.
17683Returns true if `II` is the root of an IR tree that represents a theoretical
17684vector.interleave4 intrinsic. When true is returned, \p `InterleavedValues`
17685vector is populated with the inputs such an intrinsic would take: (i.e.
17686vector.interleave4(A, B, C, D)).
17687*/
17688static bool getValuesToInterleave(
17689 Value *II, SmallVectorImpl<Value *> &InterleavedValues,
17690 SmallVectorImpl<Instruction *> &InterleaveDeadInsts) {
17691 Value *A, *B, *C, *D;
17692 // Try to match interleave of Factor 4
17693 if (match(II, m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
17694 m_Interleave2(m_Value(B), m_Value(D))))) {
17695 InterleavedValues.push_back(A);
17696 InterleavedValues.push_back(B);
17697 InterleavedValues.push_back(C);
17698 InterleavedValues.push_back(D);
17699 // intermediate II will not be needed anymore
17700 InterleaveDeadInsts.push_back(
17701 cast<Instruction>(cast<Instruction>(II)->getOperand(0)));
17702 InterleaveDeadInsts.push_back(
17703 cast<Instruction>(cast<Instruction>(II)->getOperand(1)));
17704 return true;
17705 }
17706
17707 // Try to match interleave of Factor 2
17708 if (match(II, m_Interleave2(m_Value(A), m_Value(B)))) {
17709 InterleavedValues.push_back(A);
17710 InterleavedValues.push_back(B);
17711 return true;
17712 }
17713
17714 return false;
17715}
17716
17717bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
17718 IntrinsicInst *II, StoreInst *SI,
17719 SmallVectorImpl<Instruction *> &DeadInsts) const {
17720 // Only interleave2 supported at present.
17721 if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
17722 return false;
17723
17724 SmallVector<Value *, 4> InterleavedValues;
17725 SmallVector<Instruction *, 2> InterleaveDeadInsts;
17726 if (!getValuesToInterleave(II, InterleavedValues, InterleaveDeadInsts)) {
17727 LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
17728 return false;
17729 }
17730 unsigned Factor = InterleavedValues.size();
17731 assert((Factor == 2 || Factor == 4) &&
17732 "Currently supported Factor is 2 or 4 only");
17733 VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
17734 const DataLayout &DL = II->getModule()->getDataLayout();
17735
17736 bool UseScalable;
17737 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17738 return false;
17739
17740 // TODO: Add support for using SVE instructions with fixed types later, using
17741 // the code from lowerInterleavedStore to obtain the correct container type.
17742 if (UseScalable && !VTy->isScalableTy())
17743 return false;
17744
17745 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
17746
17747 VectorType *StTy =
17748 VectorType::get(VTy->getElementType(),
17749 VTy->getElementCount().divideCoefficientBy(NumStores));
17750
17751 Type *PtrTy = SI->getPointerOperandType();
17752 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
17753 UseScalable, StTy, PtrTy);
17754
17755 IRBuilder<> Builder(SI);
17756
17757 Value *BaseAddr = SI->getPointerOperand();
17758 Value *Pred = nullptr;
17759
17760 if (UseScalable)
17761 Pred =
17762 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
17763
17764 auto ExtractedValues = InterleavedValues;
17765 if (UseScalable)
17766 InterleavedValues.push_back(Pred);
17767 InterleavedValues.push_back(BaseAddr);
17768 for (unsigned I = 0; I < NumStores; ++I) {
17769 Value *Address = BaseAddr;
17770 if (NumStores > 1) {
17771 Value *Offset = Builder.getInt64(I * Factor);
17772 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
17773 Value *Idx =
17774 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
17775 for (unsigned J = 0; J < Factor; J++) {
17776 InterleavedValues[J] =
17777 Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
17778 }
17779 // update the address
17780 InterleavedValues[InterleavedValues.size() - 1] = Address;
17781 }
17782 Builder.CreateCall(StNFunc, InterleavedValues);
17783 }
17784 DeadInsts.insert(DeadInsts.end(), InterleaveDeadInsts.begin(),
17785 InterleaveDeadInsts.end());
17786 return true;
17787}
17788
17789EVT AArch64TargetLowering::getOptimalMemOpType(
17790 const MemOp &Op, const AttributeList &FuncAttributes) const {
17791 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
17792 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17793 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
17794 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
17795 // taken one instruction to materialize the v2i64 zero and one store (with
17796 // restrictive addressing mode). Just do i64 stores.
17797 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17798 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17799 if (Op.isAligned(AlignCheck))
17800 return true;
17801 unsigned Fast;
17802 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
17803 MachineMemOperand::MONone, &Fast) &&
17804 Fast;
17805 };
17806
17807 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17808 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
17809 return MVT::v16i8;
17810 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17811 return MVT::f128;
17812 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17813 return MVT::i64;
17814 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17815 return MVT::i32;
17816 return MVT::Other;
17817}
17818
17819LLT AArch64TargetLowering::getOptimalMemOpLLT(
17820 const MemOp &Op, const AttributeList &FuncAttributes) const {
17821 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
17822 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17823 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
17824 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
17825 // taken one instruction to materialize the v2i64 zero and one store (with
17826 // restrictive addressing mode). Just do i64 stores.
17827 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17828 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17829 if (Op.isAligned(AlignCheck))
17830 return true;
17831 unsigned Fast;
17832 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
17833 MachineMemOperand::MONone, &Fast) &&
17834 Fast;
17835 };
17836
17837 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17838 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
17839 return LLT::fixed_vector(2, 64);
17840 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17841 return LLT::scalar(128);
17842 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17843 return LLT::scalar(64);
17844 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17845 return LLT::scalar(32);
17846 return LLT();
17847}
17848
17849// 12-bit optionally shifted immediates are legal for adds.
17850bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
17851 if (Immed == std::numeric_limits<int64_t>::min()) {
17852 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
17853 << ": avoid UB for INT64_MIN\n");
17854 return false;
17855 }
17856 // Same encoding for add/sub, just flip the sign.
17857 Immed = std::abs(Immed);
17858 bool IsLegal = ((Immed >> 12) == 0 ||
17859 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
17860 LLVM_DEBUG(dbgs() << "Is " << Immed
17861 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
17862 return IsLegal;
17863}
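// A minimal standalone sketch (not part of the original source; the helper
// name is illustrative only) of the encoding rule above, applied to the
// already-made-positive value: an add/sub immediate is either a plain 12-bit
// value or a 12-bit value shifted left by 12. E.g. 4095, 4096 and 0xfff000
// are legal; 4097 and 0x1000001 are not.
static bool exampleIsAddSubImmEncodable(uint64_t AbsImmed) {
  return (AbsImmed >> 12) == 0 ||
         ((AbsImmed & 0xfff) == 0 && (AbsImmed >> 24) == 0);
}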
17864
17865bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
17866 // We will only emit addvl/inc* instructions for SVE2
17867 if (!Subtarget->hasSVE2())
17868 return false;
17869
17870 // addvl's immediates are in terms of the number of bytes in a register.
17871 // Since there are 16 in the base supported size (128bits), we need to
17872 // divide the immediate by that much to give us a useful immediate to
17873 // multiply by vscale. We can't have a remainder as a result of this.
17874 if (Imm % 16 == 0)
17875 return isInt<6>(Imm / 16);
17876
17877 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
17878 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
17879 // of addvl as a result, so only take h|w|d into account.
17880 // Dec[h|w|d] will cover subtractions.
17881 // Immediates are in the range [1,16], so we can't do a 2's complement check.
17882 // FIXME: Can we make use of other patterns to cover other immediates?
17883
17884 // inch|dech
17885 if (Imm % 8 == 0)
17886 return std::abs(Imm / 8) <= 16;
17887 // incw|decw
17888 if (Imm % 4 == 0)
17889 return std::abs(Imm / 4) <= 16;
17890 // incd|decd
17891 if (Imm % 2 == 0)
17892 return std::abs(Imm / 2) <= 16;
17893
17894 return false;
17895}
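// Worked examples (annotation, not in the original source): Imm == 32 is legal
// as addvl #2 (32 / 16 == 2 fits in a signed 6-bit field); Imm == 24 is legal
// via inch (24 / 8 == 3 <= 16); Imm == 10 is legal via incd (10 / 2 == 5 <= 16);
// Imm == 7 matches none of the multiples and is rejected.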
17896
17897// Return false to prevent folding
17898// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
17899// if the folding leads to worse code.
17900bool AArch64TargetLowering::isMulAddWithConstProfitable(
17901 SDValue AddNode, SDValue ConstNode) const {
17902 // Let the DAGCombiner decide for vector types and large types.
17903 const EVT VT = AddNode.getValueType();
17904 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
17905 return true;
17906
17907 // It is worse if c1 is a legal add immediate while c1*c2 is not
17908 // and has to be materialized with at least two instructions.
17909 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
17910 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
17911 const int64_t C1 = C1Node->getSExtValue();
17912 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
17913 if (isLegalAddImmediate(C1C2.getSExtValue()))
17914 return true;
17915 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
17916 // Adapt to the width of a register.
17917 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
17918 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
17919 if (Insn.size() > 1)
17920 return false;
17921
17922 // Default to true and let the DAGCombiner decide.
17923 return true;
17924}
17925
17926// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
17927// immediates is the same as for an add or a sub.
17928bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
17929 return isLegalAddImmediate(Immed);
17930}
17931
17932/// isLegalAddressingMode - Return true if the addressing mode represented
17933/// by AM is legal for this target, for a load/store of the specified type.
17934bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
17935 const AddrMode &AMode, Type *Ty,
17936 unsigned AS, Instruction *I) const {
17937 // AArch64 has five basic addressing modes:
17938 // reg
17939 // reg + 9-bit signed offset
17940 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
17941 // reg1 + reg2
17942 // reg + SIZE_IN_BYTES * reg
17943
17944 // No global is ever allowed as a base.
17945 if (AMode.BaseGV)
17946 return false;
17947
17948 // No reg+reg+imm addressing.
17949 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
17950 return false;
17951
17952 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
17953 // `2*ScaledReg` into `BaseReg + ScaledReg`
17954 AddrMode AM = AMode;
17955 if (AM.Scale && !AM.HasBaseReg) {
17956 if (AM.Scale == 1) {
17957 AM.HasBaseReg = true;
17958 AM.Scale = 0;
17959 } else if (AM.Scale == 2) {
17960 AM.HasBaseReg = true;
17961 AM.Scale = 1;
17962 } else {
17963 return false;
17964 }
17965 }
17966
17967 // A base register is required in all addressing modes.
17968 if (!AM.HasBaseReg)
17969 return false;
17970
17971 if (Ty->isScalableTy()) {
17972 if (isa<ScalableVectorType>(Ty)) {
17973 // See if we have a foldable vscale-based offset, for vector types which
17974 // are either legal or smaller than the minimum; more work will be
17975 // required if we need to consider addressing for types which need
17976 // legalization by splitting.
17977 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
17978 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
17979 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
17980 isPowerOf2_64(VecNumBytes))
17981 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
17982
17983 uint64_t VecElemNumBytes =
17984 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
17985 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
17986 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
17987 }
17988
17989 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
17990 }
17991
17992 // No scalable offsets allowed for non-scalable types.
17993 if (AM.ScalableOffset)
17994 return false;
17995
17996 // check reg + imm case:
17997 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
17998 uint64_t NumBytes = 0;
17999 if (Ty->isSized()) {
18000 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
18001 NumBytes = NumBits / 8;
18002 if (!isPowerOf2_64(NumBits))
18003 NumBytes = 0;
18004 }
18005
18006 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
18007 AM.Scale);
18008}
18009
18010// Check whether the 2 offsets belong to the same imm24 range and their high
18011// 12 bits are the same; if so, the high part can be folded into the offset of an add.
18012int64_t
18013AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
18014 int64_t MaxOffset) const {
18015 int64_t HighPart = MinOffset & ~0xfffULL;
18016 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
18017 // Rebase the value to an integer multiple of imm12.
18018 return HighPart;
18019 }
18020
18021 return 0;
18022}
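// A worked example (annotation, not in the original source): for MinOffset ==
// 4100 and MaxOffset == 4200, both offsets share the high part 4096
// (4100 >> 12 == 4200 >> 12 == 1) and 4096 is a legal add immediate, so 4096
// is returned and the accesses are rebased to offsets 4 and 104.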
18023
18024bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
18025 // Consider splitting large offset of struct or array.
18026 return true;
18027}
18028
18029bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
18030 const MachineFunction &MF, EVT VT) const {
18031 VT = VT.getScalarType();
18032
18033 if (!VT.isSimple())
18034 return false;
18035
18036 switch (VT.getSimpleVT().SimpleTy) {
18037 case MVT::f16:
18038 return Subtarget->hasFullFP16();
18039 case MVT::f32:
18040 case MVT::f64:
18041 return true;
18042 default:
18043 break;
18044 }
18045
18046 return false;
18047}
18048
18049bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
18050 Type *Ty) const {
18051 switch (Ty->getScalarType()->getTypeID()) {
18052 case Type::FloatTyID:
18053 case Type::DoubleTyID:
18054 return true;
18055 default:
18056 return false;
18057 }
18058}
18059
18060bool AArch64TargetLowering::generateFMAsInMachineCombiner(
18061 EVT VT, CodeGenOptLevel OptLevel) const {
18062 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
18063 !useSVEForFixedLengthVectorVT(VT);
18064}
18065
18066const MCPhysReg *
18067AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
18068 // LR is a callee-save register, but we must treat it as clobbered by any call
18069 // site. Hence we include LR in the scratch registers, which are in turn added
18070 // as implicit-defs for stackmaps and patchpoints.
18071 static const MCPhysReg ScratchRegs[] = {
18072 AArch64::X16, AArch64::X17, AArch64::LR, 0
18073 };
18074 return ScratchRegs;
18075}
18076
18077ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
18078 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
18079 return RCRegs;
18080}
18081
18082bool
18083AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
18084 CombineLevel Level) const {
18085 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
18086 N->getOpcode() == ISD::SRL) &&
18087 "Expected shift op");
18088
18089 SDValue ShiftLHS = N->getOperand(0);
18090 EVT VT = N->getValueType(0);
18091
18092 if (!ShiftLHS->hasOneUse())
18093 return false;
18094
18095 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
18096 !ShiftLHS.getOperand(0)->hasOneUse())
18097 return false;
18098
18099 // If ShiftLHS is an unsigned bit extraction ((x >> C) & mask), do not
18100 // combine it with shift 'N', so that it can be lowered to UBFX, except for:
18101 // ((x >> C) & mask) << C.
18102 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
18103 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
18104 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
18105 if (isMask_64(TruncMask)) {
18106 SDValue AndLHS = ShiftLHS.getOperand(0);
18107 if (AndLHS.getOpcode() == ISD::SRL) {
18108 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
18109 if (N->getOpcode() == ISD::SHL)
18110 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
18111 return SRLC->getZExtValue() == SHLC->getZExtValue();
18112 return false;
18113 }
18114 }
18115 }
18116 }
18117 return true;
18118}
18119
18120bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
18121 const SDNode *N) const {
18122 assert(N->getOpcode() == ISD::XOR &&
18123 (N->getOperand(0).getOpcode() == ISD::SHL ||
18124 N->getOperand(0).getOpcode() == ISD::SRL) &&
18125 "Expected XOR(SHIFT) pattern");
18126
18127 // Only commute if the entire NOT mask is a hidden shifted mask.
18128 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
18129 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18130 if (XorC && ShiftC) {
18131 unsigned MaskIdx, MaskLen;
18132 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
18133 unsigned ShiftAmt = ShiftC->getZExtValue();
18134 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
18135 if (N->getOperand(0).getOpcode() == ISD::SHL)
18136 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
18137 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
18138 }
18139 }
18140
18141 return false;
18142}
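// A worked example (annotation, not in the original source): for i32
// (xor (shl x, 8), 0xFFFFFF00), the NOT mask is a shifted mask with
// MaskIdx == 8 and MaskLen == 24, matching ShiftAmt == 8 and
// BitWidth - ShiftAmt == 24, so commuting the xor with the shift is allowed.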
18143
18144bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
18145 const SDNode *N, CombineLevel Level) const {
18146 assert(((N->getOpcode() == ISD::SHL &&
18147 N->getOperand(0).getOpcode() == ISD::SRL) ||
18148 (N->getOpcode() == ISD::SRL &&
18149 N->getOperand(0).getOpcode() == ISD::SHL)) &&
18150 "Expected shift-shift mask");
18151 // Don't allow multiuse shift folding with the same shift amount.
18152 if (!N->getOperand(0)->hasOneUse())
18153 return false;
18154
18155 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
18156 EVT VT = N->getValueType(0);
18157 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
18158 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18159 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
18160 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
18161 }
18162
18163 // We do not need to fold when this shifting used in specific load case:
18164 // (ldr x, (add x, (shl (srl x, c1) 2)))
18165 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
18166 if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
18167 unsigned ShlAmt = C2->getZExtValue();
18168 if (auto ShouldADD = *N->user_begin();
18169 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
18170 if (auto ShouldLOAD = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
18171 unsigned ByteVT = ShouldLOAD->getMemoryVT().getSizeInBits() / 8;
18172 if ((1ULL << ShlAmt) == ByteVT &&
18173 isIndexedLoadLegal(ISD::PRE_INC, ShouldLOAD->getMemoryVT()))
18174 return false;
18175 }
18176 }
18177 }
18178 }
18179
18180 return true;
18181}
18182
18183bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
18184 unsigned BinOpcode, EVT VT) const {
18185 return VT.isScalableVector() && isTypeLegal(VT);
18186}
18187
18188bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
18189 Type *Ty) const {
18190 assert(Ty->isIntegerTy());
18191
18192 unsigned BitSize = Ty->getPrimitiveSizeInBits();
18193 if (BitSize == 0)
18194 return false;
18195
18196 int64_t Val = Imm.getSExtValue();
18197 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
18198 return true;
18199
18200 if ((int64_t)Val < 0)
18201 Val = ~Val;
18202 if (BitSize == 32)
18203 Val &= (1LL << 32) - 1;
18204
18205 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
18206 // MOVZ is free, so accept immediates needing at most two extra MOVKs.
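 // Worked example (illustrative): a value whose highest set bit is bit 44
 // gives Shift == 44 / 16 == 2, i.e. MOVZ plus at most two MOVKs, which is
 // accepted; a value needing a fourth 16-bit chunk (Shift == 3) is rejected.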
18207 return Shift < 3;
18208}
18209
18210bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
18211 unsigned Index) const {
18212 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
18213 return false;
18214
18215 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
18216}
18217
18218/// Turn vector tests of the signbit in the form of:
18219/// xor (sra X, elt_size(X)-1), -1
18220/// into:
18221/// cmge X, X, #0
18222static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
18223 const AArch64Subtarget *Subtarget) {
18224 EVT VT = N->getValueType(0);
18225 if (!Subtarget->hasNEON() || !VT.isVector())
18226 return SDValue();
18227
18228 // There must be a shift right algebraic before the xor, and the xor must be a
18229 // 'not' operation.
18230 SDValue Shift = N->getOperand(0);
18231 SDValue Ones = N->getOperand(1);
18232 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
18233 !ISD::isBuildVectorAllOnes(Ones.getNode()))
18234 return SDValue();
18235
18236 // The shift should be smearing the sign bit across each vector element.
18237 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
18238 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
18239 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
18240 return SDValue();
18241
18242 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
18243}
18244
18245// Given a vecreduce_add node, detect the below pattern and convert it to the
18246// node sequence with UABDL, [S|U]ABD and UADDLP.
18247//
18248// i32 vecreduce_add(
18249// v16i32 abs(
18250// v16i32 sub(
18251// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
18252// =================>
18253// i32 vecreduce_add(
18254// v4i32 UADDLP(
18255// v8i16 add(
18256// v8i16 zext(
18257// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
18258// v8i16 zext(
18259// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
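//
// Illustrative mapping (an assumption about the final selection, not taken
// from this file): the [S|U]ABD + zero_extend pairs typically select to
// UABDL/SABDL (or the accumulating UABAL form), UADDLP pairwise-widens the
// v8i16 sums into v4i32, and the remaining i32 vecreduce_add becomes ADDV.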
18260static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
18261 SelectionDAG &DAG) {
18262 // Assumed i32 vecreduce_add
18263 if (N->getValueType(0) != MVT::i32)
18264 return SDValue();
18265
18266 SDValue VecReduceOp0 = N->getOperand(0);
18267 unsigned Opcode = VecReduceOp0.getOpcode();
18268 // Assumed v16i32 abs
18269 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
18270 return SDValue();
18271
18272 SDValue ABS = VecReduceOp0;
18273 // Assumed v16i32 sub
18274 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
18275 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
18276 return SDValue();
18277
18278 SDValue SUB = ABS->getOperand(0);
18279 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
18280 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
18281 // Assumed v16i32 type
18282 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
18283 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
18284 return SDValue();
18285
18286 // Assumed zext or sext
18287 bool IsZExt = false;
18288 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
18289 IsZExt = true;
18290 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
18291 IsZExt = false;
18292 } else
18293 return SDValue();
18294
18295 SDValue EXT0 = SUB->getOperand(0);
18296 SDValue EXT1 = SUB->getOperand(1);
18297 // Assumed zext's operand has v16i8 type
18298 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
18299 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
18300 return SDValue();
18301
18302 // Pattern is detected. Let's convert it to a sequence of nodes.
18303 SDLoc DL(N);
18304
18305 // First, create the node pattern of UABD/SABD.
18306 SDValue UABDHigh8Op0 =
18307 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18308 DAG.getConstant(8, DL, MVT::i64));
18309 SDValue UABDHigh8Op1 =
18310 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18311 DAG.getConstant(8, DL, MVT::i64));
18312 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18313 UABDHigh8Op0, UABDHigh8Op1);
18314 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
18315
18316 // Second, create the node pattern of UABAL.
18317 SDValue UABDLo8Op0 =
18318 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18319 DAG.getConstant(0, DL, MVT::i64));
18320 SDValue UABDLo8Op1 =
18321 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18322 DAG.getConstant(0, DL, MVT::i64));
18323 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18324 UABDLo8Op0, UABDLo8Op1);
18325 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
18326 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
18327
18328 // Third, create the node of UADDLP.
18329 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
18330
18331 // Fourth, create the node of VECREDUCE_ADD.
18332 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
18333}
18334
18335// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and a vecreduce:
18336// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
18337// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
18338// If we have vectors larger than v16i8 we extract v16i8 vectors,
18339// follow the same steps above to get DOT instructions, concatenate them,
18340// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
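//
// Illustrative example (not from the source): i32 vecreduce_add(zext v16i8 %a
// to v16i32) becomes vecreduce_add(UDOT(zero, %a, splat(1))), and a v32i8
// input is split into two v16i8 UDOTs whose v4i32 results are concatenated
// before the final reduction.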
18341static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
18342 const AArch64Subtarget *ST) {
18343 if (!ST->isNeonAvailable())
18344 return SDValue();
18345
18346 if (!ST->hasDotProd())
18347 return performVecReduceAddCombineWithUADDLP(N, DAG);
18348
18349 SDValue Op0 = N->getOperand(0);
18350 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
18351 Op0.getValueType().getVectorElementType() != MVT::i32)
18352 return SDValue();
18353
18354 unsigned ExtOpcode = Op0.getOpcode();
18355 SDValue A = Op0;
18356 SDValue B;
18357 unsigned DotOpcode;
18358 if (ExtOpcode == ISD::MUL) {
18359 A = Op0.getOperand(0);
18360 B = Op0.getOperand(1);
18361 if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
18362 return SDValue();
18363 auto OpCodeA = A.getOpcode();
18364 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
18365 return SDValue();
18366
18367 auto OpCodeB = B.getOpcode();
18368 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
18369 return SDValue();
18370
18371 if (OpCodeA == OpCodeB) {
18372 DotOpcode =
18373 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
18374 } else {
18375 // Check USDOT support.
18376 if (!ST->hasMatMulInt8())
18377 return SDValue();
18378 DotOpcode = AArch64ISD::USDOT;
18379 if (OpCodeA == ISD::SIGN_EXTEND)
18380 std::swap(A, B);
18381 }
18382 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
18383 DotOpcode = AArch64ISD::UDOT;
18384 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
18385 DotOpcode = AArch64ISD::SDOT;
18386 } else {
18387 return SDValue();
18388 }
18389
18390 EVT Op0VT = A.getOperand(0).getValueType();
18391 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
18392 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
18393 if (!IsValidElementCount || !IsValidSize)
18394 return SDValue();
18395
18396 SDLoc DL(Op0);
18397 // For non-mla reductions B can be set to 1. For MLA we take the operand of
18398 // the extend B.
18399 if (!B)
18400 B = DAG.getConstant(1, DL, Op0VT);
18401 else
18402 B = B.getOperand(0);
18403
18404 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
18405 unsigned NumOfVecReduce;
18406 EVT TargetType;
18407 if (IsMultipleOf16) {
18408 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
18409 TargetType = MVT::v4i32;
18410 } else {
18411 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
18412 TargetType = MVT::v2i32;
18413 }
18414 // Handle the case where we need to generate only one Dot operation.
18415 if (NumOfVecReduce == 1) {
18416 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
18417 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
18418 A.getOperand(0), B);
18419 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
18420 }
18421 // Generate Dot instructions that are multiple of 16.
18422 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
18423 SmallVector<SDValue, 4> SDotVec16;
18424 unsigned I = 0;
18425 for (; I < VecReduce16Num; I += 1) {
18426 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
18427 SDValue Op0 =
18428 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
18429 DAG.getConstant(I * 16, DL, MVT::i64));
18430 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
18431 DAG.getConstant(I * 16, DL, MVT::i64));
18432 SDValue Dot =
18433 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
18434 SDotVec16.push_back(Dot);
18435 }
18436 // Concatenate dot operations.
18437 EVT SDot16EVT =
18438 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
18439 SDValue ConcatSDot16 =
18440 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
18441 SDValue VecReduceAdd16 =
18442 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
18443 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
18444 if (VecReduce8Num == 0)
18445 return VecReduceAdd16;
18446
18447 // Generate the remainder Dot operation that is multiple of 8.
18448 SmallVector<SDValue, 4> SDotVec8;
18449 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
18450 SDValue Vec8Op0 =
18451 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
18452 DAG.getConstant(I * 16, DL, MVT::i64));
18453 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
18454 DAG.getConstant(I * 16, DL, MVT::i64));
18455 SDValue Dot =
18456 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
18457 SDValue VecReduceAdd8 =
18458 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
18459 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
18460 VecReduceAdd8);
18461}
18462
18463// Given an (integer) vecreduce, we know the order of the inputs does not
18464// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
18465// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
18466// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
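//
// Illustrative example (not from the source): with %x : v16i8,
//   UADDV(add(zext(extract_subvector %x, 0) to v8i16,
//             zext(extract_subvector %x, 8) to v8i16))
// becomes UADDV(UADDLP(%x)), i.e. a single uaddlp feeding the reduction.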
18467static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
18468 auto DetectAddExtract = [&](SDValue A) {
18469 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
18470 // UADDLP(x) if found.
18471 assert(A.getOpcode() == ISD::ADD);
18472 EVT VT = A.getValueType();
18473 SDValue Op0 = A.getOperand(0);
18474 SDValue Op1 = A.getOperand(1);
18475 if (Op0.getOpcode() != Op1.getOpcode() ||
18476 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
18477 Op0.getOpcode() != ISD::SIGN_EXTEND))
18478 return SDValue();
18479 SDValue Ext0 = Op0.getOperand(0);
18480 SDValue Ext1 = Op1.getOperand(0);
18481 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18482 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18483 Ext0.getOperand(0) != Ext1.getOperand(0))
18484 return SDValue();
18485 // Check that the source type is twice the width of the add type, and that
18486 // the extracts are from the upper/lower halves of the same source.
18487 if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
18488 VT.getVectorNumElements() * 2)
18489 return SDValue();
18490 if ((Ext0.getConstantOperandVal(1) != 0 ||
18491 Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
18492 (Ext1.getConstantOperandVal(1) != 0 ||
18493 Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
18494 return SDValue();
18495 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
18496 : AArch64ISD::SADDLP;
18497 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
18498 };
18499
18500 if (SDValue R = DetectAddExtract(A))
18501 return R;
18502
18503 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
18504 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
18505 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
18506 A.getOperand(1));
18507 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
18508 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
18509 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
18510 A.getOperand(0));
18511 return SDValue();
18512}
18513
18514// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
18515// UADDLV(concat), where the concat represents the 64-bit zext sources.
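//
// Illustrative example (not from the source): with %a, %b : v4i16,
// UADDV(v4i32 add(zext %a, zext %b)) becomes UADDLV(v8i16 concat(%a, %b)),
// which a single "uaddlv s0, v0.8h" can compute.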
18516static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
18517 // Look for add(zext(64-bit source), zext(64-bit source)), returning
18518 // UADDLV(concat(zext, zext)) if found.
18519 assert(A.getOpcode() == ISD::ADD);
18520 EVT VT = A.getValueType();
18521 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
18522 return SDValue();
18523 SDValue Op0 = A.getOperand(0);
18524 SDValue Op1 = A.getOperand(1);
18525 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
18526 return SDValue();
18527 SDValue Ext0 = Op0.getOperand(0);
18528 SDValue Ext1 = Op1.getOperand(0);
18529 EVT ExtVT0 = Ext0.getValueType();
18530 EVT ExtVT1 = Ext1.getValueType();
18531 // Check zext VTs are the same and 64-bit length.
18532 if (ExtVT0 != ExtVT1 ||
18533 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
18534 return SDValue();
18535 // Get VT for concat of zext sources.
18536 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
18537 SDValue Concat =
18538 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
18539
18540 switch (VT.getSimpleVT().SimpleTy) {
18541 case MVT::v2i64:
18542 case MVT::v4i32:
18543 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
18544 case MVT::v8i16: {
18545 SDValue Uaddlv =
18546 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
18547 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
18548 }
18549 default:
18550 llvm_unreachable("Unhandled vector type");
18551 }
18552}
18553
18554static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
18555 SDValue A = N->getOperand(0);
18556 if (A.getOpcode() == ISD::ADD) {
18557 if (SDValue R = performUADDVAddCombine(A, DAG))
18558 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
18559 else if (SDValue R = performUADDVZextCombine(A, DAG))
18560 return R;
18561 }
18562 return SDValue();
18563}
18564
18565static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
18566 TargetLowering::DAGCombinerInfo &DCI,
18567 const AArch64Subtarget *Subtarget) {
18568 if (DCI.isBeforeLegalizeOps())
18569 return SDValue();
18570
18571 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
18572}
18573
18574SDValue
18575AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
18576 SelectionDAG &DAG,
18577 SmallVectorImpl<SDNode *> &Created) const {
18578 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
18579 if (isIntDivCheap(N->getValueType(0), Attr))
18580 return SDValue(N, 0); // Lower SDIV as SDIV
18581
18582 EVT VT = N->getValueType(0);
18583
18584 // For scalable and fixed types, mark them as cheap so we can handle them
18585 // much later. This allows us to handle larger-than-legal types.
18586 if (VT.isScalableVector() ||
18587 (VT.isFixedLengthVector() && Subtarget->useSVEForFixedLengthVectors()))
18588 return SDValue(N, 0);
18589
18590 // fold (sdiv X, pow2)
18591 if ((VT != MVT::i32 && VT != MVT::i64) ||
18592 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18593 return SDValue();
18594
18595 // If the divisor is 2 or -2, the default expansion is better. It will add
18596 // the sign adjustment (X >> (BitWidth - 1)) to the dividend before shifting right.
18597 if (Divisor == 2 ||
18598 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
18599 return SDValue();
18600
18601 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
18602}
18603
18604SDValue
18605AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
18606 SelectionDAG &DAG,
18607 SmallVectorImpl<SDNode *> &Created) const {
18608 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
18609 if (isIntDivCheap(N->getValueType(0), Attr))
18610 return SDValue(N, 0); // Lower SREM as SREM
18611
18612 EVT VT = N->getValueType(0);
18613
18614 // For scalable and fixed types, mark them as cheap so we can handle them
18615 // much later. This allows us to handle larger-than-legal types.
18616 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
18617 return SDValue(N, 0);
18618
18619 // fold (srem X, pow2)
18620 if ((VT != MVT::i32 && VT != MVT::i64) ||
18621 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18622 return SDValue();
18623
18624 unsigned Lg2 = Divisor.countr_zero();
18625 if (Lg2 == 0)
18626 return SDValue();
18627
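 // The sequence built below computes X % (1 << Lg2) as X & (2^Lg2 - 1) when X
 // is non-negative and as -((-X) & (2^Lg2 - 1)) otherwise, selected with CSNEG
 // on the flags of the compare/SUBS. Rough sketch (an assumption about the
 // selected code, not from this file) for "srem i32 %x, 8":
 //   negs w8, w0 ; and w9, w0, #7 ; and w8, w8, #7 ; csneg w0, w9, w8, mi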
18628 SDLoc DL(N);
18629 SDValue N0 = N->getOperand(0);
18630 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
18631 SDValue Zero = DAG.getConstant(0, DL, VT);
18632 SDValue CCVal, CSNeg;
18633 if (Lg2 == 1) {
18634 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
18635 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
18636 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
18637
18638 Created.push_back(Cmp.getNode());
18639 Created.push_back(And.getNode());
18640 } else {
18641 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
18642 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18643
18644 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
18645 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
18646 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
18647 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
18648 Negs.getValue(1));
18649
18650 Created.push_back(Negs.getNode());
18651 Created.push_back(AndPos.getNode());
18652 Created.push_back(AndNeg.getNode());
18653 }
18654
18655 return CSNeg;
18656}
18657
18658static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
18659 switch(getIntrinsicID(S.getNode())) {
18660 default:
18661 break;
18662 case Intrinsic::aarch64_sve_cntb:
18663 return 8;
18664 case Intrinsic::aarch64_sve_cnth:
18665 return 16;
18666 case Intrinsic::aarch64_sve_cntw:
18667 return 32;
18668 case Intrinsic::aarch64_sve_cntd:
18669 return 64;
18670 }
18671 return {};
18672}
18673
18674/// Calculates what the pre-extend type is, based on the extension
18675/// operation node provided by \p Extend.
18676///
18677/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
18678/// pre-extend type is pulled directly from the operand, while other extend
18679/// operations need a bit more inspection to get this information.
18680///
18681/// \param Extend The SDNode from the DAG that represents the extend operation
18682///
18683/// \returns The type representing the \p Extend source type, or \p MVT::Other
18684/// if no valid type can be determined
18686 switch (Extend.getOpcode()) {
18687 case ISD::SIGN_EXTEND:
18688 case ISD::ZERO_EXTEND:
18689 case ISD::ANY_EXTEND:
18690 return Extend.getOperand(0).getValueType();
18691 case ISD::AssertSext:
18692 case ISD::AssertZext:
18693 case ISD::SIGN_EXTEND_INREG: {
18694 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
18695 if (!TypeNode)
18696 return MVT::Other;
18697 return TypeNode->getVT();
18698 }
18699 case ISD::AND: {
18700 ConstantSDNode *Constant =
18701 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
18702 if (!Constant)
18703 return MVT::Other;
18704
18705 uint32_t Mask = Constant->getZExtValue();
18706
18707 if (Mask == UCHAR_MAX)
18708 return MVT::i8;
18709 else if (Mask == USHRT_MAX)
18710 return MVT::i16;
18711 else if (Mask == UINT_MAX)
18712 return MVT::i32;
18713
18714 return MVT::Other;
18715 }
18716 default:
18717 return MVT::Other;
18718 }
18719}
18720
18721/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
18722/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
18723/// SExt/ZExt rather than the scalar SExt/ZExt
18724static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
18725 EVT VT = BV.getValueType();
18726 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
18727 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
18728 return SDValue();
18729
18730 // Use the first item in the buildvector/shuffle to get the size of the
18731 // extend, and make sure it looks valid.
18732 SDValue Extend = BV->getOperand(0);
18733 unsigned ExtendOpcode = Extend.getOpcode();
18734 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
18735 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
18736 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
18737 ExtendOpcode == ISD::AssertSext;
18738 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
18739 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
18740 return SDValue();
18741 // Shuffle inputs are vector, limit to SIGN_EXTEND/ZERO_EXTEND/ANY_EXTEND to
18742 // ensure calculatePreExtendType will work without issue.
18743 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
18744 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
18745 return SDValue();
18746
18747 // Restrict valid pre-extend data type
18748 EVT PreExtendType = calculatePreExtendType(Extend);
18749 if (PreExtendType == MVT::Other ||
18750 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
18751 return SDValue();
18752
18753 // Make sure all other operands are equally extended.
18754 bool SeenZExtOrSExt = !IsAnyExt;
18755 for (SDValue Op : drop_begin(BV->ops())) {
18756 if (Op.isUndef())
18757 continue;
18758
18759 if (calculatePreExtendType(Op) != PreExtendType)
18760 return SDValue();
18761
18762 unsigned Opc = Op.getOpcode();
18763 if (Opc == ISD::ANY_EXTEND)
18764 continue;
18765
18766 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
18767 Opc == ISD::AssertSext;
18768
18769 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
18770 return SDValue();
18771
18772 IsSExt = OpcIsSExt;
18773 SeenZExtOrSExt = true;
18774 }
18775
18776 SDValue NBV;
18777 SDLoc DL(BV);
18778 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
18779 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
18780 EVT PreExtendLegalType =
18781 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
18782 SmallVector<SDValue, 8> NewOps;
18783 for (SDValue Op : BV->ops())
18784 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
18785 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
18786 PreExtendLegalType));
18787 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
18788 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
18789 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
18790 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
18791 BV.getOperand(1).isUndef()
18792 ? DAG.getUNDEF(PreExtendVT)
18793 : BV.getOperand(1).getOperand(0),
18794 cast<ShuffleVectorSDNode>(BV)->getMask());
18795 }
18796 unsigned ExtOpc = !SeenZExtOrSExt
18797 ? ISD::ANY_EXTEND
18798 : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
18799 return DAG.getNode(ExtOpc, DL, VT, NBV);
18800}
18801
18802/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
18803/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
18804static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
18805 // If the value type isn't a vector, none of the operands are going to be dups
18806 EVT VT = Mul->getValueType(0);
18807 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
18808 return SDValue();
18809
18810 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
18811 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
18812
18813 // Neither operands have been changed, don't make any further changes
18814 if (!Op0 && !Op1)
18815 return SDValue();
18816
18817 SDLoc DL(Mul);
18818 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
18819 Op1 ? Op1 : Mul->getOperand(1));
18820}
18821
18822// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
18823// Same for other types with equivalent constants.
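//
// Illustrative example (not from the source): with X : v8i16 viewed as v4i32,
// v4i32 mul(and(srl(X, 15), 0x10001), 0xffff) replicates the sign bit of each
// i16 lane across that lane, so it is equivalent to a single
// "cmlt v0.8h, v0.8h, #0" (CMLTz) bitcast back to v4i32.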
18824static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
18825 EVT VT = N->getValueType(0);
18826 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
18827 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
18828 return SDValue();
18829 if (N->getOperand(0).getOpcode() != ISD::AND ||
18830 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
18831 return SDValue();
18832
18833 SDValue And = N->getOperand(0);
18834 SDValue Srl = And.getOperand(0);
18835
18836 APInt V1, V2, V3;
18837 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
18838 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
18839 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
18840 return SDValue();
18841
18842 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
18843 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
18844 V3 != (HalfSize - 1))
18845 return SDValue();
18846
18847 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
18848 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
18849 VT.getVectorElementCount() * 2);
18850
18851 SDLoc DL(N);
18852 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
18853 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
18854 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
18855}
18856
18857// Transform vector add(zext i8 to i32, zext i8 to i32)
18858// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
18859// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
18860// extends.
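//
// Illustrative example (not from the source): v16i32 add(zext v16i8 %a,
// zext v16i8 %b) is rewritten as sign_extend(v16i16 add(zext %a, zext %b));
// the narrow sum never exceeds 9 bits, so the sign extension is equivalent,
// and the i16 add can be selected as uaddl/uaddl2.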
18861static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
18862 EVT VT = N->getValueType(0);
18863 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
18864 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
18865 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
18866 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
18867 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
18868 N->getOperand(0).getOperand(0).getValueType() !=
18869 N->getOperand(1).getOperand(0).getValueType())
18870 return SDValue();
18871
18872 if (N->getOpcode() == ISD::MUL &&
18873 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
18874 return SDValue();
18875
18876 SDValue N0 = N->getOperand(0).getOperand(0);
18877 SDValue N1 = N->getOperand(1).getOperand(0);
18878 EVT InVT = N0.getValueType();
18879
18880 EVT S1 = InVT.getScalarType();
18881 EVT S2 = VT.getScalarType();
18882 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
18883 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
18884 SDLoc DL(N);
18885 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
18888 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
18889 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
18890 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
18891 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
18892 : (unsigned)ISD::SIGN_EXTEND,
18893 DL, VT, NewOp);
18894 }
18895 return SDValue();
18896}
18897
18898static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
18899 TargetLowering::DAGCombinerInfo &DCI,
18900 const AArch64Subtarget *Subtarget) {
18901
18902 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
18903 return Ext;
18904 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
18905 return Ext;
18906 if (SDValue Ext = performVectorExtCombine(N, DAG))
18907 return Ext;
18908
18909 if (DCI.isBeforeLegalizeOps())
18910 return SDValue();
18911
18912 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
18913 // and in MachineCombiner pass, add+mul will be combined into madd.
18914 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
18915 SDLoc DL(N);
18916 EVT VT = N->getValueType(0);
18917 SDValue N0 = N->getOperand(0);
18918 SDValue N1 = N->getOperand(1);
18919 SDValue MulOper;
18920 unsigned AddSubOpc;
18921
18922 auto IsAddSubWith1 = [&](SDValue V) -> bool {
18923 AddSubOpc = V->getOpcode();
18924 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
18925 SDValue Opnd = V->getOperand(1);
18926 MulOper = V->getOperand(0);
18927 if (AddSubOpc == ISD::SUB)
18928 std::swap(Opnd, MulOper);
18929 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
18930 return C->isOne();
18931 }
18932 return false;
18933 };
18934
18935 if (IsAddSubWith1(N0)) {
18936 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
18937 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
18938 }
18939
18940 if (IsAddSubWith1(N1)) {
18941 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
18942 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
18943 }
18944
18945 // The below optimizations require a constant RHS.
18946 if (!isa<ConstantSDNode>(N1))
18947 return SDValue();
18948
18949 ConstantSDNode *C = cast<ConstantSDNode>(N1);
18950 const APInt &ConstValue = C->getAPIntValue();
18951
18952 // Allow the scaling to be folded into the `cnt` instruction by preventing
18953 // the scaling from being obscured here. This makes it easier to pattern match.
18954 if (IsSVECntIntrinsic(N0) ||
18955 (N0->getOpcode() == ISD::TRUNCATE &&
18956 (IsSVECntIntrinsic(N0->getOperand(0)))))
18957 if (ConstValue.sge(1) && ConstValue.sle(16))
18958 return SDValue();
18959
18960 // Multiplication of a power of two plus/minus one can be done more
18961 // cheaply as shift+add/sub. For now, this is true unilaterally. If
18962 // future CPUs have a cheaper MADD instruction, this may need to be
18963 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
18964 // 64-bit is 5 cycles, so this is always a win.
18965 // More aggressively, some multiplications N0 * C can be lowered to
18966 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
18967 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
18968 // TODO: lower more cases.
18969
18970 // TrailingZeroes is used to test if the mul can be lowered to
18971 // shift+add+shift.
18972 unsigned TrailingZeroes = ConstValue.countr_zero();
18973 if (TrailingZeroes) {
18974 // Conservatively do not lower to shift+add+shift if the mul might be
18975 // folded into smul or umul.
18976 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
18977 isZeroExtended(N0, DAG)))
18978 return SDValue();
18979 // Conservatively do not lower to shift+add+shift if the mul might be
18980 // folded into madd or msub.
18981 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
18982 N->user_begin()->getOpcode() == ISD::SUB))
18983 return SDValue();
18984 }
18985 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
18986 // and shift+add+shift.
18987 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
18988 unsigned ShiftAmt;
18989
18990 auto Shl = [&](SDValue N0, unsigned N1) {
18991 if (!N0.getNode())
18992 return SDValue();
18993 // If shift causes overflow, ignore this combine.
18994 if (N1 >= N0.getValueSizeInBits())
18995 return SDValue();
18996 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
18997 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
18998 };
18999 auto Add = [&](SDValue N0, SDValue N1) {
19000 if (!N0.getNode() || !N1.getNode())
19001 return SDValue();
19002 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
19003 };
19004 auto Sub = [&](SDValue N0, SDValue N1) {
19005 if (!N0.getNode() || !N1.getNode())
19006 return SDValue();
19007 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
19008 };
19009 auto Negate = [&](SDValue N) {
19010 if (!N0.getNode())
19011 return SDValue();
19012 SDValue Zero = DAG.getConstant(0, DL, VT);
19013 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
19014 };
19015
19016 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
19017 // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
19018 // the (2^N - 1) factor can't be executed via a single instruction.
19019 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
19020 unsigned BitWidth = C.getBitWidth();
19021 for (unsigned i = 1; i < BitWidth / 2; i++) {
19022 APInt Rem;
19023 APInt X(BitWidth, (1 << i) + 1);
19024 APInt::sdivrem(C, X, N, Rem);
19025 APInt NVMinus1 = N - 1;
19026 if (Rem == 0 && NVMinus1.isPowerOf2()) {
19027 M = X;
19028 return true;
19029 }
19030 }
19031 return false;
19032 };
19033
19034 // Can the const C be decomposed into (2^M + 1) * 2^N + 1, e.g.:
19035 // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1 as
19036 // the (2^N - 1) factor can't be executed via a single instruction.
19037 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
19038 APInt CVMinus1 = C - 1;
19039 if (CVMinus1.isNegative())
19040 return false;
19041 unsigned TrailingZeroes = CVMinus1.countr_zero();
19042 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
19043 if (SCVMinus1.isPowerOf2()) {
19044 unsigned BitWidth = SCVMinus1.getBitWidth();
19045 M = APInt(BitWidth, SCVMinus1.logBase2());
19046 N = APInt(BitWidth, TrailingZeroes);
19047 return true;
19048 }
19049 return false;
19050 };
19051
19052 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
19053 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
19054 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
19055 APInt CVMinus1 = C - 1;
19056 if (CVMinus1.isNegative())
19057 return false;
19058 unsigned TrailingZeroes = CVMinus1.countr_zero();
19059 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
19060 if (CVPlus1.isPowerOf2()) {
19061 unsigned BitWidth = CVPlus1.getBitWidth();
19062 M = APInt(BitWidth, CVPlus1.logBase2());
19063 N = APInt(BitWidth, TrailingZeroes);
19064 return true;
19065 }
19066 return false;
19067 };
19068
19069 if (ConstValue.isNonNegative()) {
19070 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
19071 // (mul x, 2^N - 1) => (sub (shl x, N), x)
19072 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
19073 // (mul x, (2^M + 1) * (2^N + 1))
19074 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
19075 // (mul x, (2^M + 1) * 2^N + 1)
19076 //     => MV = (add (shl x, M), x); (add (shl MV, N), x)
19077 // (mul x, 1 - (1 - 2^M) * 2^N)
19078 //     => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
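 // Worked examples (illustrative): C == 6 == (2^1 + 1) * 2^1 can become
 //   add x8, x0, x0, lsl #1 ; lsl x0, x8, #1
 // and, on subtargets with ALULSLFast, C == 45 == (2^2 + 1) * (2^3 + 1) can
 // become
 //   add x8, x0, x0, lsl #2 ; add x0, x8, x8, lsl #3
 // (register choices here are purely for illustration).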
19079 APInt SCVMinus1 = ShiftedConstValue - 1;
19080 APInt SCVPlus1 = ShiftedConstValue + 1;
19081 APInt CVPlus1 = ConstValue + 1;
19082 APInt CVM, CVN;
19083 if (SCVMinus1.isPowerOf2()) {
19084 ShiftAmt = SCVMinus1.logBase2();
19085 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
19086 } else if (CVPlus1.isPowerOf2()) {
19087 ShiftAmt = CVPlus1.logBase2();
19088 return Sub(Shl(N0, ShiftAmt), N0);
19089 } else if (SCVPlus1.isPowerOf2()) {
19090 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19091 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
19092 }
19093 if (Subtarget->hasALULSLFast() &&
19094 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
19095 APInt CVMMinus1 = CVM - 1;
19096 APInt CVNMinus1 = CVN - 1;
19097 unsigned ShiftM1 = CVMMinus1.logBase2();
19098 unsigned ShiftN1 = CVNMinus1.logBase2();
19099 // ALULSLFast implies that shifts of up to 4 places are fast.
19100 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
19101 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
19102 return Add(Shl(MVal, ShiftN1), MVal);
19103 }
19104 }
19105 if (Subtarget->hasALULSLFast() &&
19106 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
19107 unsigned ShiftM = CVM.getZExtValue();
19108 unsigned ShiftN = CVN.getZExtValue();
19109 // ALULSLFast implies that shifts of up to 4 places are fast.
19110 if (ShiftM <= 4 && ShiftN <= 4) {
19111 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
19112 return Add(Shl(MVal, CVN.getZExtValue()), N0);
19113 }
19114 }
19115
19116 if (Subtarget->hasALULSLFast() &&
19117 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
19118 unsigned ShiftM = CVM.getZExtValue();
19119 unsigned ShiftN = CVN.getZExtValue();
19120 // ALULSLFast implies that shifts of up to 4 places are fast.
19121 if (ShiftM <= 4 && ShiftN <= 4) {
19122 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
19123 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
19124 }
19125 }
19126 } else {
19127 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
19128 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
19129 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
19130 APInt SCVPlus1 = -ShiftedConstValue + 1;
19131 APInt CVNegPlus1 = -ConstValue + 1;
19132 APInt CVNegMinus1 = -ConstValue - 1;
19133 if (CVNegPlus1.isPowerOf2()) {
19134 ShiftAmt = CVNegPlus1.logBase2();
19135 return Sub(N0, Shl(N0, ShiftAmt));
19136 } else if (CVNegMinus1.isPowerOf2()) {
19137 ShiftAmt = CVNegMinus1.logBase2();
19138 return Negate(Add(Shl(N0, ShiftAmt), N0));
19139 } else if (SCVPlus1.isPowerOf2()) {
19140 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19141 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
19142 }
19143 }
19144
19145 return SDValue();
19146}
19147
19148static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
19149 SelectionDAG &DAG) {
19150 // Take advantage of vector comparisons producing 0 or -1 in each lane to
19151 // optimize away operation when it's from a constant.
19152 //
19153 // The general transformation is:
19154 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
19155 // AND(VECTOR_CMP(x,y), constant2)
19156 // constant2 = UNARYOP(constant)
19157
19158 // Early exit if this isn't a vector operation, the operand of the
19159 // unary operation isn't a bitwise AND, or if the sizes of the operations
19160 // aren't the same.
19161 EVT VT = N->getValueType(0);
19162 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
19163 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
19164 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
19165 return SDValue();
19166
19167 // Now check that the other operand of the AND is a constant. We could
19168 // make the transformation for non-constant splats as well, but it's unclear
19169 // that would be a benefit as it would not eliminate any operations, just
19170 // perform one more step in scalar code before moving to the vector unit.
19171 if (BuildVectorSDNode *BV =
19172 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
19173 // Bail out if the vector isn't a constant.
19174 if (!BV->isConstant())
19175 return SDValue();
19176
19177 // Everything checks out. Build up the new and improved node.
19178 SDLoc DL(N);
19179 EVT IntVT = BV->getValueType(0);
19180 // Create a new constant of the appropriate type for the transformed
19181 // DAG.
19182 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
19183 // The AND node needs bitcasts to/from an integer vector type around it.
19184 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
19185 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
19186 N->getOperand(0)->getOperand(0), MaskConst);
19187 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
19188 return Res;
19189 }
19190
19191 return SDValue();
19192}
19193
19194/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
19195/// functions; this can help to reduce the number of fmovs to/from GPRs.
19196static SDValue
19197tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
19198 TargetLowering::DAGCombinerInfo &DCI,
19199 const AArch64Subtarget *Subtarget) {
19200 if (N->isStrictFPOpcode())
19201 return SDValue();
19202
19203 if (DCI.isBeforeLegalizeOps())
19204 return SDValue();
19205
19206 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
19207 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
19208 return SDValue();
19209
19210 auto isSupportedType = [](EVT VT) {
19211 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
19212 };
19213
19214 SDValue SrcVal = N->getOperand(0);
19215 EVT SrcTy = SrcVal.getValueType();
19216 EVT DestTy = N->getValueType(0);
19217
19218 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
19219 return SDValue();
19220
19221 EVT SrcVecTy;
19222 EVT DestVecTy;
19223 if (DestTy.bitsGT(SrcTy)) {
19224 DestVecTy = getPackedSVEVectorVT(DestTy);
19225 SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
19226 } else {
19227 SrcVecTy = getPackedSVEVectorVT(SrcTy);
19228 DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
19229 }
19230
19231 // Ensure the resulting src/dest vector type is legal.
19232 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
19233 return SDValue();
19234
19235 SDLoc DL(N);
19236 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19237 SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
19238 DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
19239 SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
19240 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
19241}
19242
19243static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
19244 TargetLowering::DAGCombinerInfo &DCI,
19245 const AArch64Subtarget *Subtarget) {
19246 // First try to optimize away the conversion when it's conditionally from
19247 // a constant. Vectors only.
19248 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
19249 return Res;
19250
19251 if (SDValue Res =
19252 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19253 return Res;
19254
19255 EVT VT = N->getValueType(0);
19256 if (VT != MVT::f32 && VT != MVT::f64)
19257 return SDValue();
19258
19259 // Only optimize when the source and destination types have the same width.
19260 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
19261 return SDValue();
19262
19263 // If the result of an integer load is only used by an integer-to-float
19264 // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
19265 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
19266 SDValue N0 = N->getOperand(0);
19267 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
19268 N0.hasOneUse() &&
19269 // Do not change the width of a volatile load.
19270 !cast<LoadSDNode>(N0)->isVolatile()) {
19271 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
19272 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
19273 LN0->getPointerInfo(), LN0->getAlign(),
19274 LN0->getMemOperand()->getFlags());
19275
19276 // Make sure successors of the original load stay after it by updating them
19277 // to use the new Chain.
19278 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
19279
19280 unsigned Opcode =
19281 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
19282 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
19283 }
19284
19285 return SDValue();
19286}
19287
19288/// Fold a floating-point multiply by power of two into floating-point to
19289/// fixed-point conversion.
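///
/// Illustrative example (not from the source): fptosi(fmul v4f32 %x,
/// splat(4.0)) to v4i32 can be selected as a single fixed-point convert,
/// "fcvtzs v0.4s, v0.4s, #2", via the aarch64_neon_vcvtfp2fxs intrinsic,
/// rather than an fmul followed by fcvtzs.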
19290static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
19291 TargetLowering::DAGCombinerInfo &DCI,
19292 const AArch64Subtarget *Subtarget) {
19293 if (SDValue Res =
19294 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19295 return Res;
19296
19297 if (!Subtarget->isNeonAvailable())
19298 return SDValue();
19299
19300 if (!N->getValueType(0).isSimple())
19301 return SDValue();
19302
19303 SDValue Op = N->getOperand(0);
19304 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
19305 return SDValue();
19306
19307 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
19308 return SDValue();
19309
19310 SDValue ConstVec = Op->getOperand(1);
19311 if (!isa<BuildVectorSDNode>(ConstVec))
19312 return SDValue();
19313
19314 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
19315 uint32_t FloatBits = FloatTy.getSizeInBits();
19316 if (FloatBits != 32 && FloatBits != 64 &&
19317 (FloatBits != 16 || !Subtarget->hasFullFP16()))
19318 return SDValue();
19319
19320 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
19321 uint32_t IntBits = IntTy.getSizeInBits();
19322 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
19323 return SDValue();
19324
19325 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
19326 if (IntBits > FloatBits)
19327 return SDValue();
19328
19329 BitVector UndefElements;
19330 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
19331 int32_t Bits = IntBits == 64 ? 64 : 32;
19332 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
19333 if (C == -1 || C == 0 || C > Bits)
19334 return SDValue();
19335
19336 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
19337 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
19338 return SDValue();
19339
19340 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
19341 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
19342 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
19343 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
19344 return SDValue();
19345 }
19346
19347 SDLoc DL(N);
19348 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
19349 N->getOpcode() == ISD::FP_TO_SINT_SAT);
19350 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
19351 : Intrinsic::aarch64_neon_vcvtfp2fxu;
19352 SDValue FixConv =
19353 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
19354 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
19355 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
19356 // We can handle smaller integers by generating an extra trunc.
19357 if (IntBits < FloatBits)
19358 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
19359
19360 return FixConv;
19361}
19362
19363static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19364 const AArch64TargetLowering &TLI) {
19365 EVT VT = N->getValueType(0);
19366 SelectionDAG &DAG = DCI.DAG;
19367 SDLoc DL(N);
19368 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
19369
19370 if (!VT.isVector())
19371 return SDValue();
19372
19373 if (VT.isScalableVector() && !Subtarget.hasSVE2())
19374 return SDValue();
19375
19376 if (VT.isFixedLengthVector() &&
19377 (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT)))
19378 return SDValue();
19379
19380 SDValue N0 = N->getOperand(0);
19381 if (N0.getOpcode() != ISD::AND)
19382 return SDValue();
19383
19384 SDValue N1 = N->getOperand(1);
19385 if (N1.getOpcode() != ISD::AND)
19386 return SDValue();
19387
19388 // InstCombine does (not (neg a)) => (add a -1).
19389 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
19390 // Loop over all combinations of AND operands.
19391 for (int i = 1; i >= 0; --i) {
19392 for (int j = 1; j >= 0; --j) {
19393 SDValue O0 = N0->getOperand(i);
19394 SDValue O1 = N1->getOperand(j);
19395 SDValue Sub, Add, SubSibling, AddSibling;
19396
19397 // Find a SUB and an ADD operand, one from each AND.
19398 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
19399 Sub = O0;
19400 Add = O1;
19401 SubSibling = N0->getOperand(1 - i);
19402 AddSibling = N1->getOperand(1 - j);
19403 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
19404 Add = O0;
19405 Sub = O1;
19406 AddSibling = N0->getOperand(1 - i);
19407 SubSibling = N1->getOperand(1 - j);
19408 } else
19409 continue;
19410
19411 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
19412 continue;
19413
19414 // The all-ones constant is always the right-hand operand of the Add.
19415 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
19416 continue;
19417
19418 if (Sub.getOperand(1) != Add.getOperand(0))
19419 continue;
19420
19421 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
19422 }
19423 }
19424
19425 // (or (and a b) (and (not a) c)) => (bsl a b c)
19426 // We only have to look for constant vectors here since the general, variable
19427 // case can be handled in TableGen.
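 // Illustrative constant example (not from the source): with a splat mask
 // M = 0xFF00FF00 in one AND and ~M = 0x00FF00FF in the other,
 // (or (and %x, M) (and %y, ~M)) becomes BSP with M selecting the bits of %x
 // and the remaining bits taken from %y.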
19428 unsigned Bits = VT.getScalarSizeInBits();
19429 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
19430 for (int i = 1; i >= 0; --i)
19431 for (int j = 1; j >= 0; --j) {
19432 APInt Val1, Val2;
19433
19434 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
19435 ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
19436 (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
19437 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
19438 N0->getOperand(1 - i), N1->getOperand(1 - j));
19439 }
19440 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
19441 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
19442 if (!BVN0 || !BVN1)
19443 continue;
19444
19445 bool FoundMatch = true;
19446 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
19447 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
19448 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
19449 if (!CN0 || !CN1 ||
19450 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
19451 FoundMatch = false;
19452 break;
19453 }
19454 }
19455 if (FoundMatch)
19456 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
19457 N0->getOperand(1 - i), N1->getOperand(1 - j));
19458 }
19459
19460 return SDValue();
19461}
19462
19463// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
19464// convert to csel(ccmp(.., cc0)), depending on cc1:
19465
19466// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19467// =>
19468// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
19469//
19470// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19471// =>
19472// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
19473static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
19474 EVT VT = N->getValueType(0);
19475 SDValue CSel0 = N->getOperand(0);
19476 SDValue CSel1 = N->getOperand(1);
19477
19478 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
19479 CSel1.getOpcode() != AArch64ISD::CSEL)
19480 return SDValue();
19481
19482 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
19483 return SDValue();
19484
19485 if (!isNullConstant(CSel0.getOperand(0)) ||
19486 !isOneConstant(CSel0.getOperand(1)) ||
19487 !isNullConstant(CSel1.getOperand(0)) ||
19488 !isOneConstant(CSel1.getOperand(1)))
19489 return SDValue();
19490
19491 SDValue Cmp0 = CSel0.getOperand(3);
19492 SDValue Cmp1 = CSel1.getOperand(3);
19493 AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
19494 AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
19495 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
19496 return SDValue();
19497 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
19498 Cmp0.getOpcode() == AArch64ISD::SUBS) {
19499 std::swap(Cmp0, Cmp1);
19500 std::swap(CC0, CC1);
19501 }
19502
19503 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
19504 return SDValue();
19505
19506 SDLoc DL(N);
19507 SDValue CCmp, Condition;
19508 unsigned NZCV;
19509
19510 if (N->getOpcode() == ISD::AND) {
19511 AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
19512 Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
19513 NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
19514 } else {
19515 AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
19516 Condition = DAG.getConstant(CC0, DL, MVT_CC);
19517 NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
19518 }
19519
19520 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
19521
19522 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
19523 if (Op1 && Op1->getAPIntValue().isNegative() &&
19524 Op1->getAPIntValue().sgt(-32)) {
19525 // CCMP accepts an immediate in the range [0, 31]; if Op1 is a constant
19526 // in the range [-31, -1], we can select CCMN instead to avoid the extra
19527 // mov.
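 // For example (illustrative): comparing against -5 would need
 // "ccmp x1, #-5, ...", which is not encodable, so "ccmn x1, #5, ..." is used
 // instead, since x1 - (-5) == x1 + 5.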
19528 SDValue AbsOp1 =
19529 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
19530 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
19531 NZCVOp, Condition, Cmp0);
19532 } else {
19533 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
19534 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
19535 }
19536 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
19537 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
19538 CCmp);
19539}
19540
19541static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19542 const AArch64Subtarget *Subtarget,
19543 const AArch64TargetLowering &TLI) {
19544 SelectionDAG &DAG = DCI.DAG;
19545 EVT VT = N->getValueType(0);
19546
19547 if (SDValue R = performANDORCSELCombine(N, DAG))
19548 return R;
19549
19550 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19551 return SDValue();
19552
19553 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
19554 return Res;
19555
19556 return SDValue();
19557}
19558
19559static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
19560 if (!MemVT.getVectorElementType().isSimple())
19561 return false;
19562
19563 uint64_t MaskForTy = 0ull;
19564 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
19565 case MVT::i8:
19566 MaskForTy = 0xffull;
19567 break;
19568 case MVT::i16:
19569 MaskForTy = 0xffffull;
19570 break;
19571 case MVT::i32:
19572 MaskForTy = 0xffffffffull;
19573 break;
19574 default:
19575 return false;
19576 break;
19577 }
19578
19579 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
19580 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
19581 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
19582
19583 return false;
19584}
19585
19587 SDValue LeafOp = SDValue(N, 0);
19588 SDValue Op = N->getOperand(0);
19589 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
19590 LeafOp.getValueType() != Op.getValueType())
19591 Op = Op->getOperand(0);
19592 if (LeafOp.getValueType() == Op.getValueType())
19593 return Op;
19594 return SDValue();
19595}
19596
19597static SDValue performSVEAndCombine(SDNode *N,
19598 TargetLowering::DAGCombinerInfo &DCI) {
19599 SelectionDAG &DAG = DCI.DAG;
19600 SDValue Src = N->getOperand(0);
19601 unsigned Opc = Src->getOpcode();
19602
19603 // Zero/any extend of an unsigned unpack
19604 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
19605 SDValue UnpkOp = Src->getOperand(0);
19606 SDValue Dup = N->getOperand(1);
19607
19608 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
19609 return SDValue();
19610
19611 SDLoc DL(N);
19612 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
19613 if (!C)
19614 return SDValue();
19615
19616 uint64_t ExtVal = C->getZExtValue();
19617
19618 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
19619 return ((ExtVal == 0xFF && VT == MVT::i8) ||
19620 (ExtVal == 0xFFFF && VT == MVT::i16) ||
19621 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
19622 };
19623
19624 // If the mask is fully covered by the unpack, we don't need to push
19625 // a new AND onto the operand
19626 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
19627 if (MaskAndTypeMatch(EltTy))
19628 return Src;
19629
19630 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
19631 // to see if the mask is all-ones of size MemTy.
19632 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
19633 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
19634 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
19635 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
19636 if (MaskAndTypeMatch(EltTy))
19637 return Src;
19638 }
19639
19640 // Truncate to prevent a DUP with an over wide constant
19641 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
19642
19643 // Otherwise, make sure we propagate the AND to the operand
19644 // of the unpack
19645 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
19646 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
19647
19648 SDValue And = DAG.getNode(ISD::AND, DL,
19649 UnpkOp->getValueType(0), UnpkOp, Dup);
19650
19651 return DAG.getNode(Opc, DL, N->getValueType(0), And);
19652 }
19653
19654 if (DCI.isBeforeLegalizeOps())
19655 return SDValue();
19656
19657 // If both sides of AND operations are i1 splat_vectors then
19658 // we can produce just i1 splat_vector as the result.
19659 if (isAllActivePredicate(DAG, N->getOperand(0)))
19660 return N->getOperand(1);
19661 if (isAllActivePredicate(DAG, N->getOperand(1)))
19662 return N->getOperand(0);
19663
19664 if (!EnableCombineMGatherIntrinsics)
19665 return SDValue();
19666
19667 SDValue Mask = N->getOperand(1);
19668
19669 if (!Src.hasOneUse())
19670 return SDValue();
19671
19672 EVT MemVT;
19673
19674 // SVE load instructions perform an implicit zero-extend, which makes them
19675 // perfect candidates for combining.
19676 switch (Opc) {
19680 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
19681 break;
19697 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
19698 break;
19699 default:
19700 return SDValue();
19701 }
19702
19703 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
19704 return Src;
19705
19706 return SDValue();
19707}
19708
19709// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
19710static SDValue performANDSETCCCombine(SDNode *N,
19711 TargetLowering::DAGCombinerInfo &DCI) {
19712
19713 // This function performs an optimization on a specific pattern involving
19714 // an AND operation and SETCC (Set Condition Code) node.
19715
19716 SDValue SetCC = N->getOperand(0);
19717 EVT VT = N->getValueType(0);
19718 SelectionDAG &DAG = DCI.DAG;
19719
19720 // If the current node (N) is used by any SELECT, bail out by returning an
19721 // empty SDValue; applying the optimization in that case could produce
19722 // incorrect results.
19723 for (auto U : N->users())
19724 if (U->getOpcode() == ISD::SELECT)
19725 return SDValue();
19726
19727 // Check if the operand is a SETCC node with floating-point comparison
19728 if (SetCC.getOpcode() == ISD::SETCC &&
19729 SetCC.getOperand(0).getValueType() == MVT::f32) {
19730
19731 SDValue Cmp;
19733
19734 // Check if the DAG is after legalization and if we can emit the conjunction
19735 if (!DCI.isBeforeLegalize() &&
19736 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
19737
19739
19740 SDLoc DL(N);
19741 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
19742 DAG.getConstant(0, DL, VT),
19743 DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
19744 }
19745 }
19746 return SDValue();
19747}
19748
19749static SDValue performANDCombine(SDNode *N,
19750 TargetLowering::DAGCombinerInfo &DCI) {
19751 SelectionDAG &DAG = DCI.DAG;
19752 SDValue LHS = N->getOperand(0);
19753 SDValue RHS = N->getOperand(1);
19754 EVT VT = N->getValueType(0);
19755
19756 if (SDValue R = performANDORCSELCombine(N, DAG))
19757 return R;
19758
19759 if (SDValue R = performANDSETCCCombine(N,DCI))
19760 return R;
19761
19762 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19763 return SDValue();
19764
19765 if (VT.isScalableVector())
19766 return performSVEAndCombine(N, DCI);
19767
19768 // The combining code below works only for NEON vectors. In particular, it
19769 // does not work for SVE when dealing with vectors wider than 128 bits.
19770 if (!VT.is64BitVector() && !VT.is128BitVector())
19771 return SDValue();
19772
19773 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
19774 if (!BVN)
19775 return SDValue();
19776
19777 // AND does not accept an immediate, so check if we can use a BIC immediate
19778 // instruction instead. We do this here instead of using a (and x, (mvni imm))
19779 // pattern in isel, because some immediates may be lowered to the preferred
19780 // (and x, (movi imm)) form, even though an mvni representation also exists.
19781 APInt DefBits(VT.getSizeInBits(), 0);
19782 APInt UndefBits(VT.getSizeInBits(), 0);
19783 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
19784 SDValue NewOp;
19785
19786 // Any bits known to already be 0 need not be cleared again, which can help
19787 // reduce the size of the immediate to one supported by the instruction.
19788 KnownBits Known = DAG.computeKnownBits(LHS);
19789 APInt ZeroSplat(VT.getSizeInBits(), 0);
19790 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
19791 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
19792 << (Known.Zero.getBitWidth() * I);
19793
19794 DefBits = ~(DefBits | ZeroSplat);
19795 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
19796 DefBits, &LHS)) ||
19797 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
19798 DefBits, &LHS)))
19799 return NewOp;
19800
19801 UndefBits = ~(UndefBits | ZeroSplat);
19802 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
19803 UndefBits, &LHS)) ||
19804 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
19805 UndefBits, &LHS)))
19806 return NewOp;
19807 }
19808
19809 return SDValue();
19810}
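// Illustrative sketch of the BIC-immediate path above (added for exposition,
// not part of the original source): for a v4i32 AND with the splat constant
// 0xffffff00, the bits that need clearing are 0x000000ff, which is encodable
// as a BIC immediate, so the operation can be selected as something like
//   bic v0.4s, #0xff
// even though the original mask is not itself a MOVI immediate.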
19811
19812static SDValue performFADDCombine(SDNode *N,
19813 TargetLowering::DAGCombinerInfo &DCI) {
19814 SelectionDAG &DAG = DCI.DAG;
19815 SDValue LHS = N->getOperand(0);
19816 SDValue RHS = N->getOperand(1);
19817 EVT VT = N->getValueType(0);
19818 SDLoc DL(N);
19819
19820 if (!N->getFlags().hasAllowReassociation())
19821 return SDValue();
19822
19823 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
19824 auto ReassocComplex = [&](SDValue A, SDValue B) {
19825 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
19826 return SDValue();
19827 unsigned Opc = A.getConstantOperandVal(0);
19828 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
19829 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
19830 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
19831 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
19832 return SDValue();
19833 SDValue VCMLA = DAG.getNode(
19834 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
19835 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
19836 A.getOperand(2), A.getOperand(3));
19837 VCMLA->setFlags(A->getFlags());
19838 return VCMLA;
19839 };
19840 if (SDValue R = ReassocComplex(LHS, RHS))
19841 return R;
19842 if (SDValue R = ReassocComplex(RHS, LHS))
19843 return R;
19844
19845 return SDValue();
19846}
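// Illustrative sketch of the reassociation above (added for exposition, not
// part of the original source): with reassociation allowed,
//   fadd(a, vcmla_rot90(b, c, d))
// becomes
//   vcmla_rot90(fadd(a, b), c, d)
// i.e. the outer FADD is folded into the accumulator operand of the VCMLA,
// so the addition no longer has to wait for the VCMLA result.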
19847
19848static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
19849 switch (Opcode) {
19850 case ISD::STRICT_FADD:
19851 case ISD::FADD:
19852 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
19853 case ISD::ADD:
19854 return VT == MVT::i64;
19855 default:
19856 return false;
19857 }
19858}
19859
19860static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
19861 AArch64CC::CondCode Cond);
19862
19863static bool isPredicateCCSettingOp(SDValue N) {
19864 if ((N.getOpcode() == ISD::SETCC) ||
19865 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
19866 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
19867 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
19868 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
19869 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
19870 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
19871 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
19872 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
19873 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
19874 // get_active_lane_mask is lowered to a whilelo instruction.
19875 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
19876 return true;
19877
19878 return false;
19879}
19880
19881// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
19882// ... into: "ptrue p, all" + PTEST
19883static SDValue
19884performFirstTrueTestVectorCombine(SDNode *N,
19885 TargetLowering::DAGCombinerInfo &DCI,
19886 const AArch64Subtarget *Subtarget) {
19887 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19888 // Make sure PTEST can be legalised with illegal types.
19889 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
19890 return SDValue();
19891
19892 SDValue N0 = N->getOperand(0);
19893 EVT VT = N0.getValueType();
19894
19895 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
19896 !isNullConstant(N->getOperand(1)))
19897 return SDValue();
19898
19899 // Restrict the DAG combine to only cases where we're extracting from a
19900 // flag-setting operation.
19901 if (!isPredicateCCSettingOp(N0))
19902 return SDValue();
19903
19904 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
19905 SelectionDAG &DAG = DCI.DAG;
19906 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
19907 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
19908}
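// Illustrative sketch of the combine above (added for exposition, not part of
// the original source):
//   %p = whilelo(%i, %n)                 ; flag-setting predicate operation
//   %b = extract_vector_elt %p, 0
// is rewritten as PTEST(ptrue all, %p) with the i1 result taken from the
// FIRST_ACTIVE condition, avoiding an explicit unpack of the predicate lane.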
19909
19910// Materialize : Idx = (add (mul vscale, NumEls), -1)
19911// i1 = extract_vector_elt t37, Constant:i64<Idx>
19912// ... into: "ptrue p, all" + PTEST
19913static SDValue
19914performLastTrueTestVectorCombine(SDNode *N,
19915 TargetLowering::DAGCombinerInfo &DCI,
19916 const AArch64Subtarget *Subtarget) {
19917 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19918 // Make sure PTEST can be legalised with illegal types.
19919 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
19920 return SDValue();
19921
19922 SDValue N0 = N->getOperand(0);
19923 EVT OpVT = N0.getValueType();
19924
19925 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
19926 return SDValue();
19927
19928 // Idx == (add (mul vscale, NumEls), -1)
19929 SDValue Idx = N->getOperand(1);
19930 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
19931 return SDValue();
19932
19933 SDValue VS = Idx.getOperand(0);
19934 if (VS.getOpcode() != ISD::VSCALE)
19935 return SDValue();
19936
19937 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
19938 if (VS.getConstantOperandVal(0) != NumEls)
19939 return SDValue();
19940
19941 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
19942 SelectionDAG &DAG = DCI.DAG;
19943 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
19944 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
19945}
19946
19947static SDValue
19948performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19949 const AArch64Subtarget *Subtarget) {
19950 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19951 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
19952 return Res;
19953 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
19954 return Res;
19955
19956 SelectionDAG &DAG = DCI.DAG;
19957 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
19958
19959 EVT VT = N->getValueType(0);
19960 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
19961 bool IsStrict = N0->isStrictFPOpcode();
19962
19963 // extract(dup x) -> x
19964 if (N0.getOpcode() == AArch64ISD::DUP)
19965 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
19966 : N0.getOperand(0);
19967
19968 // Rewrite for pairwise fadd pattern
19969 // (f32 (extract_vector_elt
19970 // (fadd (vXf32 Other)
19971 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
19972 // ->
19973 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
19974 // (extract_vector_elt (vXf32 Other) 1))
19975 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
19976 // we can only do this when it's used only by the extract_vector_elt.
19977 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
19978 (!IsStrict || N0.hasOneUse())) {
19979 SDLoc DL(N0);
19980 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
19981 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
19982
19983 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
19984 SDValue Other = N00;
19985
19986 // And handle the commutative case.
19987 if (!Shuffle) {
19988 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
19989 Other = N01;
19990 }
19991
19992 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
19993 Other == Shuffle->getOperand(0)) {
19994 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
19995 DAG.getConstant(0, DL, MVT::i64));
19996 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
19997 DAG.getConstant(1, DL, MVT::i64));
19998 if (!IsStrict)
19999 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
20000
20001 // For strict_fadd we need uses of the final extract_vector to be replaced
20002 // with the strict_fadd, but we also need uses of the chain output of the
20003 // original strict_fadd to use the chain output of the new strict_fadd as
20004 // otherwise it may not be deleted.
20005 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
20006 {VT, MVT::Other},
20007 {N0->getOperand(0), Extract1, Extract2});
20008 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
20009 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
20010 return SDValue(N, 0);
20011 }
20012 }
20013
20014 return SDValue();
20015}
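// Illustrative sketch of the pairwise-FADD rewrite above (added for
// exposition, not part of the original source):
//   t0 = fadd v2f32 %x, (vector_shuffle %x, undef, <1, u>)
//   t1 = extract_vector_elt t0, 0
// becomes
//   fadd (extract_vector_elt %x, 0), (extract_vector_elt %x, 1)
// which can then be matched by the scalar pairwise-add (FADDP) patterns.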
20016
20017static SDValue performConcatVectorsCombine(SDNode *N,
20018 TargetLowering::DAGCombinerInfo &DCI,
20019 SelectionDAG &DAG) {
20020 SDLoc dl(N);
20021 EVT VT = N->getValueType(0);
20022 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
20023 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
20024
20025 if (VT.isScalableVector())
20026 return SDValue();
20027
20028 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20029 N1Opc == ISD::TRUNCATE) {
20030 SDValue N00 = N0->getOperand(0);
20031 SDValue N10 = N1->getOperand(0);
20032 EVT N00VT = N00.getValueType();
20033 unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
20034
20035 // Optimize concat_vectors of truncated vectors, where the intermediate
20036 // type is illegal, to avoid said illegality, e.g.,
20037 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
20038 // (v2i16 (truncate (v2i64)))))
20039 // ->
20040 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
20041 // (v4i32 (bitcast (v2i64))),
20042 // <0, 2, 4, 6>)))
20043 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
20044 // on both input and result type, so we might generate worse code.
20045 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
20046 if (N00VT == N10.getValueType() &&
20047 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
20048 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
20049 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
20050 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
20051 for (size_t i = 0; i < Mask.size(); ++i)
20052 Mask[i] = i * 2;
20053 return DAG.getNode(ISD::TRUNCATE, dl, VT,
20054 DAG.getVectorShuffle(
20055 MidVT, dl,
20056 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
20057 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
20058 }
20059
20060 // Optimize two large shifts and a combine into a single combine and shift
20061 // For AArch64 architectures, sequences like the following:
20062 //
20063 // ushr v0.4s, v0.4s, #20
20064 // ushr v1.4s, v1.4s, #20
20065 // uzp1 v0.8h, v0.8h, v1.8h
20066 //
20067 // Can be optimized to:
20068 //
20069 // uzp2 v0.8h, v0.8h, v1.8h
20070 // ushr v0.8h, v0.8h, #4
20071 //
20072 // This optimization reduces instruction count.
20073 if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
20074 N00->getOperand(1) == N10->getOperand(1)) {
20075 SDValue N000 = N00->getOperand(0);
20076 SDValue N100 = N10->getOperand(0);
20077 uint64_t N001ConstVal = N00->getConstantOperandVal(1),
20078 N101ConstVal = N10->getConstantOperandVal(1),
20079 NScalarSize = N->getValueType(0).getScalarSizeInBits();
20080
20081 if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
20082 N000 = DAG.getNode(AArch64ISD::NVCAST, dl, VT, N000);
20083 N100 = DAG.getNode(AArch64ISD::NVCAST, dl, VT, N100);
20084 SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, dl, VT, N000, N100);
20085 SDValue NewShiftConstant =
20086 DAG.getConstant(N001ConstVal - NScalarSize, dl, MVT::i32);
20087
20088 return DAG.getNode(AArch64ISD::VLSHR, dl, VT, Uzp, NewShiftConstant);
20089 }
20090 }
20091 }
20092
20093 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
20094 N->getOperand(0).getValueType() == MVT::v2i16 ||
20095 N->getOperand(0).getValueType() == MVT::v2i8) {
20096 EVT SrcVT = N->getOperand(0).getValueType();
20097 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
20098 // loads to prevent having to go through the v4i8 load legalization that
20099 // needs to extend each element into a larger type.
20100 if (N->getNumOperands() % 2 == 0 &&
20101 all_of(N->op_values(), [SrcVT](SDValue V) {
20102 if (V.getValueType() != SrcVT)
20103 return false;
20104 if (V.isUndef())
20105 return true;
20106 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
20107 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
20108 LD->getExtensionType() == ISD::NON_EXTLOAD;
20109 })) {
20110 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
20111 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
20112 SmallVector<SDValue> Ops;
20113
20114 for (unsigned i = 0; i < N->getNumOperands(); i++) {
20115 SDValue V = N->getOperand(i);
20116 if (V.isUndef())
20117 Ops.push_back(DAG.getUNDEF(FVT));
20118 else {
20119 LoadSDNode *LD = cast<LoadSDNode>(V);
20120 SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
20121 LD->getBasePtr(), LD->getMemOperand());
20122 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
20123 Ops.push_back(NewLoad);
20124 }
20125 }
20126 return DAG.getBitcast(N->getValueType(0),
20127 DAG.getBuildVector(NVT, dl, Ops));
20128 }
20129 }
20130
20131 // Canonicalise concat_vectors to replace concatenations of truncated nots
20132 // with nots of concatenated truncates. This in some cases allows for multiple
20133 // redundant negations to be eliminated.
20134 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
20135 // (v4i16 (truncate (not (v4i32)))))
20136 // ->
20137 // (not (concat_vectors (v4i16 (truncate (v4i32))),
20138 // (v4i16 (truncate (v4i32)))))
20139 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20140 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
20141 N->isOnlyUserOf(N1.getNode())) {
20142 auto isBitwiseVectorNegate = [](SDValue V) {
20143 return V->getOpcode() == ISD::XOR &&
20144 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
20145 };
20146 SDValue N00 = N0->getOperand(0);
20147 SDValue N10 = N1->getOperand(0);
20148 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
20149 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
20150 return DAG.getNOT(
20151 dl,
20152 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
20153 DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
20154 N00->getOperand(0)),
20155 DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
20156 N10->getOperand(0))),
20157 VT);
20158 }
20159 }
20160
20161 // Wait till after everything is legalized to try this. That way we have
20162 // legal vector types and such.
20163 if (DCI.isBeforeLegalizeOps())
20164 return SDValue();
20165
20166 // Optimise concat_vectors of two identical binops with a 128-bit destination
20167 // size, combine into a binop of two concats of the source vectors, e.g.:
20168 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
20169 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
20170 DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() &&
20171 N1->hasOneUse()) {
20172 SDValue N00 = N0->getOperand(0);
20173 SDValue N01 = N0->getOperand(1);
20174 SDValue N10 = N1->getOperand(0);
20175 SDValue N11 = N1->getOperand(1);
20176
20177 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
20178 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N00, N10);
20179 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N01, N11);
20180 return DAG.getNode(N0Opc, dl, VT, Concat0, Concat1);
20181 }
20182 }
20183
20184 auto IsRSHRN = [](SDValue Shr) {
20185 if (Shr.getOpcode() != AArch64ISD::VLSHR)
20186 return false;
20187 SDValue Op = Shr.getOperand(0);
20188 EVT VT = Op.getValueType();
20189 unsigned ShtAmt = Shr.getConstantOperandVal(1);
20190 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
20191 return false;
20192
20193 APInt Imm;
20194 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
20195 Imm = APInt(VT.getScalarSizeInBits(),
20196 Op.getOperand(1).getConstantOperandVal(0)
20197 << Op.getOperand(1).getConstantOperandVal(1));
20198 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
20199 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
20200 Imm = APInt(VT.getScalarSizeInBits(),
20201 Op.getOperand(1).getConstantOperandVal(0));
20202 else
20203 return false;
20204
20205 if (Imm != 1ULL << (ShtAmt - 1))
20206 return false;
20207 return true;
20208 };
20209
20210 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
20211 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
20212 ((IsRSHRN(N1) &&
20213 N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
20214 N1.isUndef())) {
20215 SDValue X = N0.getOperand(0).getOperand(0);
20216 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
20217 : N1.getOperand(0).getOperand(0);
20218 EVT BVT =
20219 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
20220 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y);
20221 SDValue Add = DAG.getNode(
20222 ISD::ADD, dl, BVT, CC,
20223 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT));
20224 SDValue Shr =
20225 DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1));
20226 return Shr;
20227 }
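// Illustrative sketch of the combine above (added for exposition, not part of
// the original source): each RSHRN is modelled as vlshr(add(v, 1 << (s - 1)), s),
// so concat(rshrn(x, s), rshrn(y, s)) becomes a single rounding shift on the
// concatenated input:
//   vlshr(add(concat(x, y), 1 << (s - 1)), s)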
20228
20229 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
20230 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
20231 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
20232 N0.getOperand(1) == N1.getOperand(1)) {
20233 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0),
20234 DAG.getUNDEF(N0.getValueType()));
20235 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1),
20236 DAG.getUNDEF(N0.getValueType()));
20237 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1);
20238 }
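// Note (added for exposition, not part of the original source): the rewrite
// above is valid because zip1 of the widened operands interleaves exactly the
// lanes that concat(zip1(a, b), zip2(a, b)) would produce, e.g. for 64-bit
// v4i16 inputs a and b concatenated into v8i16:
//   concat(zip1(a, b), zip2(a, b)) == zip1(concat(a, undef), concat(b, undef))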
20239
20240 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
20241 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
20242 // canonicalise to that.
20243 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
20244 assert(VT.getScalarSizeInBits() == 64);
20245 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
20246 DAG.getConstant(0, dl, MVT::i64));
20247 }
20248
20249 // Canonicalise concat_vectors so that the right-hand vector has as few
20250 // bit-casts as possible before its real operation. The primary matching
20251 // destination for these operations will be the narrowing "2" instructions,
20252 // which depend on the operation being performed on this right-hand vector.
20253 // For example,
20254 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
20255 // becomes
20256 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
20257
20258 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
20259 return SDValue();
20260 SDValue RHS = N1->getOperand(0);
20261 MVT RHSTy = RHS.getValueType().getSimpleVT();
20262 // If the RHS is not a vector, this is not the pattern we're looking for.
20263 if (!RHSTy.isVector())
20264 return SDValue();
20265
20266 LLVM_DEBUG(
20267 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
20268
20269 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
20270 RHSTy.getVectorNumElements() * 2);
20271 return DAG.getNode(ISD::BITCAST, dl, VT,
20272 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
20273 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
20274 RHS));
20275}
20276
20277static SDValue
20278performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20279 SelectionDAG &DAG) {
20280 if (DCI.isBeforeLegalizeOps())
20281 return SDValue();
20282
20283 EVT VT = N->getValueType(0);
20284 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
20285 return SDValue();
20286
20287 SDValue V = N->getOperand(0);
20288
20289 // NOTE: This combine exists in DAGCombiner, but that version's legality check
20290 // blocks this combine because the non-const case requires custom lowering.
20291 //
20292 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
20293 if (V.getOpcode() == ISD::SPLAT_VECTOR)
20294 if (isa<ConstantSDNode>(V.getOperand(0)))
20295 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
20296
20297 return SDValue();
20298}
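// Illustrative sketch of the combine above (added for exposition, not part of
// the original source):
//   t0 = splat_vector nxv16i1, Constant:i1<-1>
//   t1 = extract_subvector t0, Constant:i64<0>      ; nxv8i1
// folds to
//   t1 = splat_vector nxv8i1, Constant:i1<-1>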
20299
20300static SDValue
20301performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20302 SelectionDAG &DAG) {
20303 SDLoc DL(N);
20304 SDValue Vec = N->getOperand(0);
20305 SDValue SubVec = N->getOperand(1);
20306 uint64_t IdxVal = N->getConstantOperandVal(2);
20307 EVT VecVT = Vec.getValueType();
20308 EVT SubVT = SubVec.getValueType();
20309
20310 // Only do this for legal fixed vector types.
20311 if (!VecVT.isFixedLengthVector() ||
20312 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
20313 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
20314 return SDValue();
20315
20316 // Ignore widening patterns.
20317 if (IdxVal == 0 && Vec.isUndef())
20318 return SDValue();
20319
20320 // Subvector must be half the width and an "aligned" insertion.
20321 unsigned NumSubElts = SubVT.getVectorNumElements();
20322 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
20323 (IdxVal != 0 && IdxVal != NumSubElts))
20324 return SDValue();
20325
20326 // Fold insert_subvector -> concat_vectors
20327 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
20328 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
20329 SDValue Lo, Hi;
20330 if (IdxVal == 0) {
20331 Lo = SubVec;
20332 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20333 DAG.getVectorIdxConstant(NumSubElts, DL));
20334 } else {
20335 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20336 DAG.getVectorIdxConstant(0, DL));
20337 Hi = SubVec;
20338 }
20339 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
20340}
20341
20342static SDValue tryCombineFixedPointConvert(SDNode *N,
20343 TargetLowering::DAGCombinerInfo &DCI,
20344 SelectionDAG &DAG) {
20345 // Wait until after everything is legalized to try this. That way we have
20346 // legal vector types and such.
20347 if (DCI.isBeforeLegalizeOps())
20348 return SDValue();
20349 // Transform a scalar conversion of a value from a lane extract into a
20350 // lane extract of a vector conversion. E.g., from foo1 to foo2:
20351 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
20352 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
20353 //
20354 // The second form interacts better with instruction selection and the
20355 // register allocator to avoid cross-class register copies that aren't
20356 // coalescable due to a lane reference.
20357
20358 // Check the operand and see if it originates from a lane extract.
20359 SDValue Op1 = N->getOperand(1);
20360 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
20361 return SDValue();
20362
20363 // Yep, no additional predication needed. Perform the transform.
20364 SDValue IID = N->getOperand(0);
20365 SDValue Shift = N->getOperand(2);
20366 SDValue Vec = Op1.getOperand(0);
20367 SDValue Lane = Op1.getOperand(1);
20368 EVT ResTy = N->getValueType(0);
20369 EVT VecResTy;
20370 SDLoc DL(N);
20371
20372 // The vector width should be 128 bits by the time we get here, even
20373 // if it started as 64 bits (the extract_vector handling will have
20374 // done so). Bail if it is not.
20375 if (Vec.getValueSizeInBits() != 128)
20376 return SDValue();
20377
20378 if (Vec.getValueType() == MVT::v4i32)
20379 VecResTy = MVT::v4f32;
20380 else if (Vec.getValueType() == MVT::v2i64)
20381 VecResTy = MVT::v2f64;
20382 else
20383 return SDValue();
20384
20385 SDValue Convert =
20386 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
20387 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
20388}
20389
20390// AArch64 high-vector "long" operations are formed by performing the non-high
20391// version on an extract_subvector of each operand which gets the high half:
20392//
20393// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
20394//
20395// However, there are cases which don't have an extract_high explicitly, but
20396// have another operation that can be made compatible with one for free. For
20397// example:
20398//
20399// (dupv64 scalar) --> (extract_high (dup128 scalar))
20400//
20401// This routine does the actual conversion of such DUPs, once outer routines
20402// have determined that everything else is in order.
20403// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
20404// similarly here.
20405static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
20406 MVT VT = N.getSimpleValueType();
20407 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
20408 N.getConstantOperandVal(1) == 0)
20409 N = N.getOperand(0);
20410
20411 switch (N.getOpcode()) {
20412 case AArch64ISD::DUP:
20413 case AArch64ISD::DUPLANE8:
20414 case AArch64ISD::DUPLANE16:
20415 case AArch64ISD::DUPLANE32:
20416 case AArch64ISD::DUPLANE64:
20417 case AArch64ISD::MOVI:
20418 case AArch64ISD::MOVIshift:
20419 case AArch64ISD::MOVIedit:
20420 case AArch64ISD::MOVImsl:
20421 case AArch64ISD::MVNIshift:
20422 case AArch64ISD::MVNImsl:
20423 break;
20424 default:
20425 // FMOV could be supported, but isn't very useful, as it would only occur
20426 // if you passed a bitcast' floating point immediate to an eligible long
20427 // integer op (addl, smull, ...).
20428 return SDValue();
20429 }
20430
20431 if (!VT.is64BitVector())
20432 return SDValue();
20433
20434 SDLoc DL(N);
20435 unsigned NumElems = VT.getVectorNumElements();
20436 if (N.getValueType().is64BitVector()) {
20437 MVT ElementTy = VT.getVectorElementType();
20438 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
20439 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
20440 }
20441
20442 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
20443 DAG.getConstant(NumElems, DL, MVT::i64));
20444}
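// Illustrative sketch of the conversion above (added for exposition, not part
// of the original source): a 64-bit dup feeding a long operation, e.g.
//   (v4i16 AArch64ISD::DUP w0)
// is rebuilt as the high half of a 128-bit dup:
//   (extract_subvector (v8i16 AArch64ISD::DUP w0), 4)
// so the surrounding code can treat it like any other extract_high operand.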
20445
20446static bool isEssentiallyExtractHighSubvector(SDValue N) {
20447 if (N.getOpcode() == ISD::BITCAST)
20448 N = N.getOperand(0);
20449 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
20450 return false;
20451 if (N.getOperand(0).getValueType().isScalableVector())
20452 return false;
20453 return N.getConstantOperandAPInt(1) ==
20454 N.getOperand(0).getValueType().getVectorNumElements() / 2;
20455}
20456
20457/// Helper structure to keep track of ISD::SET_CC operands.
20458struct GenericSetCCInfo {
20459 const SDValue *Opnd0;
20460 const SDValue *Opnd1;
20461 ISD::CondCode CC;
20462};
20463
20464/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
20465struct AArch64SetCCInfo {
20466 const SDValue *Cmp;
20467 AArch64CC::CondCode CC;
20468};
20469
20470/// Helper structure to keep track of SetCC information.
20471union SetCCInfo {
20472 GenericSetCCInfo Generic;
20473 AArch64SetCCInfo AArch64;
20474};
20475
20476/// Helper structure to be able to read SetCC information. If the IsAArch64
20477/// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
20478/// GenericSetCCInfo.
20479struct SetCCInfoAndKind {
20480 SetCCInfo Info;
20481 bool IsAArch64;
20482};
20483
20484/// Check whether or not \p Op is a SET_CC operation, either a generic or an
20485/// AArch64 lowered one.
20487/// \p SetCCInfo is filled accordingly.
20488/// \post SetCCInfo is meaningful only when this function returns true.
20489/// \return True when Op is a kind of SET_CC operation.
20490static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
20491 // If this is a setcc, this is straightforward.
20492 if (Op.getOpcode() == ISD::SETCC) {
20493 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
20494 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
20495 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
20496 SetCCInfo.IsAArch64 = false;
20497 return true;
20498 }
20499 // Otherwise, check if this is a matching csel instruction.
20500 // In other words:
20501 // - csel 1, 0, cc
20502 // - csel 0, 1, !cc
20503 if (Op.getOpcode() != AArch64ISD::CSEL)
20504 return false;
20505 // Set the information about the operands.
20506 // TODO: we want the operands of the Cmp not the csel
20507 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
20508 SetCCInfo.IsAArch64 = true;
20509 SetCCInfo.Info.AArch64.CC =
20510 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
20511
20512 // Check that the operands match the constraints:
20513 // (1) Both operands must be constants.
20514 // (2) One must be 1 and the other must be 0.
20515 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
20516 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20517
20518 // Check (1).
20519 if (!TValue || !FValue)
20520 return false;
20521
20522 // Check (2).
20523 if (!TValue->isOne()) {
20524 // Update the comparison when we are interested in !cc.
20525 std::swap(TValue, FValue);
20526 SetCCInfo.Info.AArch64.CC =
20527 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
20528 }
20529 return TValue->isOne() && FValue->isZero();
20530}
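// Illustrative sketch (added for exposition, not part of the original source):
// both of these are recognised as a lowered setcc by the helper above:
//   csel 1, 0, eq, <flags>   -> IsAArch64 = true, CC = eq
//   csel 0, 1, eq, <flags>   -> operands swapped, CC inverted to ne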
20531
20532// Returns true if Op is setcc or zext of setcc.