1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
23#include "llvm/ADT/APFloat.h"
24#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/ArrayRef.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/Statistic.h"
31#include "llvm/ADT/StringRef.h"
32#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
82#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in the future, once both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// All of the XOR, OR and CMP operations use ALU ports, and data dependencies
143// will become the bottleneck after this transform on high-end CPUs. So this max
144// leaf node limit guards that the cmp+ccmp transform remains profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148// By turning this on, we will not fall back to DAG ISel when encountering
149// scalable vector types for any instruction, even if SVE is not yet supported
150// for some instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157/// Value type used for condition codes.
158static const MVT MVT_CC = MVT::i32;
159
160static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
161 AArch64::X3, AArch64::X4, AArch64::X5,
162 AArch64::X6, AArch64::X7};
163static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
164 AArch64::Q3, AArch64::Q4, AArch64::Q5,
165 AArch64::Q6, AArch64::Q7};
166
168
170
171static inline EVT getPackedSVEVectorVT(EVT VT) {
172 switch (VT.getSimpleVT().SimpleTy) {
173 default:
174 llvm_unreachable("unexpected element type for vector");
175 case MVT::i8:
176 return MVT::nxv16i8;
177 case MVT::i16:
178 return MVT::nxv8i16;
179 case MVT::i32:
180 return MVT::nxv4i32;
181 case MVT::i64:
182 return MVT::nxv2i64;
183 case MVT::f16:
184 return MVT::nxv8f16;
185 case MVT::f32:
186 return MVT::nxv4f32;
187 case MVT::f64:
188 return MVT::nxv2f64;
189 case MVT::bf16:
190 return MVT::nxv8bf16;
191 }
192}
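// Illustrative example (not from the original source): getPackedSVEVectorVT(MVT::i32)
// returns MVT::nxv4i32, the packed container holding four 32-bit elements per
// 128-bit granule of the scalable register.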
193
194// NOTE: Currently there's only a need to return integer vector types. If this
195// changes then just add an extra "type" parameter.
197 switch (EC.getKnownMinValue()) {
198 default:
199 llvm_unreachable("unexpected element count for vector");
200 case 16:
201 return MVT::nxv16i8;
202 case 8:
203 return MVT::nxv8i16;
204 case 4:
205 return MVT::nxv4i32;
206 case 2:
207 return MVT::nxv2i64;
208 }
209}
210
212 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
213 "Expected scalable predicate vector type!");
214 switch (VT.getVectorMinNumElements()) {
215 default:
216 llvm_unreachable("unexpected element count for vector");
217 case 2:
218 return MVT::nxv2i64;
219 case 4:
220 return MVT::nxv4i32;
221 case 8:
222 return MVT::nxv8i16;
223 case 16:
224 return MVT::nxv16i8;
225 }
226}
227
228/// Returns true if VT's elements occupy the lowest bit positions of its
229/// associated register class without any intervening space.
230///
231/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
232/// same register class, but only nxv8f16 can be treated as a packed vector.
233static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
235 "Expected legal vector type!");
236 return VT.isFixedLengthVector() ||
238}
239
240// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
241// predicate and end with a passthru value matching the result type.
242static bool isMergePassthruOpcode(unsigned Opc) {
243 switch (Opc) {
244 default:
245 return false;
276 return true;
277 }
278}
279
280// Returns true if inactive lanes are known to be zeroed by construction.
282 switch (Op.getOpcode()) {
283 default:
284 return false;
285 // We guarantee i1 splat_vectors to zero the other lanes
289 return true;
291 switch (Op.getConstantOperandVal(0)) {
292 default:
293 return false;
294 case Intrinsic::aarch64_sve_ptrue:
295 case Intrinsic::aarch64_sve_pnext:
296 case Intrinsic::aarch64_sve_cmpeq:
297 case Intrinsic::aarch64_sve_cmpne:
298 case Intrinsic::aarch64_sve_cmpge:
299 case Intrinsic::aarch64_sve_cmpgt:
300 case Intrinsic::aarch64_sve_cmphs:
301 case Intrinsic::aarch64_sve_cmphi:
302 case Intrinsic::aarch64_sve_cmpeq_wide:
303 case Intrinsic::aarch64_sve_cmpne_wide:
304 case Intrinsic::aarch64_sve_cmpge_wide:
305 case Intrinsic::aarch64_sve_cmpgt_wide:
306 case Intrinsic::aarch64_sve_cmplt_wide:
307 case Intrinsic::aarch64_sve_cmple_wide:
308 case Intrinsic::aarch64_sve_cmphs_wide:
309 case Intrinsic::aarch64_sve_cmphi_wide:
310 case Intrinsic::aarch64_sve_cmplo_wide:
311 case Intrinsic::aarch64_sve_cmpls_wide:
312 case Intrinsic::aarch64_sve_fcmpeq:
313 case Intrinsic::aarch64_sve_fcmpne:
314 case Intrinsic::aarch64_sve_fcmpge:
315 case Intrinsic::aarch64_sve_fcmpgt:
316 case Intrinsic::aarch64_sve_fcmpuo:
317 case Intrinsic::aarch64_sve_facgt:
318 case Intrinsic::aarch64_sve_facge:
319 case Intrinsic::aarch64_sve_whilege:
320 case Intrinsic::aarch64_sve_whilegt:
321 case Intrinsic::aarch64_sve_whilehi:
322 case Intrinsic::aarch64_sve_whilehs:
323 case Intrinsic::aarch64_sve_whilele:
324 case Intrinsic::aarch64_sve_whilelo:
325 case Intrinsic::aarch64_sve_whilels:
326 case Intrinsic::aarch64_sve_whilelt:
327 case Intrinsic::aarch64_sve_match:
328 case Intrinsic::aarch64_sve_nmatch:
329 case Intrinsic::aarch64_sve_whilege_x2:
330 case Intrinsic::aarch64_sve_whilegt_x2:
331 case Intrinsic::aarch64_sve_whilehi_x2:
332 case Intrinsic::aarch64_sve_whilehs_x2:
333 case Intrinsic::aarch64_sve_whilele_x2:
334 case Intrinsic::aarch64_sve_whilelo_x2:
335 case Intrinsic::aarch64_sve_whilels_x2:
336 case Intrinsic::aarch64_sve_whilelt_x2:
337 return true;
338 }
339 }
340}
341
342static std::tuple<SDValue, SDValue>
344 SDLoc DL(Disc);
345 SDValue AddrDisc;
346 SDValue ConstDisc;
347
348 // If this is a blend, remember the constant and address discriminators.
349 // Otherwise, it's either a constant discriminator, or a non-blended
350 // address discriminator.
351 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
352 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
353 AddrDisc = Disc->getOperand(1);
354 ConstDisc = Disc->getOperand(2);
355 } else {
356 ConstDisc = Disc;
357 }
358
359 // If the constant discriminator (either the blend RHS, or the entire
360 // discriminator value) isn't a 16-bit constant, bail out, and let the
361 // discriminator be computed separately.
362 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
363 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
364 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
365
366 // If there's no address discriminator, use NoRegister, which we'll later
367 // replace with XZR, or directly use a Z variant of the inst. when available.
368 if (!AddrDisc)
369 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
370
371 return std::make_tuple(
372 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
373 AddrDisc);
374}
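// Worked example (illustrative): a discriminator of the form
// @llvm.ptrauth.blend(%addr, 1234) is split into (constant 1234, %addr); a plain
// 16-bit constant such as 42 yields (42, NoRegister); any other value yields
// (0, <original discriminator>) and is computed separately.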
375
377 const AArch64Subtarget &STI)
378 : TargetLowering(TM), Subtarget(&STI) {
379 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
380 // we have to make something up. Arbitrarily, choose ZeroOrOne.
382 // When comparing vectors, the result sets each element of the
383 // vector to all-ones or all-zeros.
385
386 // Set up the register classes.
387 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
388 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
389
390 if (Subtarget->hasLS64()) {
391 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
392 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
394 }
395
396 if (Subtarget->hasFPARMv8()) {
397 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
398 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
399 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
400 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
401 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
402 }
403
404 if (Subtarget->hasNEON()) {
405 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
406 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
407
408 addDRType(MVT::v2f32);
409 addDRType(MVT::v8i8);
410 addDRType(MVT::v4i16);
411 addDRType(MVT::v2i32);
412 addDRType(MVT::v1i64);
413 addDRType(MVT::v1f64);
414 addDRType(MVT::v4f16);
415 addDRType(MVT::v4bf16);
416
417 addQRType(MVT::v4f32);
418 addQRType(MVT::v2f64);
419 addQRType(MVT::v16i8);
420 addQRType(MVT::v8i16);
421 addQRType(MVT::v4i32);
422 addQRType(MVT::v2i64);
423 addQRType(MVT::v8f16);
424 addQRType(MVT::v8bf16);
425 }
426
427 if (Subtarget->isSVEorStreamingSVEAvailable()) {
428 // Add legal sve predicate types
429 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
430 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
431 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
432 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
433 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
434
435 // Add legal sve data types
436 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
437 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
438 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
439 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
440
441 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
442 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
443 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
444 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
445 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
446 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
447
448 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
449 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
450 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
451
452 if (Subtarget->useSVEForFixedLengthVectors()) {
455 addRegisterClass(VT, &AArch64::ZPRRegClass);
456
459 addRegisterClass(VT, &AArch64::ZPRRegClass);
460 }
461 }
462
463 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
464 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
465 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
466 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
467
468 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
469 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
470 }
471
472 // Compute derived properties from the register classes
474
475 // Provide all sorts of operation actions
515
517
521
525
527
528 // Custom lowering hooks are needed for XOR
529 // to fold it into CSINC/CSINV.
532
533 // Virtually no operations on f128 are legal, but LLVM can't expand them when
534 // there's a valid register class, so we need custom operations in most cases.
559 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
560 // aren't handled.
561
562 // Lowering for many of the conversions is actually specified by the non-f128
563 // type. The LowerXXX function will be trivial when f128 isn't involved.
588 if (Subtarget->hasFPARMv8()) {
591 }
594 if (Subtarget->hasFPARMv8()) {
597 }
600
605
606 // Variable arguments.
611
612 // Variable-sized objects.
615
616 // Lowering Funnel Shifts to EXTR
621
623
624 // Constant pool entries
626
627 // BlockAddress
629
630 // AArch64 lacks both left-rotate and popcount instructions.
636 }
637
638 // AArch64 doesn't have i32 MULH{S|U}.
641
642 // AArch64 doesn't have {U|S}MUL_LOHI.
647
648 if (Subtarget->hasCSSC()) {
652
654
658
661
666
671 } else {
675
678
681 }
682
688 }
695
696 // Custom lower Add/Sub/Mul with overflow.
709
718
727 if (Subtarget->hasFullFP16()) {
730 } else {
733 }
734
735 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
748 setOperationAction(Op, MVT::f16, Promote);
749 setOperationAction(Op, MVT::v4f16, Expand);
750 setOperationAction(Op, MVT::v8f16, Expand);
751 setOperationAction(Op, MVT::bf16, Promote);
752 setOperationAction(Op, MVT::v4bf16, Expand);
753 setOperationAction(Op, MVT::v8bf16, Expand);
754 }
755
756 // For bf16, fpextend is custom lowered to be optionally expanded into shifts.
763
764 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
765 for (auto Op : {
769 ISD::FADD,
770 ISD::FSUB,
771 ISD::FMUL,
772 ISD::FDIV,
773 ISD::FMA,
804 })
805 setOperationAction(Op, ScalarVT, Promote);
806
807 for (auto Op : {ISD::FNEG, ISD::FABS})
808 setOperationAction(Op, ScalarVT, Legal);
809
810 // Round-to-integer operations need custom lowering for fp16, as Promote doesn't work
811 // because the result type is integer.
815 setOperationAction(Op, ScalarVT, Custom);
816
817 // promote v4f16 to v4f32 when that is known to be safe.
818 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
819 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
820 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
821 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
822 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
823 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
824 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
825 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
826 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
827 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
828 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
829 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
830 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
831
841
842 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
864 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
865 };
866
867 if (!Subtarget->hasFullFP16()) {
868 LegalizeNarrowFP(MVT::f16);
869 }
870 LegalizeNarrowFP(MVT::bf16);
873
874 // AArch64 has implementations of a lot of rounding-like FP operations.
875 // clang-format off
876 for (auto Op :
888 for (MVT Ty : {MVT::f32, MVT::f64})
890 if (Subtarget->hasFullFP16())
891 setOperationAction(Op, MVT::f16, Legal);
892 }
893 // clang-format on
894
895 // Basic strict FP operations are legal
898 for (MVT Ty : {MVT::f32, MVT::f64})
900 if (Subtarget->hasFullFP16())
901 setOperationAction(Op, MVT::f16, Legal);
902 }
903
905
911
913 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
916 } else {
919 }
922
923 // Generate outline atomics library calls only if LSE was not specified for
924 // the subtarget.
925 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
951#define LCALLNAMES(A, B, N) \
952 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
953 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
954 setLibcallName(A##N##_REL, #B #N "_rel"); \
955 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
956#define LCALLNAME4(A, B) \
957 LCALLNAMES(A, B, 1) \
958 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
959#define LCALLNAME5(A, B) \
960 LCALLNAMES(A, B, 1) \
961 LCALLNAMES(A, B, 2) \
962 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
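// For example, LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp) registers the
// library calls __aarch64_swp1_relax, __aarch64_swp1_acq, __aarch64_swp1_rel and
// __aarch64_swp1_acq_rel, plus the equivalents for the 2-, 4- and 8-byte widths;
// LCALLNAME5 additionally covers the 16-byte width.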
963 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
964 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
965 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
966 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
967 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
968 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
969#undef LCALLNAMES
970#undef LCALLNAME4
971#undef LCALLNAME5
972 }
973
974 if (Subtarget->hasLSE128()) {
975 // Custom lowering because i128 is not legal. Must be replaced by 2x64
976 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
980 }
981
982 // 128-bit loads and stores can be done without expanding
985
986 // Aligned 128-bit loads and stores are single-copy atomic according to the
987 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
988 if (Subtarget->hasLSE2()) {
991 }
992
993 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
994 // custom lowering, as there are no un-paired non-temporal stores and
995 // legalization will break up 256 bit inputs.
997 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
998 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
999 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1000 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1001 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1002 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1003 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
1004
1005 // 256-bit non-temporal loads can be lowered to LDNP. This is done using
1006 // custom lowering, as there are no un-paired non-temporal loads and
1007 // legalization will break up 256-bit inputs.
1008 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1009 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1010 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1011 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1012 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1013 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1014 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1015 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1016
1017 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1019
1020 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1021 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1022 // Issue __sincos_stret if available.
1025 } else {
1028 }
1029
1030 // Make floating-point constants legal for the large code model, so they don't
1031 // become loads from the constant pool.
1032 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1035 }
1036
1037 // AArch64 does not have floating-point extending loads, i1 sign-extending
1038 // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
1039 for (MVT VT : MVT::fp_valuetypes()) {
1040 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1041 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1042 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1043 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1044 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1045 }
1046 for (MVT VT : MVT::integer_valuetypes())
1047 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1048
1049 for (MVT WideVT : MVT::fp_valuetypes()) {
1050 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1051 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1052 setTruncStoreAction(WideVT, NarrowVT, Expand);
1053 }
1054 }
1055 }
1056
1057 if (Subtarget->hasFPARMv8()) {
1061 }
1062
1063 // Indexed loads and stores are supported.
1064 for (unsigned im = (unsigned)ISD::PRE_INC;
1066 setIndexedLoadAction(im, MVT::i8, Legal);
1067 setIndexedLoadAction(im, MVT::i16, Legal);
1068 setIndexedLoadAction(im, MVT::i32, Legal);
1069 setIndexedLoadAction(im, MVT::i64, Legal);
1070 setIndexedLoadAction(im, MVT::f64, Legal);
1071 setIndexedLoadAction(im, MVT::f32, Legal);
1072 setIndexedLoadAction(im, MVT::f16, Legal);
1073 setIndexedLoadAction(im, MVT::bf16, Legal);
1074 setIndexedStoreAction(im, MVT::i8, Legal);
1075 setIndexedStoreAction(im, MVT::i16, Legal);
1076 setIndexedStoreAction(im, MVT::i32, Legal);
1077 setIndexedStoreAction(im, MVT::i64, Legal);
1078 setIndexedStoreAction(im, MVT::f64, Legal);
1079 setIndexedStoreAction(im, MVT::f32, Legal);
1080 setIndexedStoreAction(im, MVT::f16, Legal);
1081 setIndexedStoreAction(im, MVT::bf16, Legal);
1082 }
1083
1084 // Trap.
1085 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1088
1089 // We combine OR nodes for bitfield operations.
1091 // Try to create BICs for vector ANDs.
1093
1094 // llvm.init.trampoline and llvm.adjust.trampoline
1097
1098 // Vector add and sub nodes may conceal a high-half opportunity.
1099 // Also, try to fold ADD into CSINC/CSINV.
1102
1105
1106 // Try and combine setcc with csel
1108
1110
1117
1119
1121
1123
1127
1130
1132
1134
1136
1140
1142
1143 // In case of strict alignment, avoid an excessive number of byte wide stores.
1146 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1147
1151 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1152
1155 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1156
1159 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1160
1162
1164
1165 EnableExtLdPromotion = true;
1166
1167 // Set required alignment.
1169 // Set preferred alignments.
1170
1171 // Don't align loops on Windows. The SEH unwind info generation needs to
1172 // know the exact length of functions before the alignments have been
1173 // expanded.
1174 if (!Subtarget->isTargetWindows())
1178
1179 // Only change the limit for entries in a jump table if specified by
1180 // the subtarget, but not at the command line.
1181 unsigned MaxJT = STI.getMaximumJumpTableSize();
1182 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1184
1186
1188
1190 if (Subtarget->hasSME())
1192
1193 if (Subtarget->isNeonAvailable()) {
1194 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1195 // silliness like this:
1196 // clang-format off
1197 for (auto Op :
1218 setOperationAction(Op, MVT::v1f64, Expand);
1219 // clang-format on
1220
1221 for (auto Op :
1226 setOperationAction(Op, MVT::v1i64, Expand);
1227
1228 // AArch64 doesn't have direct vector->f32 conversion instructions for
1229 // elements smaller than i32, so promote the input to i32 first.
1230 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1231 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1232
1233 // Similarly, there is no direct i32 -> f64 vector conversion instruction,
1234 // nor a direct i32 -> f16 vector conversion. Set these to Custom, so the
1235 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
1238 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1240
1241 if (Subtarget->hasFullFP16()) {
1244
1253 } else {
1254 // When AArch64 doesn't have full fp16 support, promote the input
1255 // to i32 first.
1256 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1257 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1258 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1259 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1260 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1261 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1262 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1263 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1264 }
1265
1266 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1267 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1274 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1279 }
1280
1281 // Custom handling for some quad-vector types to detect MULL.
1282 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1283 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1284 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1285 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1286 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1287 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1288
1289 // Saturates
1290 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1291 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1296 }
1297
1298 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1299 MVT::v4i32}) {
1306 }
1307
1308 // Vector reductions
1309 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1310 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1311 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1316
1318 }
1319 }
1320 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1321 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1330 }
1335
1337 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1338 // Likewise, narrowing and extending vector loads/stores aren't handled
1339 // directly.
1342
1343 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1346 } else {
1349 }
1352
1355
1356 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1357 setTruncStoreAction(VT, InnerVT, Expand);
1358 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1359 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1360 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1361 }
1362 }
1363
1364 for (auto Op :
1370 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1372 if (Subtarget->hasFullFP16())
1373 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1375 }
1376
1377 // LRINT and LLRINT.
1378 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1379 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1381 if (Subtarget->hasFullFP16())
1382 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1384 }
1385
1386 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1387
1392
1396
1397 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1398 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1399 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1400 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1401 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1402 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1403
1404 // ADDP custom lowering
1405 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1407 // FADDP custom lowering
1408 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1410 } else /* !isNeonAvailable */ {
1412 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1414
1415 if (VT.is128BitVector() || VT.is64BitVector()) {
1419 Subtarget->isLittleEndian() ? Legal : Expand);
1420 }
1421 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1422 setTruncStoreAction(VT, InnerVT, Expand);
1423 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1424 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1425 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1426 }
1427 }
1428 }
1429
1430 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1434 }
1435
1436 if (Subtarget->hasSME()) {
1438 }
1439
1440 // FIXME: Move lowering for more nodes here if those are common between
1441 // SVE and SME.
1442 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1443 for (auto VT :
1444 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1449 }
1450 }
1451
1452 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1453 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1494
1500
1509
1514
1515 if (!Subtarget->isLittleEndian())
1517
1518 if (Subtarget->hasSVE2() ||
1519 (Subtarget->hasSME() && Subtarget->isStreaming()))
1520 // For SLI/SRI.
1522 }
1523
1524 // Illegal unpacked integer vector types.
1525 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1528 }
1529
1530 // Type legalize unpacked bitcasts.
1531 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1533
1534 for (auto VT :
1535 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1536 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1538
1539 for (auto VT :
1540 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1548
1552
1553 // There are no legal MVT::nxv16f## based types.
1554 if (VT != MVT::nxv16i1) {
1557 }
1558 }
1559
1560 // NEON doesn't support masked loads/stores, but SME and SVE do.
1561 for (auto VT :
1562 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1563 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1564 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1567 }
1568
1569 // Firstly, exclude all scalable vector extending loads and truncating stores,
1570 // including both integer and floating-point scalable vectors.
1572 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1573 setTruncStoreAction(VT, InnerVT, Expand);
1574 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1575 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1576 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1577 }
1578 }
1579
1580 // Then, selectively enable those which we directly support.
1581 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1582 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1583 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1584 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1585 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1586 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1587 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1588 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1589 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1590 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1591 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1592 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1593 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1594 }
1595
1596 // SVE supports truncating stores of 64 and 128-bit vectors
1597 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1598 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1599 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1600 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1601 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1602
1603 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1604 MVT::nxv4f32, MVT::nxv2f64}) {
1644
1666
1678 }
1679
1680 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1691
1692 if (Subtarget->hasSVEB16B16()) {
1701 }
1702 }
1703
1704 for (auto Opcode :
1707 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1708 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1709 setOperationAction(Opcode, MVT::nxv8bf16, Expand);
1710 }
1711
1712 if (!Subtarget->hasSVEB16B16()) {
1713 for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM,
1715 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1716 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1717 setOperationAction(Opcode, MVT::nxv8bf16, Expand);
1718 }
1719 }
1720
1723
1724 // NEON doesn't support integer divides, but SVE does
1725 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1726 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1729 }
1730
1731 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1732 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1733 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1734
1735 // NOTE: Currently this has to happen after computeRegisterProperties rather
1736 // than the preferred option of combining it with the addRegisterClass call.
1737 if (Subtarget->useSVEForFixedLengthVectors()) {
1740 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1741 addTypeForFixedLengthSVE(VT);
1742 }
1745 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1746 addTypeForFixedLengthSVE(VT);
1747 }
1748
1749 // 64-bit results can mean an input bigger than a NEON vector.
1750 for (auto VT : {MVT::v8i8, MVT::v4i16})
1753
1754 // 128-bit results imply an input bigger than a NEON vector.
1755 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1757 for (auto VT : {MVT::v8f16, MVT::v4f32})
1759
1760 // These operations are not supported on NEON but SVE can do them.
1762 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1763 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1764 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1765 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1766 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1767 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1768 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1769 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1770 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1771 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1772 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1773 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1774 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1775 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1776 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1781
1782 // Int operations with no NEON support.
1783 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1784 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1792 }
1793
1794 // Use SVE for vectors with more than 2 elements.
1795 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1797 }
1798
1799 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1800 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1801 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1802 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1803
1805
1806 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1808 }
1809
1810 // Handle operations that are only available in non-streaming SVE mode.
1811 if (Subtarget->isSVEAvailable()) {
1812 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1813 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1814 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1815 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1816 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1817 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1818 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1821 }
1822
1823 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1824 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1825 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1827
1828 // We can lower types that have <vscale x {2|4}> elements to compact.
1829 for (auto VT :
1830 {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
1831 MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
1833
1834 // If we have SVE, we can use SVE logic for legal (or smaller than legal)
1835 // NEON vectors in the lowest bits of the SVE register.
1836 for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
1837 MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
1839
1840 // Histcnt is SVE2 only
1841 if (Subtarget->hasSVE2()) {
1843 Custom);
1845 Custom);
1846 }
1847 }
1848
1849
1850 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1851 // Only required for llvm.aarch64.mops.memset.tag
1853 }
1854
1856
1857 if (Subtarget->hasSVE()) {
1862 }
1863
1864 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1865
1866 IsStrictFPEnabled = true;
1868
1869 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1870 // it, but it's just a wrapper around ldexp.
1871 if (Subtarget->isTargetWindows()) {
1873 if (isOperationExpand(Op, MVT::f32))
1874 setOperationAction(Op, MVT::f32, Promote);
1875 }
1876
1877 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1878 // isn't legal.
1880 if (isOperationExpand(Op, MVT::f16))
1881 setOperationAction(Op, MVT::f16, Promote);
1882
1883 if (Subtarget->isWindowsArm64EC()) {
1884 // FIXME: are there intrinsics we need to exclude from this?
1885 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1886 auto code = static_cast<RTLIB::Libcall>(i);
1887 auto libcallName = getLibcallName(code);
1888 if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1889 setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1890 }
1891 }
1892 }
1893}
1894
1895void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1896 assert(VT.isVector() && "VT should be a vector type");
1897
1898 if (VT.isFloatingPoint()) {
1900 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1901 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1902 }
1903
1904 // Mark vector float intrinsics as expand.
1905 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1923 }
1924
1925 // But we do support custom-lowering for FCOPYSIGN.
1926 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1927 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1928 VT == MVT::v8f16) &&
1929 Subtarget->hasFullFP16()))
1931
1944
1948 for (MVT InnerVT : MVT::all_valuetypes())
1949 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1950
1951 // CNT supports only B element sizes, then use UADDLP to widen.
1952 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1954
1960
1961 for (unsigned Opcode :
1964 setOperationAction(Opcode, VT, Custom);
1965
1966 if (!VT.isFloatingPoint())
1968
1969 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1970 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1971 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1972 setOperationAction(Opcode, VT, Legal);
1973
1974 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1975 // NEON types.
1976 if (VT.isFloatingPoint() &&
1977 VT.getVectorElementType() != MVT::bf16 &&
1978 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1979 for (unsigned Opcode :
1985 setOperationAction(Opcode, VT, Legal);
1986
1987 // Strict fp extend and trunc are legal
1988 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1990 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1992
1993 // FIXME: We could potentially make use of the vector comparison instructions
1994 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1995 // complications:
1996 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1997 // so we would need to expand when the condition code doesn't match the
1998 // kind of comparison.
1999 // * Some kinds of comparison require more than one FCMXY instruction so
2000 // would need to be expanded instead.
2001 // * The lowering of the non-strict versions involves target-specific ISD
2002 // nodes so we would likely need to add strict versions of all of them and
2003 // handle them appropriately.
2006
2007 if (Subtarget->isLittleEndian()) {
2008 for (unsigned im = (unsigned)ISD::PRE_INC;
2012 }
2013 }
2014
2015 if (Subtarget->hasD128()) {
2018 }
2019}
2020
2022 EVT OpVT) const {
2023 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2024 if (!Subtarget->hasSVE())
2025 return true;
2026
2027 // We can only support legal predicate result types. We can use the SVE
2028 // whilelo instruction for generating fixed-width predicates too.
2029 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
2030 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
2031 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
2032 return true;
2033
2034 // The whilelo instruction only works with i32 or i64 scalar inputs.
2035 if (OpVT != MVT::i32 && OpVT != MVT::i64)
2036 return true;
2037
2038 return false;
2039}
2040
2042 const IntrinsicInst *I) const {
2043 if (I->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add)
2044 return true;
2045
2046 EVT VT = EVT::getEVT(I->getType());
2047 auto Op1 = I->getOperand(1);
2048 EVT Op1VT = EVT::getEVT(Op1->getType());
2049 if (Op1VT.getVectorElementType() == VT.getVectorElementType() &&
2050 (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount() ||
2051 VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount()))
2052 return false;
2053 return true;
2054}
2055
2057 if (!Subtarget->isSVEorStreamingSVEAvailable())
2058 return true;
2059
2060 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2061 // also support fixed-width predicates.
2062 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2063 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2064 VT != MVT::v4i1 && VT != MVT::v2i1;
2065}
2066
2068 unsigned SearchSize) const {
2069 // MATCH is SVE2 and only available in non-streaming mode.
2070 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2071 return true;
2072 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2073 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2074 return SearchSize != 8;
2075 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2076 return SearchSize != 8 && SearchSize != 16;
2077 return true;
2078}
2079
2080void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2081 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2082
2083 // By default everything must be expanded.
2084 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2086
2087 if (VT.isFloatingPoint()) {
2097 }
2098
2100 VT == MVT::v1f64 ? Expand : Custom;
2101
2102 // Mark integer truncating stores/extending loads as having custom lowering
2103 if (VT.isInteger()) {
2104 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2105 while (InnerVT != VT) {
2106 setTruncStoreAction(VT, InnerVT, Default);
2107 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2108 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2109 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2110 InnerVT = InnerVT.changeVectorElementType(
2111 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2112 }
2113 }
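  // For example, for VT == v4i32 the loop above marks truncating stores and
  // extending loads between v4i32 and v4i8/v4i16 with the chosen default action.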
2114
2115 // Mark floating-point truncating stores/extending loads as having custom
2116 // lowering
2117 if (VT.isFloatingPoint()) {
2118 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2119 while (InnerVT != VT) {
2120 setTruncStoreAction(VT, InnerVT, Custom);
2121 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2122 InnerVT = InnerVT.changeVectorElementType(
2124 }
2125 }
2126
2127 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2128 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2129
2130 // Lower fixed length vector operations to scalable equivalents.
2137 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2174 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2175 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2177 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2196 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2222}
2223
2224void AArch64TargetLowering::addDRType(MVT VT) {
2225 addRegisterClass(VT, &AArch64::FPR64RegClass);
2226 if (Subtarget->isNeonAvailable())
2227 addTypeForNEON(VT);
2228}
2229
2230void AArch64TargetLowering::addQRType(MVT VT) {
2231 addRegisterClass(VT, &AArch64::FPR128RegClass);
2232 if (Subtarget->isNeonAvailable())
2233 addTypeForNEON(VT);
2234}
2235
2237 LLVMContext &C, EVT VT) const {
2238 if (!VT.isVector())
2239 return MVT::i32;
2240 if (VT.isScalableVector())
2241 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2243}
2244
2245// isIntImmediate - This method tests to see if the node is a constant
2246// operand. If so, Imm will receive the value.
2247static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2248 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2249 Imm = C->getZExtValue();
2250 return true;
2251 }
2252 return false;
2253}
2254
2255// isOpcWithIntImmediate - This method tests to see if the node is a specific
2256// opcode and that it has an immediate integer right operand.
2257// If so, Imm will receive the value.
2258static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2259 uint64_t &Imm) {
2260 return N->getOpcode() == Opc &&
2261 isIntImmediate(N->getOperand(1).getNode(), Imm);
2262}
2263
2264static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2265 const APInt &Demanded,
2267 unsigned NewOpc) {
2268 uint64_t OldImm = Imm, NewImm, Enc;
2269 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2270
2271 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2272 // bimm64.
2273 if (Imm == 0 || Imm == Mask ||
2275 return false;
2276
2277 unsigned EltSize = Size;
2278 uint64_t DemandedBits = Demanded.getZExtValue();
2279
2280 // Clear bits that are not demanded.
2281 Imm &= DemandedBits;
2282
2283 while (true) {
2284 // The goal here is to set the non-demanded bits in a way that minimizes
2285 // the number of switching between 0 and 1. In order to achieve this goal,
2286 // we set the non-demanded bits to the value of the preceding demanded bits.
2287 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2288 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2289 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2290 // The final result is 0b11000011.
2291 uint64_t NonDemandedBits = ~DemandedBits;
2292 uint64_t InvertedImm = ~Imm & DemandedBits;
2293 uint64_t RotatedImm =
2294 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2295 NonDemandedBits;
2296 uint64_t Sum = RotatedImm + NonDemandedBits;
2297 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2298 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2299 NewImm = (Imm | Ones) & Mask;
2300
2301 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2302 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2303 // we halve the element size and continue the search.
2304 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2305 break;
2306
2307 // We cannot shrink the element size any further if it is 2-bits.
2308 if (EltSize == 2)
2309 return false;
2310
2311 EltSize /= 2;
2312 Mask >>= EltSize;
2313 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2314
2315 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2316 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2317 return false;
2318
2319 // Merge the upper and lower halves of Imm and DemandedBits.
2320 Imm |= Hi;
2321 DemandedBits |= DemandedBitsHi;
2322 }
2323
2324 ++NumOptimizedImms;
2325
2326 // Replicate the element across the register width.
2327 while (EltSize < Size) {
2328 NewImm |= NewImm << EltSize;
2329 EltSize *= 2;
2330 }
2331
2332 (void)OldImm;
2333 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2334 "demanded bits should never be altered");
2335 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2336
2337 // Create the new constant immediate node.
2338 EVT VT = Op.getValueType();
2339 SDLoc DL(Op);
2340 SDValue New;
2341
2342 // If the new constant immediate is all-zeros or all-ones, let the target
2343 // independent DAG combine optimize this node.
2344 if (NewImm == 0 || NewImm == OrigMask) {
2345 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2346 TLO.DAG.getConstant(NewImm, DL, VT));
2347 // Otherwise, create a machine node so that target independent DAG combine
2348 // doesn't undo this optimization.
2349 } else {
2351 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2352 New = SDValue(
2353 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2354 }
2355
2356 return TLO.CombineTo(Op, New);
2357}
2358
2360 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2361 TargetLoweringOpt &TLO) const {
2362 // Delay this optimization to as late as possible.
2363 if (!TLO.LegalOps)
2364 return false;
2365
2367 return false;
2368
2369 EVT VT = Op.getValueType();
2370 if (VT.isVector())
2371 return false;
2372
2373 unsigned Size = VT.getSizeInBits();
2374
2375 if (Size != 32 && Size != 64)
2376 return false;
2377
2378 // Exit early if we demand all bits.
2379 if (DemandedBits.popcount() == Size)
2380 return false;
2381
2382 unsigned NewOpc;
2383 switch (Op.getOpcode()) {
2384 default:
2385 return false;
2386 case ISD::AND:
2387 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2388 break;
2389 case ISD::OR:
2390 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2391 break;
2392 case ISD::XOR:
2393 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2394 break;
2395 }
2396 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2397 if (!C)
2398 return false;
2399 uint64_t Imm = C->getZExtValue();
2400 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2401}
2402
2403/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2404/// Mask are known to be either zero or one and return them in Known.
2406 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2407 const SelectionDAG &DAG, unsigned Depth) const {
2408 switch (Op.getOpcode()) {
2409 default:
2410 break;
2411 case AArch64ISD::DUP: {
2412 SDValue SrcOp = Op.getOperand(0);
2413 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2414 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2415 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2416 "Expected DUP implicit truncation");
2417 Known = Known.trunc(Op.getScalarValueSizeInBits());
2418 }
2419 break;
2420 }
2421 case AArch64ISD::CSEL: {
2422 KnownBits Known2;
2423 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2424 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2425 Known = Known.intersectWith(Known2);
2426 break;
2427 }
2428 case AArch64ISD::BICi: {
2429 // Compute the bit cleared value.
2430 APInt Mask =
2431 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2432 .trunc(Known.getBitWidth());
2433 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2434 Known &= KnownBits::makeConstant(Mask);
2435 break;
2436 }
2437 case AArch64ISD::VLSHR: {
2438 KnownBits Known2;
2439 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2440 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2441 Known = KnownBits::lshr(Known, Known2);
2442 break;
2443 }
2444 case AArch64ISD::VASHR: {
2445 KnownBits Known2;
2446 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2447 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2448 Known = KnownBits::ashr(Known, Known2);
2449 break;
2450 }
2451 case AArch64ISD::VSHL: {
2452 KnownBits Known2;
2453 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2454 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2455 Known = KnownBits::shl(Known, Known2);
2456 break;
2457 }
2458 case AArch64ISD::MOVI: {
2460 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2461 break;
2462 }
2464 case AArch64ISD::ADDlow: {
2465 if (!Subtarget->isTargetILP32())
2466 break;
2467 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2468 Known.Zero = APInt::getHighBitsSet(64, 32);
2469 break;
2470 }
2472 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2473 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2474 break;
2475 }
2477 Intrinsic::ID IntID =
2478 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2479 switch (IntID) {
2480 default: return;
2481 case Intrinsic::aarch64_ldaxr:
2482 case Intrinsic::aarch64_ldxr: {
2483 unsigned BitWidth = Known.getBitWidth();
2484 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2485 unsigned MemBits = VT.getScalarSizeInBits();
2486 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2487 return;
2488 }
2489 }
2490 break;
2491 }
2493 case ISD::INTRINSIC_VOID: {
2494 unsigned IntNo = Op.getConstantOperandVal(0);
2495 switch (IntNo) {
2496 default:
2497 break;
2498 case Intrinsic::aarch64_neon_uaddlv: {
2499 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2500 unsigned BitWidth = Known.getBitWidth();
2501 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2502 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
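        // A uaddlv of v8i8 sums at most 8 * 255 = 2040 < 2^11, and of v16i8 at
        // most 16 * 255 = 4080 < 2^12, so all bits at or above Bound are known
        // to be zero.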
2503 assert(BitWidth >= Bound && "Unexpected width!");
2505 Known.Zero |= Mask;
2506 }
2507 break;
2508 }
2509 case Intrinsic::aarch64_neon_umaxv:
2510 case Intrinsic::aarch64_neon_uminv: {
2511 // Figure out the datatype of the vector operand. The UMINV instruction
2512 // will zero extend the result, so we can mark as known zero all the
2513 // bits larger than the element datatype. 32-bit or larger doesn't need
2514 // this as those are legal types and will be handled by isel directly.
2515 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2516 unsigned BitWidth = Known.getBitWidth();
2517 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2518 assert(BitWidth >= 8 && "Unexpected width!");
2520 Known.Zero |= Mask;
2521 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2522 assert(BitWidth >= 16 && "Unexpected width!");
2524 Known.Zero |= Mask;
2525 }
2526 break;
2527 } break;
2528 }
2529 }
2530 }
2531}
2532
2534 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2535 unsigned Depth) const {
2536 EVT VT = Op.getValueType();
2537 unsigned VTBits = VT.getScalarSizeInBits();
2538 unsigned Opcode = Op.getOpcode();
2539 switch (Opcode) {
2540 case AArch64ISD::CMEQ:
2541 case AArch64ISD::CMGE:
2542 case AArch64ISD::CMGT:
2543 case AArch64ISD::CMHI:
2544 case AArch64ISD::CMHS:
2545 case AArch64ISD::FCMEQ:
2546 case AArch64ISD::FCMGE:
2547 case AArch64ISD::FCMGT:
2548 case AArch64ISD::CMEQz:
2549 case AArch64ISD::CMGEz:
2550 case AArch64ISD::CMGTz:
2551 case AArch64ISD::CMLEz:
2552 case AArch64ISD::CMLTz:
2553 case AArch64ISD::FCMEQz:
2554 case AArch64ISD::FCMGEz:
2555 case AArch64ISD::FCMGTz:
2556 case AArch64ISD::FCMLEz:
2557 case AArch64ISD::FCMLTz:
2558 // Compares return either 0 or all-ones
2559 return VTBits;
2560 case AArch64ISD::VASHR: {
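    // An arithmetic shift right replicates the sign bit, so the result has at
    // least as many known sign bits as the source plus the shift amount,
    // capped at the bit width.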
2561 unsigned Tmp =
2562 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2563 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2564 }
2565 }
2566
2567 return 1;
2568}
2569
2571 EVT) const {
2572 return MVT::i64;
2573}
2574
2576 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2577 unsigned *Fast) const {
2578
2579 // Allow SVE loads/stores where the alignment >= the size of the element type,
2580 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2581 // for stores that come from IR, only require element-size alignment (even if
2582 // unaligned accesses are disabled). Without this, these will be forced to
2583 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2584 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2585 if (VT.isScalableVector()) {
2586 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2587 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2588 return true;
2589 }
2590
2591 if (Subtarget->requiresStrictAlign())
2592 return false;
2593
2594 if (Fast) {
2595 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2596 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2597 // See comments in performSTORECombine() for more details about
2598 // these conditions.
2599
2600 // Code that uses clang vector extensions can mark that it
2601 // wants unaligned accesses to be treated as fast by
2602 // underspecifying alignment to be 1 or 2.
2603 Alignment <= 2 ||
2604
2605 // Disregard v2i64. Memcpy lowering produces those and splitting
2606 // them regresses performance on micro-benchmarks and olden/bh.
2607 VT == MVT::v2i64;
2608 }
2609 return true;
2610}
2611
2612// Same as above but handling LLTs instead.
2613bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2614 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2615 unsigned *Fast) const {
2616 if (Subtarget->requiresStrictAlign())
2617 return false;
2618
2619 if (Fast) {
2620 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2621 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2622 Ty.getSizeInBytes() != 16 ||
2623 // See comments in performSTORECombine() for more details about
2624 // these conditions.
2625
2626 // Code that uses clang vector extensions can mark that it
2627 // wants unaligned accesses to be treated as fast by
2628 // underspecifying alignment to be 1 or 2.
2629 Alignment <= 2 ||
2630
2631 // Disregard v2i64. Memcpy lowering produces those and splitting
2632 // them regresses performance on micro-benchmarks and olden/bh.
2633 Ty == LLT::fixed_vector(2, 64);
2634 }
2635 return true;
2636}
2637
2638FastISel *
2639AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2640 const TargetLibraryInfo *libInfo) const {
2641 return AArch64::createFastISel(funcInfo, libInfo);
2642}
2643
2644const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2645#define MAKE_CASE(V) \
2646 case V: \
2647 return #V;
2648 switch ((AArch64ISD::NodeType)Opcode) {
2649 case AArch64ISD::FIRST_NUMBER:
2650 break;
2979 }
2980#undef MAKE_CASE
2981 return nullptr;
2982}
2983
2984MachineBasicBlock *
2985AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2986 MachineBasicBlock *MBB) const {
2987 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2988 // phi node:
2989
2990 // OrigBB:
2991 // [... previous instrs leading to comparison ...]
2992 // b.ne TrueBB
2993 // b EndBB
2994 // TrueBB:
2995 // ; Fallthrough
2996 // EndBB:
2997 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2998
2999 MachineFunction *MF = MBB->getParent();
3000 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3001 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
3002 DebugLoc DL = MI.getDebugLoc();
3003 MachineFunction::iterator It = ++MBB->getIterator();
3004
3005 Register DestReg = MI.getOperand(0).getReg();
3006 Register IfTrueReg = MI.getOperand(1).getReg();
3007 Register IfFalseReg = MI.getOperand(2).getReg();
3008 unsigned CondCode = MI.getOperand(3).getImm();
3009 bool NZCVKilled = MI.getOperand(4).isKill();
3010
3011 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
3012 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
3013 MF->insert(It, TrueBB);
3014 MF->insert(It, EndBB);
3015
3016 // Transfer rest of current basic-block to EndBB
3017 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
3018 MBB->end());
3019 EndBB->transferSuccessorsAndUpdatePHIs(MBB);
3020
3021 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
3022 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
3023 MBB->addSuccessor(TrueBB);
3024 MBB->addSuccessor(EndBB);
3025
3026 // TrueBB falls through to the end.
3027 TrueBB->addSuccessor(EndBB);
3028
3029 if (!NZCVKilled) {
3030 TrueBB->addLiveIn(AArch64::NZCV);
3031 EndBB->addLiveIn(AArch64::NZCV);
3032 }
3033
3034 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
3035 .addReg(IfTrueReg)
3036 .addMBB(TrueBB)
3037 .addReg(IfFalseReg)
3038 .addMBB(MBB);
3039
3040 MI.eraseFromParent();
3041 return EndBB;
3042}
3043
3044MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
3045 MachineInstr &MI, MachineBasicBlock *BB) const {
3046 assert(!isAsynchronousEHPersonality(classifyEHPersonality(
3047 BB->getParent()->getFunction().getPersonalityFn())) &&
3048 "SEH does not use catchret!");
3049 return BB;
3050}
3051
3052MachineBasicBlock *
3053AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
3054 MachineBasicBlock *MBB) const {
3055 MachineFunction &MF = *MBB->getParent();
3056 MachineBasicBlock::iterator MBBI = MI.getIterator();
3058 const AArch64InstrInfo &TII =
3059 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
3060 Register TargetReg = MI.getOperand(0).getReg();
3061 MachineBasicBlock::iterator NextInst =
3062 TII.probedStackAlloc(MBBI, TargetReg, false);
3063
3064 MI.eraseFromParent();
3065 return NextInst->getParent();
3066}
3067
3068MachineBasicBlock *
3069AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
3070 MachineInstr &MI,
3071 MachineBasicBlock *BB) const {
3072 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3073 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3074
3075 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
3076 MIB.add(MI.getOperand(1)); // slice index register
3077 MIB.add(MI.getOperand(2)); // slice index offset
3078 MIB.add(MI.getOperand(3)); // pg
3079 MIB.add(MI.getOperand(4)); // base
3080 MIB.add(MI.getOperand(5)); // offset
3081
3082 MI.eraseFromParent(); // The pseudo is gone now.
3083 return BB;
3084}
3085
3086MachineBasicBlock *AArch64TargetLowering::EmitFill(MachineInstr &MI,
3087 MachineBasicBlock *BB) const {
3088 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3089 MachineInstrBuilder MIB =
3090 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
3091
3092 MIB.addReg(AArch64::ZA, RegState::Define);
3093 MIB.add(MI.getOperand(0)); // Vector select register
3094 MIB.add(MI.getOperand(1)); // Vector select offset
3095 MIB.add(MI.getOperand(2)); // Base
3096 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
3097
3098 MI.eraseFromParent(); // The pseudo is gone now.
3099 return BB;
3100}
3101
3102MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI,
3103 MachineBasicBlock *BB,
3104 unsigned Opcode,
3105 bool Op0IsDef) const {
3106 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3107 MachineInstrBuilder MIB;
3108
3109 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
3110 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
3111 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
3112 MIB.add(MI.getOperand(I));
3113
3114 MI.eraseFromParent(); // The pseudo is gone now.
3115 return BB;
3116}
3117
3118MachineBasicBlock *
3119AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
3120 MachineInstr &MI,
3121 MachineBasicBlock *BB) const {
3122 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3123 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3124 unsigned StartIdx = 0;
3125
3126 bool HasTile = BaseReg != AArch64::ZA;
3127 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3128 if (HasZPROut) {
3129 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3130 ++StartIdx;
3131 }
3132 if (HasTile) {
3133 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3134 RegState::Define); // Output ZA Tile
3135 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
3136 StartIdx++;
3137 } else {
3138 // Avoids all instructions with mnemonic za.<sz>[Reg, Imm].
3139 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3140 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3141 ++StartIdx;
3142 }
3143 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3144 }
3145 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3146 MIB.add(MI.getOperand(I));
3147
3148 MI.eraseFromParent(); // The pseudo is gone now.
3149 return BB;
3150}
3151
3152MachineBasicBlock *AArch64TargetLowering::EmitZero(MachineInstr &MI,
3153 MachineBasicBlock *BB) const {
3154 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3155 MachineInstrBuilder MIB =
3156 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3157 MIB.add(MI.getOperand(0)); // Mask
3158
3159 unsigned Mask = MI.getOperand(0).getImm();
3160 for (unsigned I = 0; I < 8; I++) {
3161 if (Mask & (1 << I))
3162 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3163 }
3164
3165 MI.eraseFromParent(); // The pseudo is gone now.
3166 return BB;
3167}
3168
3169MachineBasicBlock *
3170AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
3171 MachineBasicBlock *BB) const {
3172 MachineFunction *MF = BB->getParent();
3173 MachineFrameInfo &MFI = MF->getFrameInfo();
3174 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3175 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3176 if (TPIDR2.Uses > 0) {
3177 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3178 // Store the buffer pointer to the TPIDR2 stack object.
3179 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
3180 .addReg(MI.getOperand(0).getReg())
3181 .addFrameIndex(TPIDR2.FrameIndex)
3182 .addImm(0);
3183 // Set the reserved bytes (10-15) to zero
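// The store immediates below are scaled by the access size: the halfword
// store at index 5 zeroes bytes 10-11 and the word store at index 3 zeroes
// bytes 12-15 of the 16-byte TPIDR2 block.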
3184 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
3185 .addReg(AArch64::WZR)
3186 .addFrameIndex(TPIDR2.FrameIndex)
3187 .addImm(5);
3188 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
3189 .addReg(AArch64::WZR)
3190 .addFrameIndex(TPIDR2.FrameIndex)
3191 .addImm(3);
3192 } else
3193 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3194
3195 BB->remove_instr(&MI);
3196 return BB;
3197}
3198
3199MachineBasicBlock *
3200AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI,
3201 MachineBasicBlock *BB) const {
3202 MachineFunction *MF = BB->getParent();
3203 MachineFrameInfo &MFI = MF->getFrameInfo();
3204 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3205 // TODO This function grows the stack with a subtraction, which doesn't work
3206 // on Windows. Some refactoring to share the functionality in
3207 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3208 // supports SME
3210 "Lazy ZA save is not yet supported on Windows");
3211
3212 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3213
3214 if (TPIDR2.Uses > 0) {
3215 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3216 MachineRegisterInfo &MRI = MF->getRegInfo();
3217
3218 // The SUBXrs below won't always be emitted in a form that accepts SP
3219 // directly
3220 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3221 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3222 .addReg(AArch64::SP);
3223
3224 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3225 auto Size = MI.getOperand(1).getReg();
3226 auto Dest = MI.getOperand(0).getReg();
3227 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3228 .addReg(Size)
3229 .addReg(Size)
3230 .addReg(SP);
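// MSUB computes SP - Size * Size here, so the stack pointer is moved down by
// Size*Size bytes (normally SVL * SVL) and Dest ends up pointing at the newly
// allocated lazy-save buffer.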
3231 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3232 AArch64::SP)
3233 .addReg(Dest);
3234
3235 // We have just allocated a variable sized object, tell this to PEI.
3236 MFI.CreateVariableSizedObject(Align(16), nullptr);
3237 }
3238
3239 BB->remove_instr(&MI);
3240 return BB;
3241}
3242
3243// TODO: Find a way to merge this with EmitAllocateZABuffer.
3244MachineBasicBlock *
3245AArch64TargetLowering::EmitAllocateSMESaveBuffer(MachineInstr &MI,
3246 MachineBasicBlock *BB) const {
3247 MachineFunction *MF = BB->getParent();
3248 MachineFrameInfo &MFI = MF->getFrameInfo();
3251 "Lazy ZA save is not yet supported on Windows");
3252
3253 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3254 if (FuncInfo->isSMESaveBufferUsed()) {
3255 // Allocate a buffer object of the size given by MI.getOperand(1).
3256 auto Size = MI.getOperand(1).getReg();
3257 auto Dest = MI.getOperand(0).getReg();
3258 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
3259 .addReg(AArch64::SP)
3260 .addReg(Size)
3261 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0));
3262 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
3263 .addReg(AArch64::SP);
3264
3265 // We have just allocated a variable sized object, tell this to PEI.
3266 MFI.CreateVariableSizedObject(Align(16), nullptr);
3267 } else
3268 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
3269 MI.getOperand(0).getReg());
3270
3271 BB->remove_instr(&MI);
3272 return BB;
3273}
3274
3275MachineBasicBlock *
3276AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI,
3277 MachineBasicBlock *BB) const {
3278 // If the buffer is used, emit a call to __arm_sme_state_size()
3279 MachineFunction *MF = BB->getParent();
3280 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3281 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3282 if (FuncInfo->isSMESaveBufferUsed()) {
3283 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3284 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
3285 .addExternalSymbol("__arm_sme_state_size")
3286 .addReg(AArch64::X0, RegState::ImplicitDefine)
3287 .addRegMask(TRI->getCallPreservedMask(
3288 *MF, CallingConv::
3290 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3291 MI.getOperand(0).getReg())
3292 .addReg(AArch64::X0);
3293 } else
3294 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3295 MI.getOperand(0).getReg())
3296 .addReg(AArch64::XZR);
3297 BB->remove_instr(&MI);
3298 return BB;
3299}
3300
3301MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
3302 MachineInstr &MI, MachineBasicBlock *BB) const {
3303
3304 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3305 if (SMEOrigInstr != -1) {
3306 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3307 uint64_t SMEMatrixType =
3308 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3309 switch (SMEMatrixType) {
3310 case (AArch64::SMEMatrixArray):
3311 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3312 case (AArch64::SMEMatrixTileB):
3313 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3314 case (AArch64::SMEMatrixTileH):
3315 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3316 case (AArch64::SMEMatrixTileS):
3317 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3318 case (AArch64::SMEMatrixTileD):
3319 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3320 case (AArch64::SMEMatrixTileQ):
3321 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3322 }
3323 }
3324
3325 switch (MI.getOpcode()) {
3326 default:
3327#ifndef NDEBUG
3328 MI.dump();
3329#endif
3330 llvm_unreachable("Unexpected instruction for custom inserter!");
3331 case AArch64::InitTPIDR2Obj:
3332 return EmitInitTPIDR2Object(MI, BB);
3333 case AArch64::AllocateZABuffer:
3334 return EmitAllocateZABuffer(MI, BB);
3335 case AArch64::AllocateSMESaveBuffer:
3336 return EmitAllocateSMESaveBuffer(MI, BB);
3337 case AArch64::GetSMESaveSize:
3338 return EmitGetSMESaveSize(MI, BB);
3339 case AArch64::F128CSEL:
3340 return EmitF128CSEL(MI, BB);
3341 case TargetOpcode::STATEPOINT:
3342 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
3343 // while bl call instruction (where statepoint will be lowered at the end)
3344 // has implicit def. This def is early-clobber as it will be set at
3345 // the moment of the call and earlier than any use is read.
3346 // Add this implicit dead def here as a workaround.
3347 MI.addOperand(*MI.getMF(),
3348 MachineOperand::CreateReg(
3349 AArch64::LR, /*isDef*/ true,
3350 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3351 /*isUndef*/ false, /*isEarlyClobber*/ true));
3352 [[fallthrough]];
3353 case TargetOpcode::STACKMAP:
3354 case TargetOpcode::PATCHPOINT:
3355 return emitPatchPoint(MI, BB);
3356
3357 case TargetOpcode::PATCHABLE_EVENT_CALL:
3358 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3359 return BB;
3360
3361 case AArch64::CATCHRET:
3362 return EmitLoweredCatchRet(MI, BB);
3363
3364 case AArch64::PROBED_STACKALLOC_DYN:
3365 return EmitDynamicProbedAlloc(MI, BB);
3366
3367 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3368 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3369 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3370 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3371 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3372 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3373 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3374 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3375 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3376 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3377 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3378 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3379 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3380 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3381 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3382 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3383 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3384 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3385 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3386 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3387 case AArch64::LDR_ZA_PSEUDO:
3388 return EmitFill(MI, BB);
3389 case AArch64::LDR_TX_PSEUDO:
3390 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3391 case AArch64::STR_TX_PSEUDO:
3392 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3393 case AArch64::ZERO_M_PSEUDO:
3394 return EmitZero(MI, BB);
3395 case AArch64::ZERO_T_PSEUDO:
3396 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3397 case AArch64::MOVT_TIZ_PSEUDO:
3398 return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3399 }
3400}
3401
3402//===----------------------------------------------------------------------===//
3403// AArch64 Lowering private implementation.
3404//===----------------------------------------------------------------------===//
3405
3406//===----------------------------------------------------------------------===//
3407// Lowering Code
3408//===----------------------------------------------------------------------===//
3409
3410// Forward declarations of SVE fixed length lowering helpers
3415 SelectionDAG &DAG);
3418 EVT VT);
3419
3420/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3421static bool isZerosVector(const SDNode *N) {
3422 // Look through a bit convert.
3423 while (N->getOpcode() == ISD::BITCAST)
3424 N = N->getOperand(0).getNode();
3425
3426 if (ISD::isConstantSplatVectorAllZeros(N))
3427 return true;
3428
3429 if (N->getOpcode() != AArch64ISD::DUP)
3430 return false;
3431
3432 auto Opnd0 = N->getOperand(0);
3433 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3434}
3435
3436/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3437/// CC
3438static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3439 switch (CC) {
3440 default:
3441 llvm_unreachable("Unknown condition code!");
3442 case ISD::SETNE:
3443 return AArch64CC::NE;
3444 case ISD::SETEQ:
3445 return AArch64CC::EQ;
3446 case ISD::SETGT:
3447 return AArch64CC::GT;
3448 case ISD::SETGE:
3449 return AArch64CC::GE;
3450 case ISD::SETLT:
3451 return AArch64CC::LT;
3452 case ISD::SETLE:
3453 return AArch64CC::LE;
3454 case ISD::SETUGT:
3455 return AArch64CC::HI;
3456 case ISD::SETUGE:
3457 return AArch64CC::HS;
3458 case ISD::SETULT:
3459 return AArch64CC::LO;
3460 case ISD::SETULE:
3461 return AArch64CC::LS;
3462 }
3463}
3464
3465/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3466static void changeFPCCToAArch64CC(ISD::CondCode CC,
3467 AArch64CC::CondCode &CondCode,
3468 AArch64CC::CondCode &CondCode2) {
3469 CondCode2 = AArch64CC::AL;
3470 switch (CC) {
3471 default:
3472 llvm_unreachable("Unknown FP condition!");
3473 case ISD::SETEQ:
3474 case ISD::SETOEQ:
3475 CondCode = AArch64CC::EQ;
3476 break;
3477 case ISD::SETGT:
3478 case ISD::SETOGT:
3479 CondCode = AArch64CC::GT;
3480 break;
3481 case ISD::SETGE:
3482 case ISD::SETOGE:
3483 CondCode = AArch64CC::GE;
3484 break;
3485 case ISD::SETOLT:
3486 CondCode = AArch64CC::MI;
3487 break;
3488 case ISD::SETOLE:
3489 CondCode = AArch64CC::LS;
3490 break;
3491 case ISD::SETONE:
3492 CondCode = AArch64CC::MI;
3493 CondCode2 = AArch64CC::GT;
3494 break;
3495 case ISD::SETO:
3496 CondCode = AArch64CC::VC;
3497 break;
3498 case ISD::SETUO:
3499 CondCode = AArch64CC::VS;
3500 break;
3501 case ISD::SETUEQ:
3502 CondCode = AArch64CC::EQ;
3503 CondCode2 = AArch64CC::VS;
3504 break;
3505 case ISD::SETUGT:
3506 CondCode = AArch64CC::HI;
3507 break;
3508 case ISD::SETUGE:
3509 CondCode = AArch64CC::PL;
3510 break;
3511 case ISD::SETLT:
3512 case ISD::SETULT:
3513 CondCode = AArch64CC::LT;
3514 break;
3515 case ISD::SETLE:
3516 case ISD::SETULE:
3517 CondCode = AArch64CC::LE;
3518 break;
3519 case ISD::SETNE:
3520 case ISD::SETUNE:
3521 CondCode = AArch64CC::NE;
3522 break;
3523 }
3524}
3525
3526/// Convert a DAG fp condition code to an AArch64 CC.
3527/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3528/// should be AND'ed instead of OR'ed.
3529static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3530 AArch64CC::CondCode &CondCode,
3531 AArch64CC::CondCode &CondCode2) {
3532 CondCode2 = AArch64CC::AL;
3533 switch (CC) {
3534 default:
3535 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3536 assert(CondCode2 == AArch64CC::AL);
3537 break;
3538 case ISD::SETONE:
3539 // (a one b)
3540 // == ((a olt b) || (a ogt b))
3541 // == ((a ord b) && (a une b))
3542 CondCode = AArch64CC::VC;
3543 CondCode2 = AArch64CC::NE;
3544 break;
3545 case ISD::SETUEQ:
3546 // (a ueq b)
3547 // == ((a uno b) || (a oeq b))
3548 // == ((a ule b) && (a uge b))
3549 CondCode = AArch64CC::PL;
3550 CondCode2 = AArch64CC::LE;
3551 break;
3552 }
3553}
3554
3555/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3556/// CC usable with the vector instructions. Fewer operations are available
3557/// without a real NZCV register, so we have to use less efficient combinations
3558/// to get the same effect.
3559static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3560 AArch64CC::CondCode &CondCode,
3561 AArch64CC::CondCode &CondCode2,
3562 bool &Invert) {
3563 Invert = false;
3564 switch (CC) {
3565 default:
3566 // Mostly the scalar mappings work fine.
3567 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3568 break;
3569 case ISD::SETUO:
3570 Invert = true;
3571 [[fallthrough]];
3572 case ISD::SETO:
3573 CondCode = AArch64CC::MI;
3574 CondCode2 = AArch64CC::GE;
3575 break;
3576 case ISD::SETUEQ:
3577 case ISD::SETULT:
3578 case ISD::SETULE:
3579 case ISD::SETUGT:
3580 case ISD::SETUGE:
3581 // All of the compare-mask comparisons are ordered, but we can switch
3582 // between the two by a double inversion. E.g. ULE == !OGT.
3583 Invert = true;
3584 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3585 CondCode, CondCode2);
3586 break;
3587 }
3588}
3589
3590static bool isLegalArithImmed(uint64_t C) {
3591 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3592 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3593 LLVM_DEBUG(dbgs() << "Is imm " << C
3594 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3595 return IsLegal;
3596}
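// For example: 4095 fits in 12 bits and 0x123000 is a 12-bit value shifted
// left by 12, so both are legal arithmetic immediates, while 4097 is neither
// and has to be materialized into a register first.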
3597
3598static bool cannotBeIntMin(SDValue CheckedVal, SelectionDAG &DAG) {
3599 KnownBits KnownSrc = DAG.computeKnownBits(CheckedVal);
3600 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3601}
3602
3603 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3604// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3605// can be set differently by this operation. It comes down to whether
3606// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3607// everything is fine. If not then the optimization is wrong. Thus general
3608// comparisons are only valid if op2 != 0.
3609//
3610// So, finally, the only LLVM-native comparisons that don't mention C or V
3611// are the ones that aren't unsigned comparisons. They're the only ones we can
3612// safely use CMN for in the absence of information about op2.
3613static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG) {
3614 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3615 (isIntEqualitySetCC(CC) ||
3616 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3617 (isSignedIntSetCC(CC) && cannotBeIntMin(Op.getOperand(1), DAG)));
3618}
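// Example of why op2 must be known non-zero for unsigned compares: with
// op2 == 0, "subs x, #0" sets C = 1 for every x, while "adds x, #0" sets
// C = 0, so HS/LO results would differ. For signed compares the analogous
// hazard is op2 == INT_MIN, whose negation overflows (see cannotBeIntMin).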
3619
3620static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3621 SelectionDAG &DAG, SDValue Chain,
3622 bool IsSignaling) {
3623 EVT VT = LHS.getValueType();
3624 assert(VT != MVT::f128);
3625
3626 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3627
3628 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3629 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3630 {Chain, LHS});
3631 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3632 {LHS.getValue(1), RHS});
3633 Chain = RHS.getValue(1);
3634 }
3635 unsigned Opcode =
3636 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3637 return DAG.getNode(Opcode, dl, {MVT::i32, MVT::Other}, {Chain, LHS, RHS});
3638}
3639
3640static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3641 const SDLoc &dl, SelectionDAG &DAG) {
3642 EVT VT = LHS.getValueType();
3643 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3644
3645 if (VT.isFloatingPoint()) {
3646 assert(VT != MVT::f128);
3647 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3648 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3649 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3650 }
3651 return DAG.getNode(AArch64ISD::FCMP, dl, MVT::i32, LHS, RHS);
3652 }
3653
3654 // The CMP instruction is just an alias for SUBS, and representing it as
3655 // SUBS means that it's possible to get CSE with subtract operations.
3656 // A later phase can perform the optimization of setting the destination
3657 // register to WZR/XZR if it ends up being unused.
3658 unsigned Opcode = AArch64ISD::SUBS;
3659
3660 if (isCMN(RHS, CC, DAG)) {
3661 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3662 Opcode = AArch64ISD::ADDS;
3663 RHS = RHS.getOperand(1);
3664 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3665 isIntEqualitySetCC(CC)) {
3666 // As we are looking for EQ/NE compares, the operands can be commuted; can
3667 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
3668 Opcode = AArch64ISD::ADDS;
3669 LHS = LHS.getOperand(1);
3670 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3671 if (LHS.getOpcode() == ISD::AND) {
3672 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3673 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3674 // of the signed comparisons.
3675 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3676 DAG.getVTList(VT, MVT_CC),
3677 LHS.getOperand(0),
3678 LHS.getOperand(1));
3679 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3680 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3681 return ANDSNode.getValue(1);
3682 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3683 // Use result of ANDS
3684 return LHS.getValue(1);
3685 }
3686 }
3687
3688 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3689 .getValue(1);
3690}
3691
3692/// \defgroup AArch64CCMP CMP;CCMP matching
3693///
3694/// These functions deal with the formation of CMP;CCMP;... sequences.
3695/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3696/// a comparison. They set the NZCV flags to a predefined value if their
3697/// predicate is false. This allows to express arbitrary conjunctions, for
3698/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3699/// expressed as:
3700/// cmp A
3701/// ccmp B, inv(CB), CA
3702/// check for CB flags
3703///
3704/// This naturally lets us implement chains of AND operations with SETCC
3705/// operands. And we can even implement some other situations by transforming
3706/// them:
3707/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3708/// negating the flags used in a CCMP/FCCMP operations.
3709/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3710/// by negating the flags we test for afterwards. i.e.
3711/// NEG (CMP CCMP CCCMP ...) can be implemented.
3712/// - Note that we can only ever negate all previously processed results.
3713/// What we can not implement by flipping the flags to test is a negation
3714/// of two sub-trees (because the negation affects all sub-trees emitted so
3715/// far, so the 2nd sub-tree we emit would also affect the first).
3716/// With those tools we can implement some OR operations:
3717/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3718/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3719/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3720/// elimination rules from earlier to implement the whole thing as a
3721/// CCMP/FCCMP chain.
3722///
3723/// As complete example:
3724/// or (or (setCA (cmp A)) (setCB (cmp B)))
3725/// (and (setCC (cmp C)) (setCD (cmp D)))"
3726/// can be reassociated to:
3727/// or (and (setCC (cmp C)) setCD (cmp D))
3728/// (or (setCA (cmp A)) (setCB (cmp B)))
3729/// can be transformed to:
3730/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3731/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3732/// which can be implemented as:
3733/// cmp C
3734/// ccmp D, inv(CD), CC
3735/// ccmp A, CA, inv(CD)
3736/// ccmp B, CB, inv(CA)
3737/// check for CB flags
3738///
3739/// A counterexample is "or (and A B) (and C D)" which translates to
3740/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3741/// can only implement 1 of the inner (not) operations, but not both!
3742/// @{
3743
3744/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3745static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3746 ISD::CondCode CC, SDValue CCOp,
3747 AArch64CC::CondCode Predicate,
3748 AArch64CC::CondCode OutCC,
3749 const SDLoc &DL, SelectionDAG &DAG) {
3750 unsigned Opcode = 0;
3751 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3752
3753 if (LHS.getValueType().isFloatingPoint()) {
3754 assert(LHS.getValueType() != MVT::f128);
3755 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3756 LHS.getValueType() == MVT::bf16) {
3757 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3758 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3759 }
3760 Opcode = AArch64ISD::FCCMP;
3761 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3762 APInt Imm = Const->getAPIntValue();
3763 if (Imm.isNegative() && Imm.sgt(-32)) {
3764 Opcode = AArch64ISD::CCMN;
3765 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3766 }
3767 } else if (isCMN(RHS, CC, DAG)) {
3768 Opcode = AArch64ISD::CCMN;
3769 RHS = RHS.getOperand(1);
3770 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3771 isIntEqualitySetCC(CC)) {
3772 // As we are looking for EQ/NE compares, the operands can be commuted; can
3773 // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction ?
3774 Opcode = AArch64ISD::CCMN;
3775 LHS = LHS.getOperand(1);
3776 }
3777 if (Opcode == 0)
3778 Opcode = AArch64ISD::CCMP;
3779
3780 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3781 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3782 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3783 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3784 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3785}
3786
3787/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3788/// expressed as a conjunction. See \ref AArch64CCMP.
3789/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3790/// changing the conditions on the SETCC tests.
3791/// (this means we can call emitConjunctionRec() with
3792/// Negate==true on this sub-tree)
3793/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3794/// cannot do the negation naturally. We are required to
3795/// emit the subtree first in this case.
3796/// \param WillNegate Is true if are called when the result of this
3797/// subexpression must be negated. This happens when the
3798/// outer expression is an OR. We can use this fact to know
3799/// that we have a double negation (or (or ...) ...) that
3800/// can be implemented for free.
3801static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3802 bool &MustBeFirst, bool WillNegate,
3803 unsigned Depth = 0) {
3804 if (!Val.hasOneUse())
3805 return false;
3806 unsigned Opcode = Val->getOpcode();
3807 if (Opcode == ISD::SETCC) {
3808 if (Val->getOperand(0).getValueType() == MVT::f128)
3809 return false;
3810 CanNegate = true;
3811 MustBeFirst = false;
3812 return true;
3813 }
3814 // Protect against exponential runtime and stack overflow.
3815 if (Depth > 6)
3816 return false;
3817 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3818 bool IsOR = Opcode == ISD::OR;
3819 SDValue O0 = Val->getOperand(0);
3820 SDValue O1 = Val->getOperand(1);
3821 bool CanNegateL;
3822 bool MustBeFirstL;
3823 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3824 return false;
3825 bool CanNegateR;
3826 bool MustBeFirstR;
3827 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3828 return false;
3829
3830 if (MustBeFirstL && MustBeFirstR)
3831 return false;
3832
3833 if (IsOR) {
3834 // For an OR expression we need to be able to naturally negate at least
3835 // one side or we cannot do the transformation at all.
3836 if (!CanNegateL && !CanNegateR)
3837 return false;
3838 // If the result of the OR will be negated and we can naturally negate
3839 // the leafs, then this sub-tree as a whole negates naturally.
3840 CanNegate = WillNegate && CanNegateL && CanNegateR;
3841 // If we cannot naturally negate the whole sub-tree, then this must be
3842 // emitted first.
3843 MustBeFirst = !CanNegate;
3844 } else {
3845 assert(Opcode == ISD::AND && "Must be OR or AND");
3846 // We cannot naturally negate an AND operation.
3847 CanNegate = false;
3848 MustBeFirst = MustBeFirstL || MustBeFirstR;
3849 }
3850 return true;
3851 }
3852 return false;
3853}
3854
3855/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3856/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
3857/// Tries to transform the given i1 producing node @p Val to a series compare
3858/// and conditional compare operations. @returns an NZCV flags producing node
3859/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
3860/// transformation was not possible.
3861/// \p Negate is true if we want this sub-tree being negated just by changing
3862/// SETCC conditions.
3863static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3864 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3865 AArch64CC::CondCode Predicate) {
3866 // We're at a tree leaf, produce a conditional comparison operation.
3867 unsigned Opcode = Val->getOpcode();
3868 if (Opcode == ISD::SETCC) {
3869 SDValue LHS = Val->getOperand(0);
3870 SDValue RHS = Val->getOperand(1);
3871 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3872 bool isInteger = LHS.getValueType().isInteger();
3873 if (Negate)
3874 CC = getSetCCInverse(CC, LHS.getValueType());
3875 SDLoc DL(Val);
3876 // Determine OutCC and handle FP special case.
3877 if (isInteger) {
3878 OutCC = changeIntCCToAArch64CC(CC);
3879 } else {
3880 assert(LHS.getValueType().isFloatingPoint());
3881 AArch64CC::CondCode ExtraCC;
3882 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3883 // Some floating point conditions can't be tested with a single condition
3884 // code. Construct an additional comparison in this case.
3885 if (ExtraCC != AArch64CC::AL) {
3886 SDValue ExtraCmp;
3887 if (!CCOp.getNode())
3888 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3889 else
3890 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3891 ExtraCC, DL, DAG);
3892 CCOp = ExtraCmp;
3893 Predicate = ExtraCC;
3894 }
3895 }
3896
3897 // Produce a normal comparison if we are first in the chain
3898 if (!CCOp)
3899 return emitComparison(LHS, RHS, CC, DL, DAG);
3900 // Otherwise produce a ccmp.
3901 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3902 DAG);
3903 }
3904 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3905
3906 bool IsOR = Opcode == ISD::OR;
3907
3908 SDValue LHS = Val->getOperand(0);
3909 bool CanNegateL;
3910 bool MustBeFirstL;
3911 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3912 assert(ValidL && "Valid conjunction/disjunction tree");
3913 (void)ValidL;
3914
3915 SDValue RHS = Val->getOperand(1);
3916 bool CanNegateR;
3917 bool MustBeFirstR;
3918 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3919 assert(ValidR && "Valid conjunction/disjunction tree");
3920 (void)ValidR;
3921
3922 // Swap sub-tree that must come first to the right side.
3923 if (MustBeFirstL) {
3924 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3925 std::swap(LHS, RHS);
3926 std::swap(CanNegateL, CanNegateR);
3927 std::swap(MustBeFirstL, MustBeFirstR);
3928 }
3929
3930 bool NegateR;
3931 bool NegateAfterR;
3932 bool NegateL;
3933 bool NegateAfterAll;
3934 if (Opcode == ISD::OR) {
3935 // Swap the sub-tree that we can negate naturally to the left.
3936 if (!CanNegateL) {
3937 assert(CanNegateR && "at least one side must be negatable");
3938 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3939 assert(!Negate);
3940 std::swap(LHS, RHS);
3941 NegateR = false;
3942 NegateAfterR = true;
3943 } else {
3944 // Negate the left sub-tree if possible, otherwise negate the result.
3945 NegateR = CanNegateR;
3946 NegateAfterR = !CanNegateR;
3947 }
3948 NegateL = true;
3949 NegateAfterAll = !Negate;
3950 } else {
3951 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3952 assert(!Negate && "Valid conjunction/disjunction tree");
3953
3954 NegateL = false;
3955 NegateR = false;
3956 NegateAfterR = false;
3957 NegateAfterAll = false;
3958 }
3959
3960 // Emit sub-trees.
3961 AArch64CC::CondCode RHSCC;
3962 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3963 if (NegateAfterR)
3964 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3965 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3966 if (NegateAfterAll)
3967 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3968 return CmpL;
3969}
3970
3971/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
3972/// In some cases this is even possible with OR operations in the expression.
3973/// See \ref AArch64CCMP.
3974/// \see emitConjunctionRec().
3975static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3976 AArch64CC::CondCode &OutCC) {
3977 bool DummyCanNegate;
3978 bool DummyMustBeFirst;
3979 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3980 return SDValue();
3981
3982 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3983}
3984
3985/// @}
3986
3987/// Returns how profitable it is to fold a comparison's operand's shift and/or
3988/// extension operations.
3989static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3990 auto isSupportedExtend = [&](SDValue V) {
3991 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3992 return true;
3993
3994 if (V.getOpcode() == ISD::AND)
3995 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3996 uint64_t Mask = MaskCst->getZExtValue();
3997 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3998 }
3999
4000 return false;
4001 };
4002
4003 if (!Op.hasOneUse())
4004 return 0;
4005
4006 if (isSupportedExtend(Op))
4007 return 1;
4008
4009 unsigned Opc = Op.getOpcode();
4010 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
4011 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4012 uint64_t Shift = ShiftCst->getZExtValue();
4013 if (isSupportedExtend(Op.getOperand(0)))
4014 return (Shift <= 4) ? 2 : 1;
4015 EVT VT = Op.getValueType();
4016 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
4017 return 1;
4018 }
4019
4020 return 0;
4021}
4022
4023static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4024 SDValue &AArch64cc, SelectionDAG &DAG,
4025 const SDLoc &dl) {
4026 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4027 EVT VT = RHS.getValueType();
4028 uint64_t C = RHSC->getZExtValue();
4029 if (!isLegalArithImmed(C)) {
4030 // Constant does not fit, try adjusting it by one?
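// For example "x s< 0x1001" has an unencodable immediate, but it is
// equivalent to "x s<= 0x1000", whose immediate is legal.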
4031 switch (CC) {
4032 default:
4033 break;
4034 case ISD::SETLT:
4035 case ISD::SETGE:
4036 if ((VT == MVT::i32 && C != 0x80000000 &&
4037 isLegalArithImmed((uint32_t)(C - 1))) ||
4038 (VT == MVT::i64 && C != 0x80000000ULL &&
4039 isLegalArithImmed(C - 1ULL))) {
4040 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4041 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
4042 RHS = DAG.getConstant(C, dl, VT);
4043 }
4044 break;
4045 case ISD::SETULT:
4046 case ISD::SETUGE:
4047 if ((VT == MVT::i32 && C != 0 &&
4048 isLegalArithImmed((uint32_t)(C - 1))) ||
4049 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
4050 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4051 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
4052 RHS = DAG.getConstant(C, dl, VT);
4053 }
4054 break;
4055 case ISD::SETLE:
4056 case ISD::SETGT:
4057 if ((VT == MVT::i32 && C != INT32_MAX &&
4058 isLegalArithImmed((uint32_t)(C + 1))) ||
4059 (VT == MVT::i64 && C != INT64_MAX &&
4060 isLegalArithImmed(C + 1ULL))) {
4061 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4062 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
4063 RHS = DAG.getConstant(C, dl, VT);
4064 }
4065 break;
4066 case ISD::SETULE:
4067 case ISD::SETUGT:
4068 if ((VT == MVT::i32 && C != UINT32_MAX &&
4069 isLegalArithImmed((uint32_t)(C + 1))) ||
4070 (VT == MVT::i64 && C != UINT64_MAX &&
4071 isLegalArithImmed(C + 1ULL))) {
4072 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4073 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
4074 RHS = DAG.getConstant(C, dl, VT);
4075 }
4076 break;
4077 }
4078 }
4079 }
4080
4081 // Comparisons are canonicalized so that the RHS operand is simpler than the
4082 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
4083 // can fold some shift+extend operations on the RHS operand, so swap the
4084 // operands if that can be done.
4085 //
4086 // For example:
4087 // lsl w13, w11, #1
4088 // cmp w13, w12
4089 // can be turned into:
4090 // cmp w12, w11, lsl #1
4091 if (!isa<ConstantSDNode>(RHS) ||
4092 !isLegalArithImmed(RHS->getAsAPIntVal().abs().getZExtValue())) {
4093 bool LHSIsCMN = isCMN(LHS, CC, DAG);
4094 bool RHSIsCMN = isCMN(RHS, CC, DAG);
4095 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
4096 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
4097
4098 if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
4099 getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
4100 std::swap(LHS, RHS);
4101 CC = ISD::getSetCCSwappedOperands(CC);
4102 }
4103 }
4104
4105 SDValue Cmp;
4106 AArch64CC::CondCode AArch64CC;
4107 if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
4108 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
4109
4110 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
4111 // For the i8 operand, the largest immediate is 255, so this can be easily
4112 // encoded in the compare instruction. For the i16 operand, however, the
4113 // largest immediate cannot be encoded in the compare.
4114 // Therefore, use a sign extending load and cmn to avoid materializing the
4115 // -1 constant. For example,
4116 // movz w1, #65535
4117 // ldrh w0, [x0, #0]
4118 // cmp w0, w1
4119 // >
4120 // ldrsh w0, [x0, #0]
4121 // cmn w0, #1
4122 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
4123 // if and only if (sext LHS) == (sext RHS). The checks are in place to
4124 // ensure both the LHS and RHS are truly zero extended and to make sure the
4125 // transformation is profitable.
4126 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
4127 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
4128 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
4129 LHS.getNode()->hasNUsesOfValue(1, 0)) {
4130 int16_t ValueofRHS = RHS->getAsZExtVal();
4131 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
4132 SDValue SExt =
4133 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
4134 DAG.getValueType(MVT::i16));
4135 Cmp = emitComparison(
4136 SExt, DAG.getSignedConstant(ValueofRHS, dl, RHS.getValueType()), CC,
4137 dl, DAG);
4138 AArch64CC = changeIntCCToAArch64CC(CC);
4139 }
4140 }
4141
4142 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
4143 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
4144 if ((CC == ISD::SETNE) ^ RHSC->isZero())
4145 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
4146 }
4147 }
4148 }
4149
4150 if (!Cmp) {
4151 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
4152 AArch64CC = changeIntCCToAArch64CC(CC);
4153 }
4154 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
4155 return Cmp;
4156}
4157
4158static std::pair<SDValue, SDValue>
4159getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
4160 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
4161 "Unsupported value type");
4162 SDValue Value, Overflow;
4163 SDLoc DL(Op);
4164 SDValue LHS = Op.getOperand(0);
4165 SDValue RHS = Op.getOperand(1);
4166 unsigned Opc = 0;
4167 switch (Op.getOpcode()) {
4168 default:
4169 llvm_unreachable("Unknown overflow instruction!");
4170 case ISD::SADDO:
4171 Opc = AArch64ISD::ADDS;
4172 CC = AArch64CC::VS;
4173 break;
4174 case ISD::UADDO:
4175 Opc = AArch64ISD::ADDS;
4176 CC = AArch64CC::HS;
4177 break;
4178 case ISD::SSUBO:
4179 Opc = AArch64ISD::SUBS;
4180 CC = AArch64CC::VS;
4181 break;
4182 case ISD::USUBO:
4183 Opc = AArch64ISD::SUBS;
4184 CC = AArch64CC::LO;
4185 break;
4186 // Multiply needs a little bit of extra work.
4187 case ISD::SMULO:
4188 case ISD::UMULO: {
4189 CC = AArch64CC::NE;
4190 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4191 if (Op.getValueType() == MVT::i32) {
4192 // Extend to 64-bits, then perform a 64-bit multiply.
4193 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4194 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4195 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4196 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4197 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4198
4199 // Check that the result fits into a 32-bit integer.
4200 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
4201 if (IsSigned) {
4202 // cmp xreg, wreg, sxtw
4203 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4204 Overflow =
4205 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4206 } else {
4207 // tst xreg, #0xffffffff00000000
4208 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4209 Overflow =
4210 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4211 }
4212 break;
4213 }
4214 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4215 // For the 64 bit multiply
4216 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4217 if (IsSigned) {
4218 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4219 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4220 DAG.getConstant(63, DL, MVT::i64));
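// The product fits in 64 bits iff the high half (MULHS) equals the sign bit
// of the low half replicated into all 64 bits (Value >> 63), so a mismatch
// between the two signals overflow.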
4221 // It is important that LowerBits is last, otherwise the arithmetic
4222 // shift will not be folded into the compare (SUBS).
4223 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
4224 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4225 .getValue(1);
4226 } else {
4227 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4228 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
4229 Overflow =
4230 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4231 DAG.getConstant(0, DL, MVT::i64),
4232 UpperBits).getValue(1);
4233 }
4234 break;
4235 }
4236 } // switch (...)
4237
4238 if (Opc) {
4239 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
4240
4241 // Emit the AArch64 operation with overflow check.
4242 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4243 Overflow = Value.getValue(1);
4244 }
4245 return std::make_pair(Value, Overflow);
4246}
4247
4248SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4249 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4250 !Subtarget->isNeonAvailable()))
4251 return LowerToScalableOp(Op, DAG);
4252
4253 SDValue Sel = Op.getOperand(0);
4254 SDValue Other = Op.getOperand(1);
4255 SDLoc dl(Sel);
4256
4257 // If the operand is an overflow checking operation, invert the condition
4258 // code and kill the Not operation. I.e., transform:
4259 // (xor (overflow_op_bool, 1))
4260 // -->
4261 // (csel 1, 0, invert(cc), overflow_op_bool)
4262 // ... which later gets transformed to just a cset instruction with an
4263 // inverted condition code, rather than a cset + eor sequence.
4264 if (isOverflowIntrOpRes(Sel)) {
4265 // Only lower legal XALUO ops.
4266 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
4267 return SDValue();
4268
4269 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4270 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4271 AArch64CC::CondCode CC;
4272 SDValue Value, Overflow;
4273 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4274 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4275 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
4276 CCVal, Overflow);
4277 }
4278 // If neither operand is a SELECT_CC, give up.
4279 if (Sel.getOpcode() != ISD::SELECT_CC)
4280 std::swap(Sel, Other);
4281 if (Sel.getOpcode() != ISD::SELECT_CC)
4282 return Op;
4283
4284 // The folding we want to perform is:
4285 // (xor x, (select_cc a, b, cc, 0, -1) )
4286 // -->
4287 // (csel x, (xor x, -1), cc ...)
4288 //
4289 // The latter will get matched to a CSINV instruction.
4290
4291 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4292 SDValue LHS = Sel.getOperand(0);
4293 SDValue RHS = Sel.getOperand(1);
4294 SDValue TVal = Sel.getOperand(2);
4295 SDValue FVal = Sel.getOperand(3);
4296
4297 // FIXME: This could be generalized to non-integer comparisons.
4298 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4299 return Op;
4300
4301 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4302 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4303
4304 // The values aren't constants, this isn't the pattern we're looking for.
4305 if (!CFVal || !CTVal)
4306 return Op;
4307
4308 // We can commute the SELECT_CC by inverting the condition. This
4309 // might be needed to make this fit into a CSINV pattern.
4310 if (CTVal->isAllOnes() && CFVal->isZero()) {
4311 std::swap(TVal, FVal);
4312 std::swap(CTVal, CFVal);
4313 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4314 }
4315
4316 // If the constants line up, perform the transform!
4317 if (CTVal->isZero() && CFVal->isAllOnes()) {
4318 SDValue CCVal;
4319 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
4320
4321 FVal = Other;
4322 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
4323 DAG.getAllOnesConstant(dl, Other.getValueType()));
4324
4325 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
4326 CCVal, Cmp);
4327 }
4328
4329 return Op;
4330}
4331
4332// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4333// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4334// sets 'C' bit to 0.
4335static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
4336 SDLoc DL(Value);
4337 EVT VT = Value.getValueType();
4338 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4339 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4340 SDValue Cmp =
4341 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
4342 return Cmp.getValue(1);
4343}
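// SUBS sets C when the subtraction does not borrow, so "SUBS Value, #1" gives
// C = (Value != 0) and "SUBS #0, Value" gives C = (Value == 0), which is
// exactly the behaviour described above.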
4344
4345// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4346// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4347static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
4348 bool Invert) {
4349 assert(Glue.getResNo() == 1);
4350 SDLoc DL(Glue);
4351 SDValue Zero = DAG.getConstant(0, DL, VT);
4352 SDValue One = DAG.getConstant(1, DL, VT);
4353 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4354 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
4355 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4356}
4357
4358// Value is 1 if 'V' bit of NZCV is 1, else 0
4359static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
4360 assert(Glue.getResNo() == 1);
4361 SDLoc DL(Glue);
4362 SDValue Zero = DAG.getConstant(0, DL, VT);
4363 SDValue One = DAG.getConstant(1, DL, VT);
4364 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
4365 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4366}
4367
4368// This lowering is inefficient, but it will get cleaned up by
4369// `foldOverflowCheck`
4370static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
4371 unsigned Opcode, bool IsSigned) {
4372 EVT VT0 = Op.getValue(0).getValueType();
4373 EVT VT1 = Op.getValue(1).getValueType();
4374
4375 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4376 return SDValue();
4377
4378 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4379 SDValue OpLHS = Op.getOperand(0);
4380 SDValue OpRHS = Op.getOperand(1);
4381 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4382
4383 SDLoc DL(Op);
4384 SDVTList VTs = DAG.getVTList(VT0, VT1);
4385
4386 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
4387 OpRHS, OpCarryIn);
4388
4389 SDValue OutFlag =
4390 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4391 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4392
4393 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
4394}
4395
4396static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
4397 // Let legalize expand this if it isn't a legal type yet.
4398 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4399 return SDValue();
4400
4401 SDLoc dl(Op);
4402 AArch64CC::CondCode CC;
4403 // The actual operation that sets the overflow or carry flag.
4404 SDValue Value, Overflow;
4405 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4406
4407 // We use 0 and 1 as false and true values.
4408 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4409 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4410
4411 // We use an inverted condition, because the conditional select is inverted
4412 // too. This will allow it to be selected to a single instruction:
4413 // CSINC Wd, WZR, WZR, invert(cond).
4414 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4415 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
4416 CCVal, Overflow);
4417
4418 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4419 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4420}
4421
4422// Prefetch operands are:
4423// 1: Address to prefetch
4424// 2: bool isWrite
4425// 3: int locality (0 = no locality ... 3 = extreme locality)
4426// 4: bool isDataCache
4427static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4428 SDLoc DL(Op);
4429 unsigned IsWrite = Op.getConstantOperandVal(2);
4430 unsigned Locality = Op.getConstantOperandVal(3);
4431 unsigned IsData = Op.getConstantOperandVal(4);
4432
4433 bool IsStream = !Locality;
4434 // When the locality number is set
4435 if (Locality) {
4436 // The front-end should have filtered out the out-of-range values
4437 assert(Locality <= 3 && "Prefetch locality out-of-range");
4438 // The locality degree is the opposite of the cache speed.
4439 // Put the number the other way around.
4440 // The encoding starts at 0 for level 1
4441 Locality = 3 - Locality;
4442 }
4443
4444 // Build the mask value encoding the expected behavior.
4445 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4446 (!IsData << 3) | // IsDataCache bit
4447 (Locality << 1) | // Cache level bits
4448 (unsigned)IsStream; // Stream bit
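// For example a read prefetch of data with extreme locality (isWrite=0,
// locality=3, isDataCache=1) encodes as PrfOp = 0b00000, i.e. PLDL1KEEP.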
4449 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4450 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4451 Op.getOperand(1));
4452}
4453
4454// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1)) 0 EQ when Y is
4455// a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of SUBS
4456// (AND X Y) Z which produces a better opt with EmitComparison
4457static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS,
4458 SelectionDAG &DAG, const SDLoc dl) {
4459 if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4460 ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
4461 ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
4462 if (LHSConstOp && RHSConst) {
4463 uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4464 uint64_t RHSConstant = RHSConst->getZExtValue();
4465 if (isPowerOf2_64(RHSConstant)) {
4466 uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4467 LHS =
4468 DAG.getNode(ISD::AND, dl, LHS.getValueType(), LHS.getOperand(0),
4469 DAG.getConstant(NewMaskValue, dl, LHS.getValueType()));
4470 RHS = DAG.getConstant(0, dl, RHS.getValueType());
4471 CC = ISD::SETEQ;
4472 }
4473 }
4474 }
4475}
4476
4477SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4478 SelectionDAG &DAG) const {
4479 EVT VT = Op.getValueType();
4480 if (VT.isScalableVector()) {
4481 SDValue SrcVal = Op.getOperand(0);
4482
4483 if (SrcVal.getValueType().getScalarType() == MVT::bf16) {
4484 // bf16 and f32 share the same exponent range so the conversion requires
4485 // them to be aligned with the new mantissa bits zero'd. This is just a
4486 // left shift that is best to isel directly.
4487 if (VT == MVT::nxv2f32 || VT == MVT::nxv4f32)
4488 return Op;
4489
4490 if (VT != MVT::nxv2f64)
4491 return SDValue();
4492
4493 // Break other conversions in two with the first part converting to f32
4494 // and the second using native f32->VT instructions.
4495 SDLoc DL(Op);
4496 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4497 DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4498 }
4499
4500 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4501 }
4502
4503 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4504 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4505
4506 bool IsStrict = Op->isStrictFPOpcode();
4507 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4508 EVT Op0VT = Op0.getValueType();
4509 if (VT == MVT::f64) {
4510 // FP32->FP64 and FP16->FP64 extends are legal.
4511 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4512 return Op;
4513 // Split bf16->f64 extends into two fpextends.
4514 if (Op0VT == MVT::bf16 && IsStrict) {
4515 SDValue Ext1 =
4516 DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4517 {Op0, Op.getOperand(0)});
4518 return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4519 {Ext1, Ext1.getValue(1)});
4520 }
4521 if (Op0VT == MVT::bf16)
4522 return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4523 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4524 return SDValue();
4525 }
4526
4527 if (VT.getScalarType() == MVT::f32) {
4528 // FP16->FP32 extends are legal for f32 and v4f32.
4529 if (Op0VT.getScalarType() == MVT::f16)
4530 return Op;
4531 if (Op0VT.getScalarType() == MVT::bf16) {
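// bf16 is the top half of an f32 with the same exponent layout, so the
// extend is just a shift of the 16 payload bits into the upper half of an
// i32 followed by a bitcast back to f32.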
4532 SDLoc DL(Op);
4533 EVT IVT = VT.changeTypeToInteger();
4534 if (!Op0VT.isVector()) {
4535 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4bf16, Op0);
4536 IVT = MVT::v4i32;
4537 }
4538
4539 EVT Op0IVT = Op0.getValueType().changeTypeToInteger();
4540 SDValue Ext =
4541 DAG.getNode(ISD::ANY_EXTEND, DL, IVT, DAG.getBitcast(Op0IVT, Op0));
4542 SDValue Shift =
4543 DAG.getNode(ISD::SHL, DL, IVT, Ext, DAG.getConstant(16, DL, IVT));
4544 if (!Op0VT.isVector())
4545 Shift = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Shift,
4546 DAG.getConstant(0, DL, MVT::i64));
4547 Shift = DAG.getBitcast(VT, Shift);
4548 return IsStrict ? DAG.getMergeValues({Shift, Op.getOperand(0)}, DL)
4549 : Shift;
4550 }
4551 return SDValue();
4552 }
4553
4554 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4555 return SDValue();
4556}
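// Worked example of the scalar bf16 -> f32 path above: bf16 1.0 is the bit
// pattern 0x3f80. After the ANY_EXTEND and the left shift by 16 the value is
// 0x3f800000, which is exactly the IEEE-754 single-precision encoding of 1.0,
// so no arithmetic conversion is needed beyond the shift.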
4557
4558SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4559 SelectionDAG &DAG) const {
4560 EVT VT = Op.getValueType();
4561 bool IsStrict = Op->isStrictFPOpcode();
4562 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4563 EVT SrcVT = SrcVal.getValueType();
4564 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4565
4566 if (VT.isScalableVector()) {
4567 if (VT.getScalarType() != MVT::bf16)
4568 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4569
4570 SDLoc DL(Op);
4571 constexpr EVT I32 = MVT::nxv4i32;
4572 auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };
4573
4574 SDValue NaN;
4575 SDValue Narrow;
4576
4577 if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4578 if (Subtarget->hasBF16())
4579 return LowerToPredicatedOp(Op, DAG,
4580 AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4581
4582 Narrow = getSVESafeBitCast(I32, SrcVal, DAG);
4583
4584 // Set the quiet bit.
4585 if (!DAG.isKnownNeverSNaN(SrcVal))
4586 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
4587 } else if (SrcVT == MVT::nxv2f64 &&
4588 (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4589 // Round to float without introducing rounding errors and try again.
4590 SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
4591 Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
4592 Pg, SrcVal, DAG.getUNDEF(MVT::nxv2f32));
4593
4595 if (IsStrict)
4596 NewOps.push_back(Op.getOperand(0));
4597 NewOps.push_back(Narrow);
4598 NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
4599 return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
4600 } else
4601 return SDValue();
4602
4603 if (!Trunc) {
4604 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4605 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
4606 SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
4607 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4608 }
4609
4610 // Don't round if we had a NaN; we don't want to turn 0x7fffffff into
4611 // 0x80000000.
4612 if (NaN) {
4613 EVT I1 = I32.changeElementType(MVT::i1);
4614 EVT CondVT = VT.changeElementType(MVT::i1);
4615 SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
4616 IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
4617 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4618 }
4619
4620 // Now that we have rounded, shift the bits into position.
4621 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4622 return getSVESafeBitCast(VT, Narrow, DAG);
4623 }
4624
4625 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4626 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4627
4628 // Expand cases where the result type is BF16 but we don't have hardware
4629 // instructions to lower it.
4630 if (VT.getScalarType() == MVT::bf16 &&
4631 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4632 Subtarget->hasBF16())) {
4633 SDLoc dl(Op);
4634 SDValue Narrow = SrcVal;
4635 SDValue NaN;
4636 EVT I32 = SrcVT.changeElementType(MVT::i32);
4637 EVT F32 = SrcVT.changeElementType(MVT::f32);
4638 if (SrcVT.getScalarType() == MVT::f32) {
4639 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4640 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4641 if (!NeverSNaN) {
4642 // Set the quiet bit.
4643 NaN = DAG.getNode(ISD::OR, dl, I32, Narrow,
4644 DAG.getConstant(0x400000, dl, I32));
4645 }
4646 } else if (SrcVT.getScalarType() == MVT::f64) {
4647 Narrow = DAG.getNode(AArch64ISD::FCVTXN, dl, F32, Narrow);
4648 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4649 } else {
4650 return SDValue();
4651 }
4652 if (!Trunc) {
4653 SDValue One = DAG.getConstant(1, dl, I32);
4654 SDValue Lsb = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4655 DAG.getShiftAmountConstant(16, I32, dl));
4656 Lsb = DAG.getNode(ISD::AND, dl, I32, Lsb, One);
4657 SDValue RoundingBias =
4658 DAG.getNode(ISD::ADD, dl, I32, DAG.getConstant(0x7fff, dl, I32), Lsb);
4659 Narrow = DAG.getNode(ISD::ADD, dl, I32, Narrow, RoundingBias);
4660 }
4661
4662 // Don't round if we had a NaN; we don't want to turn 0x7fffffff into
4663 // 0x80000000.
4664 if (NaN) {
4665 SDValue IsNaN = DAG.getSetCC(
4666 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4667 SrcVal, SrcVal, ISD::SETUO);
4668 Narrow = DAG.getSelect(dl, I32, IsNaN, NaN, Narrow);
4669 }
4670
4671 // Now that we have rounded, shift the bits into position.
4672 Narrow = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4673 DAG.getShiftAmountConstant(16, I32, dl));
4674 if (VT.isVector()) {
4675 EVT I16 = I32.changeVectorElementType(MVT::i16);
4676 Narrow = DAG.getNode(ISD::TRUNCATE, dl, I16, Narrow);
4677 return DAG.getNode(ISD::BITCAST, dl, VT, Narrow);
4678 }
4679 Narrow = DAG.getNode(ISD::BITCAST, dl, F32, Narrow);
4680 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
4681 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, dl)
4682 : Result;
4683 }
4684
4685 if (SrcVT != MVT::f128) {
4686 // Expand cases where the input is a vector bigger than NEON.
4687 if (useSVEForFixedLengthVectorVT(SrcVT))
4688 return SDValue();
4689
4690 // It's legal except when f128 is involved
4691 return Op;
4692 }
4693
4694 return SDValue();
4695}
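// Worked example of the f32 -> bf16 rounding bias used above (non-truncating
// case): for input bits B the result is (B + ((B >> 16) & 1) + 0x7fff) >> 16.
//   B = 0x3f808000 (halfway between bf16 0x3f80 and 0x3f81): lsb = 0, so the
//     bias is 0x7fff and (B + 0x7fff) >> 16 = 0x3f80, i.e. ties go to even.
//   B = 0x3f818000 (halfway between 0x3f81 and 0x3f82): lsb = 1, so the bias
//     is 0x8000 and (B + 0x8000) >> 16 = 0x3f82, again the even neighbour.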
4696
4697SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4698 SelectionDAG &DAG) const {
4699 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4700 // Any additional optimization in this function should be recorded
4701 // in the cost tables.
4702 bool IsStrict = Op->isStrictFPOpcode();
4703 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4704 EVT VT = Op.getValueType();
4705
4706 if (VT.isScalableVector()) {
4707 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4708 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4709 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4710 return LowerToPredicatedOp(Op, DAG, Opcode);
4711 }
4712
4713 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4714 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4715 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4716
4717 unsigned NumElts = InVT.getVectorNumElements();
4718
4719 // f16 conversions are promoted to f32 when full fp16 is not supported.
4720 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4721 InVT.getVectorElementType() == MVT::bf16) {
4722 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4723 SDLoc dl(Op);
4724 if (IsStrict) {
4725 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4726 {Op.getOperand(0), Op.getOperand(1)});
4727 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4728 {Ext.getValue(1), Ext.getValue(0)});
4729 }
4730 return DAG.getNode(
4731 Op.getOpcode(), dl, Op.getValueType(),
4732 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
4733 }
4734
4735 uint64_t VTSize = VT.getFixedSizeInBits();
4736 uint64_t InVTSize = InVT.getFixedSizeInBits();
4737 if (VTSize < InVTSize) {
4738 SDLoc dl(Op);
4739 if (IsStrict) {
4740 InVT = InVT.changeVectorElementTypeToInteger();
4741 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4742 {Op.getOperand(0), Op.getOperand(1)});
4743 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4744 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
4745 }
4746 SDValue Cv =
4747 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
4748 Op.getOperand(0));
4749 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4750 }
4751
4752 if (VTSize > InVTSize) {
4753 SDLoc dl(Op);
4754 MVT ExtVT =
4755 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
4756 VT.getVectorNumElements());
4757 if (IsStrict) {
4758 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4759 {Op.getOperand(0), Op.getOperand(1)});
4760 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4761 {Ext.getValue(1), Ext.getValue(0)});
4762 }
4763 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
4764 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
4765 }
4766
4767 // Use a scalar operation for conversions between single-element vectors of
4768 // the same size.
4769 if (NumElts == 1) {
4770 SDLoc dl(Op);
4771 SDValue Extract = DAG.getNode(
4772 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4773 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4774 EVT ScalarVT = VT.getScalarType();
4775 if (IsStrict)
4776 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4777 {Op.getOperand(0), Extract});
4778 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4779 }
4780
4781 // Remaining conversions keep the element count and size, so they are legal.
4782 return Op;
4783}
4784
4785SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4786 SelectionDAG &DAG) const {
4787 bool IsStrict = Op->isStrictFPOpcode();
4788 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4789
4790 if (SrcVal.getValueType().isVector())
4791 return LowerVectorFP_TO_INT(Op, DAG);
4792
4793 // f16 conversions are promoted to f32 when full fp16 is not supported.
4794 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4795 SrcVal.getValueType() == MVT::bf16) {
4796 SDLoc dl(Op);
4797 if (IsStrict) {
4798 SDValue Ext =
4799 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4800 {Op.getOperand(0), SrcVal});
4801 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4802 {Ext.getValue(1), Ext.getValue(0)});
4803 }
4804 return DAG.getNode(
4805 Op.getOpcode(), dl, Op.getValueType(),
4806 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4807 }
4808
4809 if (SrcVal.getValueType() != MVT::f128) {
4810 // It's legal except when f128 is involved
4811 return Op;
4812 }
4813
4814 return SDValue();
4815}
4816
4817SDValue
4818AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4819 SelectionDAG &DAG) const {
4820 // AArch64 FP-to-int conversions saturate to the destination element size, so
4821 // we can lower common saturating conversions to simple instructions.
4822 SDValue SrcVal = Op.getOperand(0);
4823 EVT SrcVT = SrcVal.getValueType();
4824 EVT DstVT = Op.getValueType();
4825 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4826
4827 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4828 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4829 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4830 assert(SatWidth <= DstElementWidth &&
4831 "Saturation width cannot exceed result width");
4832
4833 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4834 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4835 // types, so this is hard to reach.
4836 if (DstVT.isScalableVector())
4837 return SDValue();
4838
4839 EVT SrcElementVT = SrcVT.getVectorElementType();
4840
4841 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4842 SDLoc DL(Op);
4843 SDValue SrcVal2;
4844 if ((SrcElementVT == MVT::f16 &&
4845 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4846 SrcElementVT == MVT::bf16) {
4847 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4848 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
4849 // If we are extending to a v8f32, split into two v4f32 to produce legal
4850 // types.
4851 if (F32VT.getSizeInBits() > 128) {
4852 std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
4853 F32VT = F32VT.getHalfNumVectorElementsVT();
4854 }
4855 SrcVT = F32VT;
4856 SrcElementVT = MVT::f32;
4857 SrcElementWidth = 32;
4858 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4859 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4860 return SDValue();
4861
4862 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
4863 // width and produce a fcvtzu.
4864 if (SatWidth == 64 && SrcElementWidth < 64) {
4865 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
4866 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
4867 SrcVT = F64VT;
4868 SrcElementVT = MVT::f64;
4869 SrcElementWidth = 64;
4870 }
4871 // Cases that we can emit directly.
4872 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
4873 SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4874 DAG.getValueType(DstVT.getScalarType()));
4875 if (SrcVal2) {
4876 SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
4877 DAG.getValueType(DstVT.getScalarType()));
4878 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
4879 }
4880 return Res;
4881 }
4882
4883 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4884 // result. This is only valid if the legal cvt is larger than the saturate
4885 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4886 // (at least until sqxtn is selected).
4887 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4888 return SDValue();
4889
4890 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4891 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4892 DAG.getValueType(IntVT.getScalarType()));
4893 SDValue NativeCvt2 =
4894 SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
4895 DAG.getValueType(IntVT.getScalarType()))
4896 : SDValue();
4897 SDValue Sat, Sat2;
4898 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4899 SDValue MinC = DAG.getConstant(
4900 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4901 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4902 SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4903 SDValue MaxC = DAG.getConstant(
4904 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4905 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4906 Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
4907 } else {
4908 SDValue MinC = DAG.getConstant(
4909 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4910 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4911 Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4912 }
4913
4914 if (SrcVal2)
4915 Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
4916 IntVT.getDoubleNumVectorElementsVT(*DAG.getContext()),
4917 Sat, Sat2);
4918
4919 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4920}
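// For example, llvm.fptosi.sat.v4i16.v4f32 takes the "saturate to a higher
// bitwidth" path above: the input is converted with a saturating fcvtzs to
// v4i32, clamped to [-32768, 32767] with SMIN/SMAX, and finally truncated to
// v4i16.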
4921
4922SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4923 SelectionDAG &DAG) const {
4924 // AArch64 FP-to-int conversions saturate to the destination register size, so
4925 // we can lower common saturating conversions to simple instructions.
4926 SDValue SrcVal = Op.getOperand(0);
4927 EVT SrcVT = SrcVal.getValueType();
4928
4929 if (SrcVT.isVector())
4930 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4931
4932 EVT DstVT = Op.getValueType();
4933 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4934 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4935 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4936 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4937
4938 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4939 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4940 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4941 SrcVT = MVT::f32;
4942 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4943 SrcVT != MVT::bf16)
4944 return SDValue();
4945
4946 SDLoc DL(Op);
4947 // Cases that we can emit directly.
4948 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4949 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4950 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4951 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4952 DAG.getValueType(DstVT));
4953
4954 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4955 // result. This is only valid if the legal cvt is larger than the saturate
4956 // width.
4957 if (DstWidth < SatWidth)
4958 return SDValue();
4959
4960 SDValue NativeCvt =
4961 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4962 SDValue Sat;
4963 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4964 SDValue MinC = DAG.getConstant(
4965 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4966 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4967 SDValue MaxC = DAG.getConstant(
4968 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4969 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4970 } else {
4971 SDValue MinC = DAG.getConstant(
4972 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4973 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4974 }
4975
4976 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4977}
4978
4979SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
4980 SelectionDAG &DAG) const {
4981 EVT VT = Op.getValueType();
4982 SDValue Src = Op.getOperand(0);
4983 SDLoc DL(Op);
4984
4985 assert(VT.isVector() && "Expected vector type");
4986
4987 EVT CastVT =
4988 VT.changeVectorElementType(Src.getValueType().getVectorElementType());
4989
4990 // Round the floating-point value into a floating-point register with the
4991 // current rounding mode.
4992 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
4993
4994 // Truncate the rounded floating point to an integer.
4995 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
4996 DAG.getValueType(VT.getVectorElementType()));
4997}
4998
4999SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
5000 SelectionDAG &DAG) const {
5001 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
5002 // Any additional optimization in this function should be recorded
5003 // in the cost tables.
5004 bool IsStrict = Op->isStrictFPOpcode();
5005 EVT VT = Op.getValueType();
5006 SDLoc dl(Op);
5007 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
5008 EVT InVT = In.getValueType();
5009 unsigned Opc = Op.getOpcode();
5010 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
5011
5012 if (VT.isScalableVector()) {
5013 if (InVT.getVectorElementType() == MVT::i1) {
5014 // We can't directly extend an SVE predicate; extend it first.
5015 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5016 EVT CastVT = getPromotedVTForPredicate(InVT);
5017 In = DAG.getNode(CastOpc, dl, CastVT, In);
5018 return DAG.getNode(Opc, dl, VT, In);
5019 }
5020
5021 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
5022 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
5023 return LowerToPredicatedOp(Op, DAG, Opcode);
5024 }
5025
5026 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
5027 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
5028 return LowerFixedLengthIntToFPToSVE(Op, DAG);
5029
5030 // Promote bf16 conversions to f32.
5031 if (VT.getVectorElementType() == MVT::bf16) {
5032 EVT F32 = VT.changeElementType(MVT::f32);
5033 if (IsStrict) {
5034 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {F32, MVT::Other},
5035 {Op.getOperand(0), In});
5036 return DAG.getNode(ISD::STRICT_FP_ROUND, dl,
5037 {Op.getValueType(), MVT::Other},
5038 {Val.getValue(1), Val.getValue(0),
5039 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
5040 }
5041 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
5042 DAG.getNode(Op.getOpcode(), dl, F32, In),
5043 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
5044 }
5045
5046 uint64_t VTSize = VT.getFixedSizeInBits();
5047 uint64_t InVTSize = InVT.getFixedSizeInBits();
5048 if (VTSize < InVTSize) {
5049 MVT CastVT =
5050 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
5051 InVT.getVectorNumElements());
5052 if (IsStrict) {
5053 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
5054 {Op.getOperand(0), In});
5055 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
5056 {In.getValue(1), In.getValue(0),
5057 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
5058 }
5059 In = DAG.getNode(Opc, dl, CastVT, In);
5060 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
5061 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
5062 }
5063
5064 if (VTSize > InVTSize) {
5065 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5066 EVT CastVT = VT.changeVectorElementTypeToInteger();
5067 In = DAG.getNode(CastOpc, dl, CastVT, In);
5068 if (IsStrict)
5069 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
5070 return DAG.getNode(Opc, dl, VT, In);
5071 }
5072
5073 // Use a scalar operation for conversions between single-element vectors of
5074 // the same size.
5075 if (VT.getVectorNumElements() == 1) {
5076 SDValue Extract = DAG.getNode(
5077 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
5078 In, DAG.getConstant(0, dl, MVT::i64));
5079 EVT ScalarVT = VT.getScalarType();
5080 if (IsStrict)
5081 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
5082 {Op.getOperand(0), Extract});
5083 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
5084 }
5085
5086 return Op;
5087}
5088
5089SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
5090 SelectionDAG &DAG) const {
5091 if (Op.getValueType().isVector())
5092 return LowerVectorINT_TO_FP(Op, DAG);
5093
5094 bool IsStrict = Op->isStrictFPOpcode();
5095 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5096
5097 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
5098 Op->getOpcode() == ISD::SINT_TO_FP;
5099
5100 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
5101 SDLoc dl(Op);
5102 if (IsStrict) {
5103 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
5104 {Op.getOperand(0), SrcVal});
5105 return DAG.getNode(ISD::STRICT_FP_ROUND, dl,
5106 {Op.getValueType(), MVT::Other},
5107 {Val.getValue(1), Val.getValue(0),
5108 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
5109 }
5110 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
5111 DAG.getNode(Op.getOpcode(), dl, PromoteVT, SrcVal),
5112 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
5113 };
5114
5115 if (Op.getValueType() == MVT::bf16) {
5116 unsigned MaxWidth = IsSigned
5117 ? DAG.ComputeMaxSignificantBits(SrcVal)
5118 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
5119 // bf16 conversions are promoted to f32 when converting from i16.
5120 if (MaxWidth <= 24) {
5121 return IntToFpViaPromotion(MVT::f32);
5122 }
5123
5124 // bf16 conversions are promoted to f64 when converting from i32.
5125 if (MaxWidth <= 53) {
5126 return IntToFpViaPromotion(MVT::f64);
5127 }
5128
5129 // We need to be careful about i64 -> bf16.
5130 // Consider an i32 22216703.
5131 // This number cannot be represented exactly as an f32, so an itofp will
5132 // turn it into 22216704.0, and an fptrunc to bf16 will then turn this into
5133 // 22282240.0. However, the correct bf16 result was supposed to be 22151168.0.
5134 // We need to use sticky rounding to get this correct.
5135 if (SrcVal.getValueType() == MVT::i64) {
5136 SDLoc DL(Op);
5137 // This algorithm is equivalent to the following:
5138 // uint64_t SrcHi = SrcVal & ~0xfffull;
5139 // uint64_t SrcLo = SrcVal & 0xfffull;
5140 // uint64_t Highest = SrcVal >> 53;
5141 // bool HasHighest = Highest != 0;
5142 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
5143 // double Rounded = static_cast<double>(ToRound);
5144 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
5145 // uint64_t HasLo = SrcLo != 0;
5146 // bool NeedsAdjustment = HasHighest & HasLo;
5147 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
5148 // double Adjusted = std::bit_cast<double>(AdjustedBits);
5149 // return static_cast<__bf16>(Adjusted);
5150 //
5151 // Essentially, what happens is that SrcVal either fits perfectly in a
5152 // double-precision value or it is too big. If it is sufficiently small,
5153 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
5154 // ensure that u64 -> double has no rounding error by only using the 52
5155 // MSB of the input. The low order bits will get merged into a sticky bit
5156 // which will avoid issues incurred by double rounding.
5157
5158 // Signed conversion is more or less like so:
5159 // copysign((__bf16)abs(SrcVal), SrcVal)
5160 SDValue SignBit;
5161 if (IsSigned) {
5162 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5163 DAG.getConstant(1ull << 63, DL, MVT::i64));
5164 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
5165 }
5166 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5167 DAG.getConstant(~0xfffull, DL, MVT::i64));
5168 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5169 DAG.getConstant(0xfffull, DL, MVT::i64));
5170 SDValue Highest =
5171 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
5172 DAG.getShiftAmountConstant(53, MVT::i64, DL));
5173 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
5174 SDValue ToRound =
5175 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
5176 SDValue Rounded =
5177 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
5178 {Op.getOperand(0), ToRound})
5179 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
5180
5181 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
5182 if (SignBit) {
5183 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
5184 }
5185
5186 SDValue HasHighest = DAG.getSetCC(
5187 DL,
5188 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5189 Highest, Zero64, ISD::SETNE);
5190
5191 SDValue HasLo = DAG.getSetCC(
5192 DL,
5193 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5194 SrcLo, Zero64, ISD::SETNE);
5195
5196 SDValue NeedsAdjustment =
5197 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
5198 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
5199
5200 SDValue AdjustedBits =
5201 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
5202 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
5203 return IsStrict
5204 ? DAG.getNode(
5205 ISD::STRICT_FP_ROUND, DL,
5206 {Op.getValueType(), MVT::Other},
5207 {Rounded.getValue(1), Adjusted,
5208 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
5209 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
5210 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5211 }
5212 }
5213
5214 // f16 conversions are promoted to f32 when full fp16 is not supported.
5215 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5216 return IntToFpViaPromotion(MVT::f32);
5217 }
5218
5219 // i128 conversions are libcalls.
5220 if (SrcVal.getValueType() == MVT::i128)
5221 return SDValue();
5222
5223 // Other conversions are legal, unless it's to the completely software-based
5224 // fp128.
5225 if (Op.getValueType() != MVT::f128)
5226 return Op;
5227 return SDValue();
5228}
5229
5230SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
5231 SelectionDAG &DAG) const {
5232 // For iOS, we want to call an alternative entry point: __sincos_stret,
5233 // which returns the values in two S / D registers.
5234 SDLoc dl(Op);
5235 SDValue Arg = Op.getOperand(0);
5236 EVT ArgVT = Arg.getValueType();
5237 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
5238
5239 ArgListTy Args;
5240 ArgListEntry Entry;
5241
5242 Entry.Node = Arg;
5243 Entry.Ty = ArgTy;
5244 Entry.IsSExt = false;
5245 Entry.IsZExt = false;
5246 Args.push_back(Entry);
5247
5248 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
5249 : RTLIB::SINCOS_STRET_F32;
5250 const char *LibcallName = getLibcallName(LC);
5251 SDValue Callee =
5252 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
5253
5254 StructType *RetTy = StructType::get(ArgTy, ArgTy);
5255 TargetLowering::CallLoweringInfo CLI(DAG);
5256 CLI.setDebugLoc(dl)
5257 .setChain(DAG.getEntryNode())
5258 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
5259
5260 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5261 return CallResult.first;
5262}
5263
5264static MVT getSVEContainerType(EVT ContentTy);
5265
5266SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5267 SelectionDAG &DAG) const {
5268 EVT OpVT = Op.getValueType();
5269 EVT ArgVT = Op.getOperand(0).getValueType();
5270
5271 if (useSVEForFixedLengthVectorVT(OpVT))
5272 return LowerFixedLengthBitcastToSVE(Op, DAG);
5273
5274 if (OpVT.isScalableVector()) {
5275 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5276
5277 // Handle type legalisation first.
5278 if (!isTypeLegal(ArgVT)) {
5279 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5280 "Expected int->fp bitcast!");
5281
5282 // Bitcasting between unpacked vector types of different element counts is
5283 // not a NOP because the live elements are laid out differently.
5284 // 01234567
5285 // e.g. nxv2i32 = XX??XX??
5286 // nxv4f16 = X?X?X?X?
5287 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5288 return SDValue();
5289
5290 SDValue ExtResult =
5291 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
5292 Op.getOperand(0));
5293 return getSVESafeBitCast(OpVT, ExtResult, DAG);
5294 }
5295
5296 // Bitcasts between legal types with the same element count are legal.
5297 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5298 return Op;
5299
5300 // getSVESafeBitCast does not support casting between unpacked types.
5301 if (!isPackedVectorType(OpVT, DAG))
5302 return SDValue();
5303
5304 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
5305 }
5306
5307 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5308 return SDValue();
5309
5310 // Bitcasts between f16 and bf16 are legal.
5311 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5312 return Op;
5313
5314 assert(ArgVT == MVT::i16);
5315 SDLoc DL(Op);
5316
5317 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
5318 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
5319 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
5320}
5321
5322// Returns lane if Op extracts from a two-element vector and lane is constant
5323// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5324static std::optional<uint64_t>
5325getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
5326 SDNode *OpNode = Op.getNode();
5327 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5328 return std::nullopt;
5329
5330 EVT VT = OpNode->getOperand(0).getValueType();
5331 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
5332 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5333 return std::nullopt;
5334
5335 return C->getZExtValue();
5336}
5337
5338static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
5339 bool isSigned) {
5340 EVT VT = N.getValueType();
5341
5342 if (N.getOpcode() != ISD::BUILD_VECTOR)
5343 return false;
5344
5345 for (const SDValue &Elt : N->op_values()) {
5346 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
5347 unsigned EltSize = VT.getScalarSizeInBits();
5348 unsigned HalfSize = EltSize / 2;
5349 if (isSigned) {
5350 if (!isIntN(HalfSize, C->getSExtValue()))
5351 return false;
5352 } else {
5353 if (!isUIntN(HalfSize, C->getZExtValue()))
5354 return false;
5355 }
5356 continue;
5357 }
5358 return false;
5359 }
5360
5361 return true;
5362}
5363
5364static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
5365 EVT VT = N.getValueType();
5366 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5367 EVT HalfVT = EVT::getVectorVT(
5368 *DAG.getContext(),
5369 VT.getVectorElementType().getHalfSizedIntegerVT(*DAG.getContext()),
5370 VT.getVectorNumElements());
5371 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
5372}
5373
5374static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
5375 return N.getOpcode() == ISD::SIGN_EXTEND ||
5376 N.getOpcode() == ISD::ANY_EXTEND ||
5377 isExtendedBUILD_VECTOR(N, DAG, true);
5378}
5379
5380static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
5381 return N.getOpcode() == ISD::ZERO_EXTEND ||
5382 N.getOpcode() == ISD::ANY_EXTEND ||
5383 isExtendedBUILD_VECTOR(N, DAG, false);
5384}
5385
5386static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
5387 unsigned Opcode = N.getOpcode();
5388 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5389 SDValue N0 = N.getOperand(0);
5390 SDValue N1 = N.getOperand(1);
5391 return N0->hasOneUse() && N1->hasOneUse() &&
5392 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5393 }
5394 return false;
5395}
5396
5397static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
5398 unsigned Opcode = N.getOpcode();
5399 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5400 SDValue N0 = N.getOperand(0);
5401 SDValue N1 = N.getOperand(1);
5402 return N0->hasOneUse() && N1->hasOneUse() &&
5403 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5404 }
5405 return false;
5406}
5407
5408SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5409 SelectionDAG &DAG) const {
5410 // The rounding mode is in bits 23:22 of the FPCR.
5411 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
5412 // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
5413 // so that the shift + and get folded into a bitfield extract.
5414 SDLoc dl(Op);
5415
5416 SDValue Chain = Op.getOperand(0);
5417 SDValue FPCR_64 = DAG.getNode(
5418 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
5419 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
5420 Chain = FPCR_64.getValue(1);
5421 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
5422 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
5423 DAG.getConstant(1U << 22, dl, MVT::i32));
5424 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
5425 DAG.getConstant(22, dl, MVT::i32));
5426 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
5427 DAG.getConstant(3, dl, MVT::i32));
5428 return DAG.getMergeValues({AND, Chain}, dl);
5429}
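// Worked example of the GET_ROUNDING formula above: with FPCR.RMode == 0b11
// (round toward zero) bits 23:22 contribute 0xc00000, and
// ((0xc00000 + 0x400000) >> 22) & 3 == 0, the FLT_ROUNDS value for "toward
// zero"; with RMode == 0b00 (round to nearest) the same expression yields 1.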
5430
5431SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5432 SelectionDAG &DAG) const {
5433 SDLoc DL(Op);
5434 SDValue Chain = Op->getOperand(0);
5435 SDValue RMValue = Op->getOperand(1);
5436
5437 // The rounding mode is in bits 23:22 of the FPCR.
5438 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
5439 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5440 // ((arg - 1) & 3) << 22.
5441 //
5442 // The argument of llvm.set.rounding must be within the segment [0, 3], so
5443 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5444 // code that generates llvm.set.rounding to ensure this condition.
5445
5446 // Calculate new value of FPCR[23:22].
5447 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5448 DAG.getConstant(1, DL, MVT::i32));
5449 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5450 DAG.getConstant(0x3, DL, MVT::i32));
5451 RMValue =
5452 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5453 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5454 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5455
5456 // Get current value of FPCR.
5457 SDValue Ops[] = {
5458 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5459 SDValue FPCR =
5460 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5461 Chain = FPCR.getValue(1);
5462 FPCR = FPCR.getValue(0);
5463
5464 // Put the new rounding mode into FPCR[23:22].
5465 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5466 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5467 DAG.getConstant(RMMask, DL, MVT::i64));
5468 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5469 SDValue Ops2[] = {
5470 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5471 FPCR};
5472 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5473}
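// Worked example of the SET_ROUNDING mapping above: llvm.set.rounding(0)
// (toward zero) computes ((0 - 1) & 3) == 3, i.e. FPCR.RMode == 0b11, while
// llvm.set.rounding(1) (to nearest) computes 0, i.e. RMode == 0b00, the
// inverse of the GET_ROUNDING mapping.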
5474
5475SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5476 SelectionDAG &DAG) const {
5477 SDLoc DL(Op);
5478 SDValue Chain = Op->getOperand(0);
5479
5480 // Get current value of FPCR.
5481 SDValue Ops[] = {
5482 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5483 SDValue FPCR =
5484 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5485 Chain = FPCR.getValue(1);
5486 FPCR = FPCR.getValue(0);
5487
5488 // Truncate FPCR to 32 bits.
5489 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5490
5491 return DAG.getMergeValues({Result, Chain}, DL);
5492}
5493
5494SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5495 SelectionDAG &DAG) const {
5496 SDLoc DL(Op);
5497 SDValue Chain = Op->getOperand(0);
5498 SDValue Mode = Op->getOperand(1);
5499
5500 // Extend the specified value to 64 bits.
5501 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5502
5503 // Set new value of FPCR.
5504 SDValue Ops2[] = {
5505 Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
5506 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5507}
5508
5509SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5510 SelectionDAG &DAG) const {
5511 SDLoc DL(Op);
5512 SDValue Chain = Op->getOperand(0);
5513
5514 // Get current value of FPCR.
5515 SDValue Ops[] = {
5516 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5517 SDValue FPCR =
5518 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5519 Chain = FPCR.getValue(1);
5520 FPCR = FPCR.getValue(0);
5521
5522 // Clear bits that are not reserved.
5523 SDValue FPSCRMasked = DAG.getNode(
5524 ISD::AND, DL, MVT::i64, FPCR,
5525 DAG.getConstant(AArch64::ReservedFPControlBits, DL, MVT::i64));
5526
5527 // Set new value of FPCR.
5528 SDValue Ops2[] = {Chain,
5529 DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5530 FPSCRMasked};
5531 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5532}
5533
5534static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5535 SDLoc DL, bool &IsMLA) {
5536 bool IsN0SExt = isSignExtended(N0, DAG);
5537 bool IsN1SExt = isSignExtended(N1, DAG);
5538 if (IsN0SExt && IsN1SExt)
5539 return AArch64ISD::SMULL;
5540
5541 bool IsN0ZExt = isZeroExtended(N0, DAG);
5542 bool IsN1ZExt = isZeroExtended(N1, DAG);
5543
5544 if (IsN0ZExt && IsN1ZExt)
5545 return AArch64ISD::UMULL;
5546
5547 // Select UMULL if we can replace the other operand with an extend.
5548 EVT VT = N0.getValueType();
5549 unsigned EltSize = VT.getScalarSizeInBits();
5550 APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
5551 if (IsN0ZExt || IsN1ZExt) {
5552 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5553 return AArch64ISD::UMULL;
5554 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
5555 DAG.MaskedValueIsZero(N1, Mask)) {
5556 // For v2i64 we look more aggressively at both operands being zero, to avoid
5557 // scalarization.
5558 return AArch64ISD::UMULL;
5559 }
5560
5561 if (IsN0SExt || IsN1SExt) {
5562 if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
5563 return AArch64ISD::SMULL;
5564 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
5565 DAG.ComputeNumSignBits(N1) > EltSize / 2) {
5566 return AArch64ISD::SMULL;
5567 }
5568
5569 if (!IsN1SExt && !IsN1ZExt)
5570 return 0;
5571
5572 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5573 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5574 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5575 IsMLA = true;
5576 return AArch64ISD::SMULL;
5577 }
5578 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5579 IsMLA = true;
5580 return AArch64ISD::UMULL;
5581 }
5582 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5583 std::swap(N0, N1);
5584 IsMLA = true;
5585 return AArch64ISD::UMULL;
5586 }
5587 return 0;
5588}
5589
5590SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5591 EVT VT = Op.getValueType();
5592
5593 bool OverrideNEON = !Subtarget->isNeonAvailable();
5594 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5595 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5596
5597 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5598 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5599 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5600 "unexpected type for custom-lowering ISD::MUL");
5601 SDValue N0 = Op.getOperand(0);
5602 SDValue N1 = Op.getOperand(1);
5603 bool isMLA = false;
5604 EVT OVT = VT;
5605 if (VT.is64BitVector()) {
5606 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5607 isNullConstant(N0.getOperand(1)) &&
5608 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5609 isNullConstant(N1.getOperand(1))) {
5610 N0 = N0.getOperand(0);
5611 N1 = N1.getOperand(0);
5612 VT = N0.getValueType();
5613 } else {
5614 if (VT == MVT::v1i64) {
5615 if (Subtarget->hasSVE())
5616 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5617 // Fall through to expand this. It is not legal.
5618 return SDValue();
5619 } else
5620 // Other vector multiplications are legal.
5621 return Op;
5622 }
5623 }
5624
5625 SDLoc DL(Op);
5626 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5627
5628 if (!NewOpc) {
5629 if (VT.getVectorElementType() == MVT::i64) {
5630 // If SVE is available then i64 vector multiplications can also be made
5631 // legal.
5632 if (Subtarget->hasSVE())
5633 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5634 // Fall through to expand this. It is not legal.
5635 return SDValue();
5636 } else
5637 // Other vector multiplications are legal.
5638 return Op;
5639 }
5640
5641 // Legalize to a S/UMULL instruction
5642 SDValue Op0;
5643 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5644 if (!isMLA) {
5645 Op0 = skipExtensionForVectorMULL(N0, DAG);
5646 assert(Op0.getValueType().is64BitVector() &&
5647 Op1.getValueType().is64BitVector() &&
5648 "unexpected types for extended operands to VMULL");
5649 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5650 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5651 DAG.getConstant(0, DL, MVT::i64));
5652 }
5653 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5654 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5655 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57.
5656 SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5657 SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5658 EVT Op1VT = Op1.getValueType();
5659 return DAG.getNode(
5660 ISD::EXTRACT_SUBVECTOR, DL, OVT,
5661 DAG.getNode(N0.getOpcode(), DL, VT,
5662 DAG.getNode(NewOpc, DL, VT,
5663 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5664 DAG.getNode(NewOpc, DL, VT,
5665 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5666 DAG.getConstant(0, DL, MVT::i64));
5667}
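// For example, a v8i16 multiply whose operands are both zero-extended from
// v8i8 is selected as a single UMULL, and the MLA form above rewrites
// ((zext A) + (zext B)) * (zext C) into UMULL(A, C) + UMULL(B, C) so the add
// can later be matched as UMLAL.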
5668
5669static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5670 int Pattern) {
5671 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5672 return DAG.getConstant(1, DL, MVT::nxv1i1);
5673 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5674 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5675}
5676
5677static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG,
5678 bool IsSigned, bool IsEqual) {
5679 if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
5680 !isa<ConstantSDNode>(Op.getOperand(2)))
5681 return SDValue();
5682
5683 SDLoc dl(Op);
5684 APInt X = Op.getConstantOperandAPInt(1);
5685 APInt Y = Op.getConstantOperandAPInt(2);
5686
5687 // When the second operand is the maximum value, comparisons that include
5688 // equality can never fail and thus we can return an all active predicate.
5689 if (IsEqual)
5690 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
5691 return DAG.getConstant(1, dl, Op.getValueType());
5692
5693 bool Overflow;
5694 APInt NumActiveElems =
5695 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5696
5697 if (Overflow)
5698 return SDValue();
5699
5700 if (IsEqual) {
5701 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5702 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5703 : NumActiveElems.uadd_ov(One, Overflow);
5704 if (Overflow)
5705 return SDValue();
5706 }
5707
5708 std::optional<unsigned> PredPattern =
5709 getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5710 unsigned MinSVEVectorSize = std::max(
5711 DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
5712 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5713 if (PredPattern != std::nullopt &&
5714 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5715 return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
5716
5717 return SDValue();
5718}
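// For example, a while-style comparison with constant operands X == 0 and
// Y == 4 has NumActiveElems == 4; if the minimum SVE vector length guarantees
// at least four lanes of the result's element size, the operation is emitted
// as a PTRUE with pattern VL4 instead of a WHILE instruction.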
5719
5720// Returns a safe bitcast between two scalable vector predicates, where
5721// any newly created lanes from a widening bitcast are defined as zero.
5722static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5723 SDLoc DL(Op);
5724 EVT InVT = Op.getValueType();
5725
5726 assert(InVT.getVectorElementType() == MVT::i1 &&
5727 VT.getVectorElementType() == MVT::i1 &&
5728 "Expected a predicate-to-predicate bitcast");
5729 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5730 InVT.isScalableVector() &&
5731 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5732 "Only expect to cast between legal scalable predicate types!");
5733
5734 // Return the operand if the cast isn't changing type.
5735 if (InVT == VT)
5736 return Op;
5737
5738 // Look through casts to <vscale x 16 x i1> when their input has more lanes
5739 // than VT. This will increase the chances of removing casts that introduce
5740 // new lanes, which have to be explicitly zero'd.
5741 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
5742 Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
5743 Op.getOperand(1).getValueType().bitsGT(VT))
5744 Op = Op.getOperand(1);
5745
5746 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5747
5748 // We only have to zero the lanes if new lanes are being defined, e.g. when
5749 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5750 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5751 // we can return here.
5752 if (InVT.bitsGT(VT))
5753 return Reinterpret;
5754
5755 // Check if the other lanes are already known to be zeroed by
5756 // construction.
5757 if (isZeroingInactiveLanes(Op))
5758 return Reinterpret;
5759
5760 // Zero the newly introduced lanes.
5761 SDValue Mask = DAG.getConstant(1, DL, InVT);
5762 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5763 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5764}
5765
5766SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5767 SDValue Chain, SDLoc DL,
5768 EVT VT) const {
5769 SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
5770 getPointerTy(DAG.getDataLayout()));
5771 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5772 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5773 TargetLowering::CallLoweringInfo CLI(DAG);
5774 ArgListTy Args;
5775 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5776 CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
5777 RetTy, Callee, std::move(Args));
5778 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5779 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5780 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5781 Mask);
5782}
5783
5784// Lower an SME LDR/STR ZA intrinsic
5785// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5786// folded into the instruction
5787// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5788// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5789// and tile slice registers
5790// ldr(%tileslice, %ptr, %vecnum)
5791// ->
5792// %svl = rdsvl
5793// %ptr2 = %ptr + %svl * %vecnum
5794// %tileslice2 = %tileslice + %vecnum
5795// ldr [%tileslice2, 0], [%ptr2, 0]
5796// Case 3: If the vecnum is an immediate out of range, then the same is done as
5797// case 2, but the base and slice registers are modified by the greatest
5798// multiple of 16 not larger than the vecnum and the remainder is folded into the
5799// instruction. This means that successive loads and stores that are offset from
5800// each other can share the same base and slice register updates.
5801// ldr(%tileslice, %ptr, 22)
5802// ldr(%tileslice, %ptr, 23)
5803// ->
5804// %svl = rdsvl
5805// %ptr2 = %ptr + %svl * 16
5806// %tileslice2 = %tileslice + 16
5807// ldr [%tileslice2, 6], [%ptr2, 6]
5808// ldr [%tileslice2, 7], [%ptr2, 7]
5809// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5810// operand and the immediate can be folded into the instruction, like case 2.
5811// ldr(%tileslice, %ptr, %vecnum + 7)
5812// ldr(%tileslice, %ptr, %vecnum + 8)
5813// ->
5814// %svl = rdsvl
5815// %ptr2 = %ptr + %svl * %vecnum
5816// %tileslice2 = %tileslice + %vecnum
5817// ldr [%tileslice2, 7], [%ptr2, 7]
5818// ldr [%tileslice2, 8], [%ptr2, 8]
5819// Case 5: The vecnum being an add of an immediate out of range is also handled,
5820// in which case the same remainder logic as case 3 is used.
5821static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5822 SDLoc DL(N);
5823
5824 SDValue TileSlice = N->getOperand(2);
5825 SDValue Base = N->getOperand(3);
5826 SDValue VecNum = N->getOperand(4);
5827 int32_t ConstAddend = 0;
5828 SDValue VarAddend = VecNum;
5829
5830 // If the vnum is an add of an immediate, we can fold it into the instruction
5831 if (VecNum.getOpcode() == ISD::ADD &&
5832 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5833 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5834 VarAddend = VecNum.getOperand(0);
5835 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5836 ConstAddend = ImmNode->getSExtValue();
5837 VarAddend = SDValue();
5838 }
5839
5840 int32_t ImmAddend = ConstAddend % 16;
5841 if (int32_t C = (ConstAddend - ImmAddend)) {
5842 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5843 VarAddend = VarAddend
5844 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5845 : CVal;
5846 }
5847
5848 if (VarAddend) {
5849 // Get the vector length that will be multiplied by vnum
5850 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5851 DAG.getConstant(1, DL, MVT::i32));
5852
5853 // Multiply SVL and vnum then add it to the base
5854 SDValue Mul = DAG.getNode(
5855 ISD::MUL, DL, MVT::i64,
5856 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5857 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5858 // Just add vnum to the tileslice
5859 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5860 }
5861
5862 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5863 DL, MVT::Other,
5864 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5865 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5866}
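// Worked example of the remainder logic above: ldr(%tileslice, %ptr, 22) has a
// constant vecnum outside the 0-15 immediate range, so ImmAddend becomes
// 22 % 16 == 6 and the remaining 16 is added to both the tile slice and the
// base (scaled by RDSVL), producing ldr [%tileslice + 16, 6], [%ptr + %svl * 16, 6].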
5867
5868static SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG) {
5869 SDLoc dl(Op);
5870 SDValue ID =
5871 DAG.getTargetConstant(Intrinsic::aarch64_sve_match, dl, MVT::i64);
5872
5873 auto Op1 = Op.getOperand(1);
5874 auto Op2 = Op.getOperand(2);
5875 auto Mask = Op.getOperand(3);
5876
5877 EVT Op1VT = Op1.getValueType();
5878 EVT Op2VT = Op2.getValueType();
5879 EVT ResVT = Op.getValueType();
5880
5881 assert((Op1VT.getVectorElementType() == MVT::i8 ||
5882 Op1VT.getVectorElementType() == MVT::i16) &&
5883 "Expected 8-bit or 16-bit characters.");
5884
5885 // Scalable vector type used to wrap operands.
5886 // A single container is enough for both operands because ultimately the
5887 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
5888 EVT OpContainerVT = Op1VT.isScalableVector()
5889 ? Op1VT
5890 : getContainerForFixedLengthVector(DAG, Op1VT);
5891
5892 if (Op2VT.is128BitVector()) {
5893 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
5894 Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
5895 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
5896 if (ResVT.isScalableVector())
5897 Op2 = DAG.getNode(AArch64ISD::DUPLANE128, dl, OpContainerVT, Op2,
5898 DAG.getTargetConstant(0, dl, MVT::i64));
5899 } else {
5900 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
5901 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
5902 MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
5903 EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
5904 Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
5905 Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op2IntVT, Op2,
5906 DAG.getConstant(0, dl, MVT::i64));
5907 Op2 = DAG.getSplatVector(Op2PromotedVT, dl, Op2);
5908 Op2 = DAG.getBitcast(OpContainerVT, Op2);
5909 }
5910
5911 // If the result is scalable, we just need to carry out the MATCH.
5912 if (ResVT.isScalableVector())
5913 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResVT, ID, Mask, Op1, Op2);
5914
5915 // If the result is fixed, we can still use MATCH but we need to wrap the
5916 // first operand and the mask in scalable vectors before doing so.
5917
5918 // Wrap the operands.
5919 Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
5920 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, Op1VT, Mask);
5921 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5922
5923 // Carry out the match.
5924 SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Mask.getValueType(),
5925 ID, Mask, Op1, Op2);
5926
5927 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
5928 // (v16i8/v8i8).
5929 Match = DAG.getNode(ISD::SIGN_EXTEND, dl, OpContainerVT, Match);
5930 Match = convertFromScalableVector(DAG, Op1VT, Match);
5931 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, Match);
5932}
5933
5934SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5935 SelectionDAG &DAG) const {
5936 unsigned IntNo = Op.getConstantOperandVal(1);
5937 SDLoc DL(Op);
5938 switch (IntNo) {
5939 default:
5940 return SDValue(); // Don't custom lower most intrinsics.
5941 case Intrinsic::aarch64_prefetch: {
5942 SDValue Chain = Op.getOperand(0);
5943 SDValue Addr = Op.getOperand(2);
5944
5945 unsigned IsWrite = Op.getConstantOperandVal(3);
5946 unsigned Locality = Op.getConstantOperandVal(4);
5947 unsigned IsStream = Op.getConstantOperandVal(5);
5948 unsigned IsData = Op.getConstantOperandVal(6);
5949 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5950 (!IsData << 3) | // IsDataCache bit
5951 (Locality << 1) | // Cache level bits
5952 (unsigned)IsStream; // Stream bit
5953
5954 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5955 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5956 }
5957 case Intrinsic::aarch64_sme_str:
5958 case Intrinsic::aarch64_sme_ldr: {
5959 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5960 }
5961 case Intrinsic::aarch64_sme_za_enable:
5962 return DAG.getNode(
5963 AArch64ISD::SMSTART, DL, MVT::Other,
5964 Op->getOperand(0), // Chain
5965 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5966 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5967 case Intrinsic::aarch64_sme_za_disable:
5968 return DAG.getNode(
5969 AArch64ISD::SMSTOP, DL, MVT::Other,
5970 Op->getOperand(0), // Chain
5971 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5972 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5973 }
5974}
5975
5976SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5977 SelectionDAG &DAG) const {
5978 unsigned IntNo = Op.getConstantOperandVal(1);
5979 SDLoc DL(Op);
5980 switch (IntNo) {
5981 default:
5982 return SDValue(); // Don't custom lower most intrinsics.
5983 case Intrinsic::aarch64_mops_memset_tag: {
5984 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
5985 SDValue Chain = Node->getChain();
5986 SDValue Dst = Op.getOperand(2);
5987 SDValue Val = Op.getOperand(3);
5988 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5989 SDValue Size = Op.getOperand(4);
5990 auto Alignment = Node->getMemOperand()->getAlign();
5991 bool IsVol = Node->isVolatile();
5992 auto DstPtrInfo = Node->getPointerInfo();
5993
5994 const auto &SDI =
5995 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5996 SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
5997 Chain, Dst, Val, Size, Alignment, IsVol,
5998 DstPtrInfo, MachinePointerInfo{});
5999
6000 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6001 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6002 // LowerOperationWrapper will complain that the number of results has
6003 // changed.
6004 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6005 }
6006 }
6007}
6008
6009SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6010 SelectionDAG &DAG) const {
6011 unsigned IntNo = Op.getConstantOperandVal(0);
6012 SDLoc dl(Op);
6013 switch (IntNo) {
6014 default: return SDValue(); // Don't custom lower most intrinsics.
6015 case Intrinsic::thread_pointer: {
6016 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6017 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
6018 }
6019 case Intrinsic::aarch64_neon_abs: {
6020 EVT Ty = Op.getValueType();
6021 if (Ty == MVT::i64) {
6022 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
6023 Op.getOperand(1));
6024 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
6025 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
6026 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
6027 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
6028 } else {
6029 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
6030 }
6031 }
6032 case Intrinsic::aarch64_neon_pmull64: {
6033 SDValue LHS = Op.getOperand(1);
6034 SDValue RHS = Op.getOperand(2);
6035
6036 std::optional<uint64_t> LHSLane =
6037 getConstantLaneNumOfExtractHalfOperand(LHS);
6038 std::optional<uint64_t> RHSLane =
6039 getConstantLaneNumOfExtractHalfOperand(RHS);
6040
6041 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
6042 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
6043
6044 // 'aarch64_neon_pmull64' takes i64 parameters, while pmull/pmull2
6045 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
6046 // which ISel recognizes better. For example, generate a ldr into d*
6047 // registers as opposed to a GPR load followed by a fmov.
6048 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
6049 std::optional<uint64_t> OtherLane,
6050 const SDLoc &dl,
6051 SelectionDAG &DAG) -> SDValue {
6052 // If the operand is a higher half itself, rewrite it to
6053 // extract_high_v2i64; this way aarch64_neon_pmull64 could
6054 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
6055 if (NLane && *NLane == 1)
6056 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
6057 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
6058
6059 // Operand N is not a higher half but the other operand is.
6060 if (OtherLane && *OtherLane == 1) {
6061 // If this operand is a lower half, rewrite it to
6062 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
6063 // align lanes of two operands. A roundtrip sequence (to move from lane
6064 // 1 to lane 0) is like this:
6065 // mov x8, v0.d[1]
6066 // fmov d0, x8
6067 if (NLane && *NLane == 0)
6068 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
6069 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
6070 N.getOperand(0),
6071 DAG.getConstant(0, dl, MVT::i64)),
6072 DAG.getConstant(1, dl, MVT::i64));
6073
6074 // Otherwise just dup from main to all lanes.
6075 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
6076 }
6077
6078 // Neither operand is an extract of higher half, so codegen may just use
6079 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
6080 assert(N.getValueType() == MVT::i64 &&
6081 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
6082 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
6083 };
6084
6085 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
6086 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
6087
6088 return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
6089 }
6090 case Intrinsic::aarch64_neon_smax:
6091 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
6092 Op.getOperand(1), Op.getOperand(2));
6093 case Intrinsic::aarch64_neon_umax:
6094 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
6095 Op.getOperand(1), Op.getOperand(2));
6096 case Intrinsic::aarch64_neon_smin:
6097 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
6098 Op.getOperand(1), Op.getOperand(2));
6099 case Intrinsic::aarch64_neon_umin:
6100 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
6101 Op.getOperand(1), Op.getOperand(2));
6102 case Intrinsic::aarch64_neon_scalar_sqxtn:
6103 case Intrinsic::aarch64_neon_scalar_sqxtun:
6104 case Intrinsic::aarch64_neon_scalar_uqxtn: {
6105 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
6106 if (Op.getValueType() == MVT::i32)
6107 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
6108 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
6109 Op.getOperand(0),
6110 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
6111 Op.getOperand(1))));
6112 return SDValue();
6113 }
6114 case Intrinsic::aarch64_neon_sqxtn:
6115 return DAG.getNode(ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(),
6116 Op.getOperand(1));
6117 case Intrinsic::aarch64_neon_sqxtun:
6118 return DAG.getNode(ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(),
6119 Op.getOperand(1));
6120 case Intrinsic::aarch64_neon_uqxtn:
6121 return DAG.getNode(ISD::TRUNCATE_USAT_U, dl, Op.getValueType(),
6122 Op.getOperand(1));
6123 case Intrinsic::aarch64_neon_sqshrn:
6124 if (Op.getValueType().isVector())
6125 return DAG.getNode(ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(),
6126 DAG.getNode(AArch64ISD::VASHR, dl,
6127 Op.getOperand(1).getValueType(),
6128 Op.getOperand(1), Op.getOperand(2)));
6129 return SDValue();
6130 case Intrinsic::aarch64_neon_sqshrun:
6131 if (Op.getValueType().isVector())
6132 return DAG.getNode(ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(),
6133 DAG.getNode(AArch64ISD::VASHR, dl,
6134 Op.getOperand(1).getValueType(),
6135 Op.getOperand(1), Op.getOperand(2)));
6136 return SDValue();
6137 case Intrinsic::aarch64_neon_uqshrn:
6138 if (Op.getValueType().isVector())
6139 return DAG.getNode(ISD::TRUNCATE_USAT_U, dl, Op.getValueType(),
6140 DAG.getNode(AArch64ISD::VLSHR, dl,
6141 Op.getOperand(1).getValueType(),
6142 Op.getOperand(1), Op.getOperand(2)));
6143 return SDValue();
6144 case Intrinsic::aarch64_neon_sqrshrn:
6145 if (Op.getValueType().isVector())
6146 return DAG.getNode(
6147 ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(),
6148 DAG.getNode(
6149 AArch64ISD::SRSHR_I, dl, Op.getOperand(1).getValueType(),
6150 Op.getOperand(1), Op.getOperand(2)));
6151 return SDValue();
6152 case Intrinsic::aarch64_neon_sqrshrun:
6153 if (Op.getValueType().isVector())
6154 return DAG.getNode(
6155 ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(),
6156 DAG.getNode(
6157 AArch64ISD::SRSHR_I, dl, Op.getOperand(1).getValueType(),
6158 Op.getOperand(1), Op.getOperand(2)));
6159 return SDValue();
6160 case Intrinsic::aarch64_neon_uqrshrn:
6161 if (Op.getValueType().isVector())
6162 return DAG.getNode(
6163 ISD::TRUNCATE_USAT_U, dl, Op.getValueType(),
6164 DAG.getNode(
6165 AArch64ISD::URSHR_I, dl, Op.getOperand(1).getValueType(), Op.getOperand(1), Op.getOperand(2)));
6166 return SDValue();
6167 case Intrinsic::aarch64_sve_whilelo:
6168 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
6169 /*IsEqual=*/false);
6170 case Intrinsic::aarch64_sve_whilelt:
6171 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
6172 /*IsEqual=*/false);
6173 case Intrinsic::aarch64_sve_whilels:
6174 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
6175 /*IsEqual=*/true);
6176 case Intrinsic::aarch64_sve_whilele:
6177 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
6178 /*IsEqual=*/true);
6179 case Intrinsic::aarch64_sve_sunpkhi:
6180 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
6181 Op.getOperand(1));
6182 case Intrinsic::aarch64_sve_sunpklo:
6183 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
6184 Op.getOperand(1));
6185 case Intrinsic::aarch64_sve_uunpkhi:
6186 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
6187 Op.getOperand(1));
6188 case Intrinsic::aarch64_sve_uunpklo:
6189 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
6190 Op.getOperand(1));
6191 case Intrinsic::aarch64_sve_clasta_n:
6192 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
6193 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6194 case Intrinsic::aarch64_sve_clastb_n:
6195 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
6196 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6197 case Intrinsic::aarch64_sve_lasta:
6198 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
6199 Op.getOperand(1), Op.getOperand(2));
6200 case Intrinsic::aarch64_sve_lastb:
6201 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
6202 Op.getOperand(1), Op.getOperand(2));
6203 case Intrinsic::aarch64_sve_rev:
6204 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
6205 Op.getOperand(1));
6206 case Intrinsic::aarch64_sve_tbl:
6207 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
6208 Op.getOperand(1), Op.getOperand(2));
6209 case Intrinsic::aarch64_sve_trn1:
6210 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
6211 Op.getOperand(1), Op.getOperand(2));
6212 case Intrinsic::aarch64_sve_trn2:
6213 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
6214 Op.getOperand(1), Op.getOperand(2));
6215 case Intrinsic::aarch64_sve_uzp1:
6216 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
6217 Op.getOperand(1), Op.getOperand(2));
6218 case Intrinsic::aarch64_sve_uzp2:
6219 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
6220 Op.getOperand(1), Op.getOperand(2));
6221 case Intrinsic::aarch64_sve_zip1:
6222 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
6223 Op.getOperand(1), Op.getOperand(2));
6224 case Intrinsic::aarch64_sve_zip2:
6225 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
6226 Op.getOperand(1), Op.getOperand(2));
6227 case Intrinsic::aarch64_sve_splice:
6228 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
6229 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6230 case Intrinsic::aarch64_sve_ptrue:
6231 return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
6232 case Intrinsic::aarch64_sve_clz:
6233 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
6234 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6235 case Intrinsic::aarch64_sme_cntsb:
6236 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
6237 DAG.getConstant(1, dl, MVT::i32));
6238 case Intrinsic::aarch64_sme_cntsh: {
6239 SDValue One = DAG.getConstant(1, dl, MVT::i32);
6240 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
6241 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
6242 }
6243 case Intrinsic::aarch64_sme_cntsw: {
6244 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
6245 DAG.getConstant(1, dl, MVT::i32));
6246 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
6247 DAG.getConstant(2, dl, MVT::i32));
6248 }
6249 case Intrinsic::aarch64_sme_cntsd: {
6250 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
6251 DAG.getConstant(1, dl, MVT::i32));
6252 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
6253 DAG.getConstant(3, dl, MVT::i32));
6254 }
6255 case Intrinsic::aarch64_sve_cnt: {
6256 SDValue Data = Op.getOperand(3);
6257 // CTPOP only supports integer operands.
6258 if (Data.getValueType().isFloatingPoint())
6259 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
6260 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
6261 Op.getOperand(2), Data, Op.getOperand(1));
6262 }
6263 case Intrinsic::aarch64_sve_dupq_lane:
6264 return LowerDUPQLane(Op, DAG);
6265 case Intrinsic::aarch64_sve_convert_from_svbool:
6266 if (Op.getValueType() == MVT::aarch64svcount)
6267 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
6268 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
6269 case Intrinsic::aarch64_sve_convert_to_svbool:
6270 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
6271 return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
6272 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
6273 case Intrinsic::aarch64_sve_fneg:
6274 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
6275 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6276 case Intrinsic::aarch64_sve_frintp:
6277 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
6278 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6279 case Intrinsic::aarch64_sve_frintm:
6280 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
6281 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6282 case Intrinsic::aarch64_sve_frinti:
6283 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
6284 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6285 case Intrinsic::aarch64_sve_frintx:
6286 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
6287 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6288 case Intrinsic::aarch64_sve_frinta:
6289 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
6290 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6291 case Intrinsic::aarch64_sve_frintn:
6292 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
6293 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6294 case Intrinsic::aarch64_sve_frintz:
6295 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
6296 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6297 case Intrinsic::aarch64_sve_ucvtf:
6298 return DAG.getNode(AArch64ISD::UCVTF_MERGE_PASSTHRU, dl,
6299 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6300 Op.getOperand(1));
6301 case Intrinsic::aarch64_sve_scvtf:
6302 return DAG.getNode(AArch64ISD::SCVTF_MERGE_PASSTHRU, dl,
6303 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6304 Op.getOperand(1));
6305 case Intrinsic::aarch64_sve_fcvtzu:
6306 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
6307 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6308 Op.getOperand(1));
6309 case Intrinsic::aarch64_sve_fcvtzs:
6310 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
6311 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6312 Op.getOperand(1));
6313 case Intrinsic::aarch64_sve_fsqrt:
6314 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
6315 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6316 case Intrinsic::aarch64_sve_frecpx:
6317 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
6318 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6319 case Intrinsic::aarch64_sve_frecpe_x:
6320 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
6321 Op.getOperand(1));
6322 case Intrinsic::aarch64_sve_frecps_x:
6323 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
6324 Op.getOperand(1), Op.getOperand(2));
6325 case Intrinsic::aarch64_sve_frsqrte_x:
6326 return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
6327 Op.getOperand(1));
6328 case Intrinsic::aarch64_sve_frsqrts_x:
6329 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
6330 Op.getOperand(1), Op.getOperand(2));
6331 case Intrinsic::aarch64_sve_fabs:
6332 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
6333 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6334 case Intrinsic::aarch64_sve_abs:
6335 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
6336 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6337 case Intrinsic::aarch64_sve_neg:
6338 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
6339 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6340 case Intrinsic::aarch64_sve_insr: {
6341 SDValue Scalar = Op.getOperand(2);
6342 EVT ScalarTy = Scalar.getValueType();
6343 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6344 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
6345
6346 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
6347 Op.getOperand(1), Scalar);
6348 }
6349 case Intrinsic::aarch64_sve_rbit:
6350 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
6351 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6352 Op.getOperand(1));
6353 case Intrinsic::aarch64_sve_revb:
6354 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
6355 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6356 case Intrinsic::aarch64_sve_revh:
6357 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
6358 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6359 case Intrinsic::aarch64_sve_revw:
6360 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
6361 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6362 case Intrinsic::aarch64_sve_revd:
6363 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
6364 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6365 case Intrinsic::aarch64_sve_sxtb:
6366 return DAG.getNode(
6367 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6368 Op.getOperand(2), Op.getOperand(3),
6369 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6370 Op.getOperand(1));
6371 case Intrinsic::aarch64_sve_sxth:
6372 return DAG.getNode(
6373 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6374 Op.getOperand(2), Op.getOperand(3),
6375 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6376 Op.getOperand(1));
6377 case Intrinsic::aarch64_sve_sxtw:
6378 return DAG.getNode(
6379 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6380 Op.getOperand(2), Op.getOperand(3),
6381 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6382 Op.getOperand(1));
6383 case Intrinsic::aarch64_sve_uxtb:
6384 return DAG.getNode(
6385 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6386 Op.getOperand(2), Op.getOperand(3),
6387 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6388 Op.getOperand(1));
6389 case Intrinsic::aarch64_sve_uxth:
6390 return DAG.getNode(
6391 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6392 Op.getOperand(2), Op.getOperand(3),
6393 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6394 Op.getOperand(1));
6395 case Intrinsic::aarch64_sve_uxtw:
6396 return DAG.getNode(
6397 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
6398 Op.getOperand(2), Op.getOperand(3),
6399 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6400 Op.getOperand(1));
6401 case Intrinsic::localaddress: {
6402 const auto &MF = DAG.getMachineFunction();
6403 const auto *RegInfo = Subtarget->getRegisterInfo();
6404 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6405 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
6406 Op.getSimpleValueType());
6407 }
6408
6409 case Intrinsic::eh_recoverfp: {
6410 // FIXME: This needs to be implemented to correctly handle highly aligned
6411 // stack objects. For now we simply return the incoming FP. Refer D53541
6412 // for more details.
6413 SDValue FnOp = Op.getOperand(1);
6414 SDValue IncomingFPOp = Op.getOperand(2);
6415 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
6416 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
6417 if (!Fn)
6418 report_fatal_error(
6419 "llvm.eh.recoverfp must take a function as the first argument");
6420 return IncomingFPOp;
6421 }
6422
6423 case Intrinsic::aarch64_neon_vsri:
6424 case Intrinsic::aarch64_neon_vsli:
6425 case Intrinsic::aarch64_sve_sri:
6426 case Intrinsic::aarch64_sve_sli: {
6427 EVT Ty = Op.getValueType();
6428
6429 if (!Ty.isVector())
6430 report_fatal_error("Unexpected type for aarch64_neon_vsli");
6431
6432 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6433
6434 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6435 IntNo == Intrinsic::aarch64_sve_sri;
6436 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6437 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
6438 Op.getOperand(3));
6439 }
6440
6441 case Intrinsic::aarch64_neon_srhadd:
6442 case Intrinsic::aarch64_neon_urhadd:
6443 case Intrinsic::aarch64_neon_shadd:
6444 case Intrinsic::aarch64_neon_uhadd: {
6445 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6446 IntNo == Intrinsic::aarch64_neon_shadd);
6447 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6448 IntNo == Intrinsic::aarch64_neon_urhadd);
6449 unsigned Opcode = IsSignedAdd
6450 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6451 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6452 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
6453 Op.getOperand(2));
6454 }
6455 case Intrinsic::aarch64_neon_saddlp:
6456 case Intrinsic::aarch64_neon_uaddlp: {
6457 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6458 ? AArch64ISD::UADDLP
6459 : AArch64ISD::SADDLP;
6460 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
6461 }
6462 case Intrinsic::aarch64_neon_sdot:
6463 case Intrinsic::aarch64_neon_udot:
6464 case Intrinsic::aarch64_sve_sdot:
6465 case Intrinsic::aarch64_sve_udot: {
6466 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6467 IntNo == Intrinsic::aarch64_sve_udot)
6468 ? AArch64ISD::UDOT
6469 : AArch64ISD::SDOT;
6470 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
6471 Op.getOperand(2), Op.getOperand(3));
6472 }
6473 case Intrinsic::aarch64_neon_usdot:
6474 case Intrinsic::aarch64_sve_usdot: {
6475 return DAG.getNode(AArch64ISD::USDOT, dl, Op.getValueType(),
6476 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6477 }
6478 case Intrinsic::get_active_lane_mask: {
6479 SDValue ID =
6480 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
6481
6482 EVT VT = Op.getValueType();
6483 if (VT.isScalableVector())
6484 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
6485 Op.getOperand(2));
6486
6487 // We can use the SVE whilelo instruction to lower this intrinsic by
6488 // creating the appropriate sequence of scalable vector operations and
6489 // then extracting a fixed-width subvector from the scalable vector.
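// Sketch of the sequence below: whilelo produces a scalable predicate, the
// predicate is sign-extended into an integer container vector, and the
// leading fixed-width part of that vector is then extracted as the result.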
6490
6491 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
6492 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
6493
6494 SDValue Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WhileVT, ID,
6495 Op.getOperand(1), Op.getOperand(2));
6496 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, dl, ContainerVT, Mask);
6497 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
6498 DAG.getVectorIdxConstant(0, dl));
6499 }
6500 case Intrinsic::aarch64_neon_saddlv:
6501 case Intrinsic::aarch64_neon_uaddlv: {
6502 EVT OpVT = Op.getOperand(1).getValueType();
6503 EVT ResVT = Op.getValueType();
6504 assert(
6505 ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6506 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6507 (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6508 "Unexpected aarch64_neon_u/saddlv type");
6509 (void)OpVT;
6510 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
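// e.g. for an i32 result the UADDLV/SADDLV node is created with type v4i32
// and lane 0 is extracted directly, avoiding an insert_subvector to widen a
// v2i32 value.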
6511 SDValue ADDLV = DAG.getNode(
6512 IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6513 : AArch64ISD::SADDLV,
6514 dl, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6515 SDValue EXTRACT_VEC_ELT = DAG.getNode(
6516 ISD::EXTRACT_VECTOR_ELT, dl, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6517 ADDLV, DAG.getConstant(0, dl, MVT::i64));
6518 return EXTRACT_VEC_ELT;
6519 }
6520 case Intrinsic::experimental_cttz_elts: {
6521 SDValue CttzOp = Op.getOperand(1);
6522 EVT VT = CttzOp.getValueType();
6523 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6524
6525 if (VT.isFixedLengthVector()) {
6526 // We can use SVE instructions to lower this intrinsic by first creating
6527 // an SVE predicate register mask from the fixed-width vector.
6528 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
6529 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, NewVT, CttzOp);
6530 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6531 }
6532
6533 SDValue NewCttzElts =
6534 DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, CttzOp);
6535 return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
6536 }
6537 case Intrinsic::experimental_vector_match: {
6538 return LowerVectorMatch(Op, DAG);
6539 }
6540 }
6541}
6542
6543bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6544 if (VT.getVectorElementType() == MVT::i8 ||
6545 VT.getVectorElementType() == MVT::i16) {
6546 EltTy = MVT::i32;
6547 return true;
6548 }
6549 return false;
6550}
6551
6552bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6553 EVT DataVT) const {
6554 const EVT IndexVT = Extend.getOperand(0).getValueType();
6555 // SVE only supports implicit extension of 32-bit indices.
6556 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6557 return false;
6558
6559 // Indices cannot be smaller than the main data type.
6560 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6561 return false;
6562
6563 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6564 // element container type, which would violate the previous clause.
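// e.g. for nxv2i32 data the elements effectively live in 64-bit containers,
// so a 32-bit index would end up narrower than the stored element container.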
6565 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6566}
6567
6568bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6569 EVT ExtVT = ExtVal.getValueType();
6570 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6571 return false;
6572
6573 // It may be worth creating extending masked loads if there are multiple
6574 // masked loads using the same predicate. That way we'll end up creating
6575 // extending masked loads that may then get split by the legaliser. This
6576 // results in just one set of predicate unpacks at the start, instead of
6577 // multiple sets of vector unpacks after each load.
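// Roughly: if several masked loads share one predicate and are widened as
// extending loads, the legaliser unpacks that predicate once up front,
// instead of unpacking each load's result vector separately afterwards.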
6578 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6579 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6580 // Disable extending masked loads for fixed-width for now, since the code
6581 // quality doesn't look great.
6582 if (!ExtVT.isScalableVector())
6583 return false;
6584
6585 unsigned NumExtMaskedLoads = 0;
6586 for (auto *U : Ld->getMask()->users())
6587 if (isa<MaskedLoadSDNode>(U))
6588 NumExtMaskedLoads++;
6589
6590 if (NumExtMaskedLoads <= 1)
6591 return false;
6592 }
6593 }
6594
6595 return true;
6596}
6597
6598unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6599 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6600 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
6601 AArch64ISD::GLD1_MERGE_ZERO},
6602 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
6603 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6604 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
6605 AArch64ISD::GLD1_MERGE_ZERO},
6606 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
6607 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6608 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
6609 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6610 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
6611 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6612 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
6613 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6614 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
6615 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6616 };
6617 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
6618 return AddrModes.find(Key)->second;
6619}
6620
6621unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6622 switch (Opcode) {
6623 default:
6624 llvm_unreachable("unimplemented opcode");
6625 return Opcode;
6626 case AArch64ISD::GLD1_MERGE_ZERO:
6627 return AArch64ISD::GLD1S_MERGE_ZERO;
6628 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6629 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6630 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6631 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6632 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6633 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6634 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6635 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6636 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6637 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6638 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6639 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6640 }
6641}
6642
6643SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6644 SelectionDAG &DAG) const {
6645 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
6646
6647 SDLoc DL(Op);
6648 SDValue Chain = MGT->getChain();
6649 SDValue PassThru = MGT->getPassThru();
6650 SDValue Mask = MGT->getMask();
6651 SDValue BasePtr = MGT->getBasePtr();
6652 SDValue Index = MGT->getIndex();
6653 SDValue Scale = MGT->getScale();
6654 EVT VT = Op.getValueType();
6655 EVT MemVT = MGT->getMemoryVT();
6656 ISD::LoadExtType ExtType = MGT->getExtensionType();
6657 ISD::MemIndexType IndexType = MGT->getIndexType();
6658
6659 // SVE supports zero (and so undef) passthrough values only; everything else
6660 // must be handled manually by an explicit select on the load's output.
6661 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
6662 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6663 SDValue Load =
6664 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6665 MGT->getMemOperand(), IndexType, ExtType);
6666 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6667 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
6668 }
6669
6670 bool IsScaled = MGT->isIndexScaled();
6671 bool IsSigned = MGT->isIndexSigned();
6672
6673 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6674 // must be calculated beforehand.
6675 uint64_t ScaleVal = Scale->getAsZExtVal();
6676 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6677 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6678 EVT IndexVT = Index.getValueType();
6679 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6680 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6681 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6682
6683 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6684 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6685 MGT->getMemOperand(), IndexType, ExtType);
6686 }
6687
6688 // Lower fixed length gather to a scalable equivalent.
6689 if (VT.isFixedLengthVector()) {
6690 assert(Subtarget->useSVEForFixedLengthVectors() &&
6691 "Cannot lower when not using SVE for fixed vectors!");
6692
6693 // NOTE: Handle floating-point as if integer then bitcast the result.
6694 EVT DataVT = VT.changeVectorElementTypeToInteger();
6695 MemVT = MemVT.changeVectorElementTypeToInteger();
6696
6697 // Find the smallest integer fixed length vector we can use for the gather.
6698 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6699 if (DataVT.getVectorElementType() == MVT::i64 ||
6700 Index.getValueType().getVectorElementType() == MVT::i64 ||
6701 Mask.getValueType().getVectorElementType() == MVT::i64)
6702 PromotedVT = VT.changeVectorElementType(MVT::i64);
6703
6704 // Promote vector operands except for passthrough, which we know is either
6705 // undef or zero, and thus best constructed directly.
6706 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6707 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6708 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6709
6710 // A promoted result type forces the need for an extending load.
6711 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6712 ExtType = ISD::EXTLOAD;
6713
6714 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6715
6716 // Convert fixed length vector operands to scalable.
6717 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6718 Index = convertToScalableVector(DAG, ContainerVT, Index);
6719 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6720 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
6721 : DAG.getConstant(0, DL, ContainerVT);
6722
6723 // Emit equivalent scalable vector gather.
6724 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6725 SDValue Load =
6726 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
6727 Ops, MGT->getMemOperand(), IndexType, ExtType);
6728
6729 // Extract fixed length data then convert to the required result type.
6730 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
6731 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
6732 if (VT.isFloatingPoint())
6733 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
6734
6735 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6736 }
6737
6738 // Everything else is legal.
6739 return Op;
6740}
6741
6742SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6743 SelectionDAG &DAG) const {
6744 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
6745
6746 SDLoc DL(Op);
6747 SDValue Chain = MSC->getChain();
6748 SDValue StoreVal = MSC->getValue();
6749 SDValue Mask = MSC->getMask();
6750 SDValue BasePtr = MSC->getBasePtr();
6751 SDValue Index = MSC->getIndex();
6752 SDValue Scale = MSC->getScale();
6753 EVT VT = StoreVal.getValueType();
6754 EVT MemVT = MSC->getMemoryVT();
6755 ISD::MemIndexType IndexType = MSC->getIndexType();
6756 bool Truncating = MSC->isTruncatingStore();
6757
6758 bool IsScaled = MSC->isIndexScaled();
6759 bool IsSigned = MSC->isIndexSigned();
6760
6761 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6762 // must be calculated beforehand.
6763 uint64_t ScaleVal = Scale->getAsZExtVal();
6764 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6765 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6766 EVT IndexVT = Index.getValueType();
6767 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6768 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6769 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6770
6771 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6772 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6773 MSC->getMemOperand(), IndexType, Truncating);
6774 }
6775
6776 // Lower fixed length scatter to a scalable equivalent.
6777 if (VT.isFixedLengthVector()) {
6778 assert(Subtarget->useSVEForFixedLengthVectors() &&
6779 "Cannot lower when not using SVE for fixed vectors!");
6780
6781 // Once bitcast we treat floating-point scatters as if integer.
6782 if (VT.isFloatingPoint()) {
6783 VT = VT.changeVectorElementTypeToInteger();
6784 MemVT = MemVT.changeVectorElementTypeToInteger();
6785 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6786 }
6787
6788 // Find the smallest integer fixed length vector we can use for the scatter.
6789 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6790 if (VT.getVectorElementType() == MVT::i64 ||
6791 Index.getValueType().getVectorElementType() == MVT::i64 ||
6792 Mask.getValueType().getVectorElementType() == MVT::i64)
6793 PromotedVT = VT.changeVectorElementType(MVT::i64);
6794
6795 // Promote vector operands.
6796 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6797 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6798 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6799 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6800
6801 // A promoted value type forces the need for a truncating store.
6802 if (PromotedVT != VT)
6803 Truncating = true;
6804
6805 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6806
6807 // Convert fixed length vector operands to scalable.
6808 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6809 Index = convertToScalableVector(DAG, ContainerVT, Index);
6810 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6811 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6812
6813 // Emit equivalent scalable vector scatter.
6814 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6815 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6816 MSC->getMemOperand(), IndexType, Truncating);
6817 }
6818
6819 // Everything else is legal.
6820 return Op;
6821}
6822
6823SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6824 SDLoc DL(Op);
6825 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6826 assert(LoadNode && "Expected custom lowering of a masked load node");
6827 EVT VT = Op->getValueType(0);
6828
6829 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6830 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6831
6832 SDValue PassThru = LoadNode->getPassThru();
6833 SDValue Mask = LoadNode->getMask();
6834
6835 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
6836 return Op;
6837
6838 SDValue Load = DAG.getMaskedLoad(
6839 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
6840 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
6841 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
6842 LoadNode->getExtensionType());
6843
6844 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6845
6846 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6847}
6848
6849// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6850static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
6851 EVT VT, EVT MemVT,
6852 SelectionDAG &DAG) {
6853 assert(VT.isVector() && "VT should be a vector type");
6854 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6855
6856 SDValue Value = ST->getValue();
6857
6858 // First extend the promoted v4i16 to v8i16, truncate to v8i8, and extract
6859 // the word lane which represents the v4i8 subvector. This optimizes the
6860 // store to:
6861 //
6862 // xtn v0.8b, v0.8h
6863 // str s0, [x0]
6864
6865 SDValue Undef = DAG.getUNDEF(MVT::i16);
6866 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6867 {Undef, Undef, Undef, Undef});
6868
6869 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6870 Value, UndefVec);
6871 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
6872
6873 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6874 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6875 Trunc, DAG.getConstant(0, DL, MVT::i64));
6876
6877 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
6878 ST->getBasePtr(), ST->getMemOperand());
6879}
6880
6881 // Custom lowering for any store, vector or scalar, and/or default or with a
6882 // truncate operation. Currently we only custom lower truncating stores from
6883 // vector v4i16 to v4i8 and volatile stores of i128.
6884SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6885 SelectionDAG &DAG) const {
6886 SDLoc Dl(Op);
6887 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
6888 assert (StoreNode && "Can only custom lower store nodes");
6889
6890 SDValue Value = StoreNode->getValue();
6891
6892 EVT VT = Value.getValueType();
6893 EVT MemVT = StoreNode->getMemoryVT();
6894
6895 if (VT.isVector()) {
6896 if (useSVEForFixedLengthVectorVT(
6897 VT,
6898 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6899 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6900
6901 unsigned AS = StoreNode->getAddressSpace();
6902 Align Alignment = StoreNode->getAlign();
6903 if (Alignment < MemVT.getStoreSize() &&
6904 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
6905 StoreNode->getMemOperand()->getFlags(),
6906 nullptr)) {
6907 return scalarizeVectorStore(StoreNode, DAG);
6908 }
6909
6910 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6911 MemVT == MVT::v4i8) {
6912 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
6913 }
6914 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
6915 // the custom lowering, as there are no un-paired non-temporal stores and
6916 // legalization will break up 256 bit inputs.
6917 ElementCount EC = MemVT.getVectorElementCount();
6918 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6919 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6920 (MemVT.getScalarSizeInBits() == 8u ||
6921 MemVT.getScalarSizeInBits() == 16u ||
6922 MemVT.getScalarSizeInBits() == 32u ||
6923 MemVT.getScalarSizeInBits() == 64u)) {
6924 SDValue Lo =
6925 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6926 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6927 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
6928 SDValue Hi =
6929 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6930 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6931 StoreNode->getValue(),
6932 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
6933 SDValue Result = DAG.getMemIntrinsicNode(
6934 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
6935 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6936 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6937 return Result;
6938 }
6939 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6940 return LowerStore128(Op, DAG);
6941 } else if (MemVT == MVT::i64x8) {
6942 SDValue Value = StoreNode->getValue();
6943 assert(Value->getValueType(0) == MVT::i64x8);
6944 SDValue Chain = StoreNode->getChain();
6945 SDValue Base = StoreNode->getBasePtr();
6946 EVT PtrVT = Base.getValueType();
6947 for (unsigned i = 0; i < 8; i++) {
6948 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6949 Value, DAG.getConstant(i, Dl, MVT::i32));
6950 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
6951 DAG.getConstant(i * 8, Dl, PtrVT));
6952 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
6953 StoreNode->getOriginalAlign());
6954 }
6955 return Chain;
6956 }
6957
6958 return SDValue();
6959}
6960
6961/// Lower atomic or volatile 128-bit stores to a single STP instruction.
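/// For example, a volatile i128 store is typically emitted as a single
/// "stp xA, xB, [xC]"; release-ordered atomic stores use STILP instead when
/// the subtarget provides the required features.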
6962SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6963 SelectionDAG &DAG) const {
6964 MemSDNode *StoreNode = cast<MemSDNode>(Op);
6965 assert(StoreNode->getMemoryVT() == MVT::i128);
6966 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6967
6968 bool IsStoreRelease =
6969 StoreNode->getMergedOrdering() == AtomicOrdering::Release;
6970 if (StoreNode->isAtomic())
6971 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6972 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6973 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
6974 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
6975
6976 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6977 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6978 ? StoreNode->getOperand(1)
6979 : StoreNode->getOperand(2);
6980 SDLoc DL(Op);
6981 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
6982 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
6983 if (DAG.getDataLayout().isBigEndian())
6984 std::swap(StoreValue.first, StoreValue.second);
6985 SDValue Result = DAG.getMemIntrinsicNode(
6986 Opcode, DL, DAG.getVTList(MVT::Other),
6987 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6988 StoreNode->getBasePtr()},
6989 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6990 return Result;
6991}
6992
6993SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6994 SelectionDAG &DAG) const {
6995 SDLoc DL(Op);
6996 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
6997 assert(LoadNode && "Expected custom lowering of a load node");
6998
6999 if (LoadNode->getMemoryVT() == MVT::i64x8) {
7000 SmallVector<SDValue, 8> Ops;
7001 SDValue Base = LoadNode->getBasePtr();
7002 SDValue Chain = LoadNode->getChain();
7003 EVT PtrVT = Base.getValueType();
7004 for (unsigned i = 0; i < 8; i++) {
7005 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
7006 DAG.getConstant(i * 8, DL, PtrVT));
7007 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
7008 LoadNode->getPointerInfo(),
7009 LoadNode->getOriginalAlign());
7010 Ops.push_back(Part);
7011 Chain = SDValue(Part.getNode(), 1);
7012 }
7013 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
7014 return DAG.getMergeValues({Loaded, Chain}, DL);
7015 }
7016
7017 // Custom lowering for extending v4i8 vector loads.
7018 EVT VT = Op->getValueType(0);
7019 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
7020
7021 if (LoadNode->getMemoryVT() != MVT::v4i8)
7022 return SDValue();
7023
7024 // Avoid generating unaligned loads.
7025 if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
7026 return SDValue();
7027
7028 unsigned ExtType;
7029 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
7030 ExtType = ISD::SIGN_EXTEND;
7031 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
7032 LoadNode->getExtensionType() == ISD::EXTLOAD)
7033 ExtType = ISD::ZERO_EXTEND;
7034 else
7035 return SDValue();
7036
7037 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
7038 LoadNode->getBasePtr(), MachinePointerInfo());
7039 SDValue Chain = Load.getValue(1);
7040 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
7041 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
7042 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
7043 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
7044 DAG.getConstant(0, DL, MVT::i64));
7045 if (VT == MVT::v4i32)
7046 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
7047 return DAG.getMergeValues({Ext, Chain}, DL);
7048}
7049
7050SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
7051 SelectionDAG &DAG) const {
7052 SDLoc DL(Op);
7053 SDValue Vec = Op.getOperand(0);
7054 SDValue Mask = Op.getOperand(1);
7055 SDValue Passthru = Op.getOperand(2);
7056 EVT VecVT = Vec.getValueType();
7057 EVT MaskVT = Mask.getValueType();
7058 EVT ElmtVT = VecVT.getVectorElementType();
7059 const bool IsFixedLength = VecVT.isFixedLengthVector();
7060 const bool HasPassthru = !Passthru.isUndef();
7061 unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
7062 EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7063
7064 assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
7065
7066 if (!Subtarget->isSVEAvailable())
7067 return SDValue();
7068
7069 if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
7070 return SDValue();
7071
7072 // Only <vscale x {4|2} x {i32|i64}> supported for compact.
7073 if (MinElmts != 2 && MinElmts != 4)
7074 return SDValue();
7075
7076 // We can use the SVE register containing the NEON vector in its lowest bits.
7077 if (IsFixedLength) {
7078 EVT ScalableVecVT =
7079 MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7080 EVT ScalableMaskVT = MVT::getScalableVectorVT(
7081 MaskVT.getVectorElementType().getSimpleVT(), MinElmts);
7082
7083 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7084 DAG.getUNDEF(ScalableVecVT), Vec,
7085 DAG.getConstant(0, DL, MVT::i64));
7086 Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
7087 DAG.getUNDEF(ScalableMaskVT), Mask,
7088 DAG.getConstant(0, DL, MVT::i64));
7089 Mask = DAG.getNode(ISD::TRUNCATE, DL,
7090 ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
7091 Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7092 DAG.getUNDEF(ScalableVecVT), Passthru,
7093 DAG.getConstant(0, DL, MVT::i64));
7094
7095 VecVT = Vec.getValueType();
7096 MaskVT = Mask.getValueType();
7097 }
7098
7099 // Get legal type for compact instruction
7100 EVT ContainerVT = getSVEContainerType(VecVT);
7101 EVT CastVT = VecVT.changeVectorElementTypeToInteger();
7102
7103 // Convert to i32 or i64 for smaller types, as these are the only supported
7104 // sizes for compact.
7105 if (ContainerVT != VecVT) {
7106 Vec = DAG.getBitcast(CastVT, Vec);
7107 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
7108 }
7109
7110 SDValue Compressed = DAG.getNode(
7111 ISD::INTRINSIC_WO_CHAIN, DL, VecVT,
7112 DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec);
7113
7114 // compact fills with 0s, so if our passthru is all 0s, do nothing here.
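// Otherwise CNTP counts the active lanes, WHILELO builds a mask covering
// exactly that many leading lanes, and the select below keeps the compacted
// elements in front while taking the passthru value everywhere else.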
7115 if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
7116 SDValue Offset = DAG.getNode(
7117 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7118 DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask);
7119
7120 SDValue IndexMask = DAG.getNode(
7121 ISD::INTRINSIC_WO_CHAIN, DL, MaskVT,
7122 DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64),
7123 DAG.getConstant(0, DL, MVT::i64), Offset);
7124
7125 Compressed =
7126 DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru);
7127 }
7128
7129 // Extracting from a legal SVE type before truncating produces better code.
7130 if (IsFixedLength) {
7131 Compressed = DAG.getNode(
7132 ISD::EXTRACT_SUBVECTOR, DL,
7133 FixedVecVT.changeVectorElementType(ContainerVT.getVectorElementType()),
7134 Compressed, DAG.getConstant(0, DL, MVT::i64));
7135 CastVT = FixedVecVT.changeVectorElementTypeToInteger();
7136 VecVT = FixedVecVT;
7137 }
7138
7139 // If we changed the element type before, we need to convert it back.
7140 if (ContainerVT != VecVT) {
7141 Compressed = DAG.getNode(ISD::TRUNCATE, DL, CastVT, Compressed);
7142 Compressed = DAG.getBitcast(VecVT, Compressed);
7143 }
7144
7145 return Compressed;
7146}
7147
7148// Generate SUBS and CSEL for integer abs.
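// i.e. abs(x) is computed as (x >= 0) ? x : 0 - x, with the comparison
// result taken from the flags of SUBS(x, 0) and selected on the PL condition.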
7149SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7150 MVT VT = Op.getSimpleValueType();
7151
7152 if (VT.isVector())
7153 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7154
7155 SDLoc DL(Op);
7156 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
7157 Op.getOperand(0));
7158 // Generate SUBS & CSEL.
7159 SDValue Cmp =
7160 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
7161 Op.getOperand(0), DAG.getConstant(0, DL, VT));
7162 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7163 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
7164 Cmp.getValue(1));
7165}
7166
7167static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
7168 SDValue Chain = Op.getOperand(0);
7169 SDValue Cond = Op.getOperand(1);
7170 SDValue Dest = Op.getOperand(2);
7171
7172 AArch64CC::CondCode CC;
7173 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
7174 SDLoc dl(Op);
7175 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
7176 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
7177 Cmp);
7178 }
7179
7180 return SDValue();
7181}
7182
7183// Treat FSHR with constant shifts as a legal operation; otherwise it is
7184// expanded. FSHL is converted to FSHR before deciding what to do with it.
7185static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
7186 SDValue Shifts = Op.getOperand(2);
7187 // Check if the shift amount is a constant
7188 // If opcode is FSHL, convert it to FSHR
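// e.g. on i32, fshl(a, b, 3) is rewritten as fshr(a, b, 29), since a funnel
// shift left by N is equivalent to a funnel shift right by BitWidth - N.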
7189 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
7190 SDLoc DL(Op);
7191 MVT VT = Op.getSimpleValueType();
7192
7193 if (Op.getOpcode() == ISD::FSHL) {
7194 unsigned int NewShiftNo =
7195 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
7196 return DAG.getNode(
7197 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7198 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7199 } else if (Op.getOpcode() == ISD::FSHR) {
7200 return Op;
7201 }
7202 }
7203
7204 return SDValue();
7205}
7206
7207static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
7208 SDValue X = Op.getOperand(0);
7209 EVT XScalarTy = X.getValueType();
7210 SDValue Exp = Op.getOperand(1);
7211
7212 SDLoc DL(Op);
7213 EVT XVT, ExpVT;
7214 switch (Op.getSimpleValueType().SimpleTy) {
7215 default:
7216 return SDValue();
7217 case MVT::bf16:
7218 case MVT::f16:
7219 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
7220 [[fallthrough]];
7221 case MVT::f32:
7222 XVT = MVT::nxv4f32;
7223 ExpVT = MVT::nxv4i32;
7224 break;
7225 case MVT::f64:
7226 XVT = MVT::nxv2f64;
7227 ExpVT = MVT::nxv2i64;
7228 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
7229 break;
7230 }
7231
7232 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7233 SDValue VX =
7234 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
7235 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
7236 DAG.getUNDEF(ExpVT), Exp, Zero);
7237 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
7238 AArch64SVEPredPattern::all);
7239 SDValue FScale =
7240 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
7241 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
7242 VPg, VX, VExp);
7243 SDValue Final =
7244 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
7245 if (X.getValueType() != XScalarTy)
7246 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
7247 DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
7248 return Final;
7249}
7250
7251SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7252 SelectionDAG &DAG) const {
7253 // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
7254 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
7255 report_fatal_error(
7256 "ADJUST_TRAMPOLINE operation is only supported on Linux.");
7257
7258 return Op.getOperand(0);
7259}
7260
7261SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7262 SelectionDAG &DAG) const {
7263
7264 // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
7265 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
7266 report_fatal_error("INIT_TRAMPOLINE operation is only supported on Linux.");
7267
7268 SDValue Chain = Op.getOperand(0);
7269 SDValue Trmp = Op.getOperand(1); // trampoline
7270 SDValue FPtr = Op.getOperand(2); // nested function
7271 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7272 SDLoc dl(Op);
7273
7274 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7275 Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
7276
7279
7277 TargetLowering::ArgListTy Args;
7278 TargetLowering::ArgListEntry Entry;
7280 Entry.Ty = IntPtrTy;
7281 Entry.Node = Trmp;
7282 Args.push_back(Entry);
7283
7284 if (auto *FI = dyn_cast<FrameIndexSDNode>(Trmp.getNode())) {
7285 MachineFunction &MF = DAG.getMachineFunction();
7286 MachineFrameInfo &MFI = MF.getFrameInfo();
7287 Entry.Node =
7288 DAG.getConstant(MFI.getObjectSize(FI->getIndex()), dl, MVT::i64);
7289 } else
7290 Entry.Node = DAG.getConstant(36, dl, MVT::i64);
7291
7292 Args.push_back(Entry);
7293 Entry.Node = FPtr;
7294 Args.push_back(Entry);
7295 Entry.Node = Nest;
7296 Args.push_back(Entry);
7297
7298 // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
7299 TargetLowering::CallLoweringInfo CLI(DAG);
7300 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
7301 CallingConv::C, Type::getVoidTy(*DAG.getContext()),
7302 DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));
7303
7304 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
7305 return CallResult.second;
7306}
7307
7308SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
7309 SelectionDAG &DAG) const {
7310 LLVM_DEBUG(dbgs() << "Custom lowering: ");
7311 LLVM_DEBUG(Op.dump());
7312
7313 switch (Op.getOpcode()) {
7314 default:
7315 llvm_unreachable("unimplemented operand");
7316 return SDValue();
7317 case ISD::BITCAST:
7318 return LowerBITCAST(Op, DAG);
7319 case ISD::GlobalAddress:
7320 return LowerGlobalAddress(Op, DAG);
7321 case ISD::GlobalTLSAddress:
7322 return LowerGlobalTLSAddress(Op, DAG);
7323 case ISD::PtrAuthGlobalAddress:
7324 return LowerPtrAuthGlobalAddress(Op, DAG);
7325 case ISD::ADJUST_TRAMPOLINE:
7326 return LowerADJUST_TRAMPOLINE(Op, DAG);
7327 case ISD::INIT_TRAMPOLINE:
7328 return LowerINIT_TRAMPOLINE(Op, DAG);
7329 case ISD::SETCC:
7330 case ISD::STRICT_FSETCC:
7331 case ISD::STRICT_FSETCCS:
7332 return LowerSETCC(Op, DAG);
7333 case ISD::SETCCCARRY:
7334 return LowerSETCCCARRY(Op, DAG);
7335 case ISD::BRCOND:
7336 return LowerBRCOND(Op, DAG);
7337 case ISD::BR_CC:
7338 return LowerBR_CC(Op, DAG);
7339 case ISD::SELECT:
7340 return LowerSELECT(Op, DAG);
7341 case ISD::SELECT_CC:
7342 return LowerSELECT_CC(Op, DAG);
7343 case ISD::JumpTable:
7344 return LowerJumpTable(Op, DAG);
7345 case ISD::BR_JT:
7346 return LowerBR_JT(Op, DAG);
7347 case ISD::BRIND:
7348 return LowerBRIND(Op, DAG);
7349 case ISD::ConstantPool:
7350 return LowerConstantPool(Op, DAG);
7351 case ISD::BlockAddress:
7352 return LowerBlockAddress(Op, DAG);
7353 case ISD::VASTART:
7354 return LowerVASTART(Op, DAG);
7355 case ISD::VACOPY:
7356 return LowerVACOPY(Op, DAG);
7357 case ISD::VAARG:
7358 return LowerVAARG(Op, DAG);
7359 case ISD::UADDO_CARRY:
7360 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
7361 case ISD::USUBO_CARRY:
7362 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
7363 case ISD::SADDO_CARRY:
7364 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
7365 case ISD::SSUBO_CARRY:
7366 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
7367 case ISD::SADDO:
7368 case ISD::UADDO:
7369 case ISD::SSUBO:
7370 case ISD::USUBO:
7371 case ISD::SMULO:
7372 case ISD::UMULO:
7373 return LowerXALUO(Op, DAG);
7374 case ISD::FADD:
7375 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
7376 case ISD::FSUB:
7377 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
7378 case ISD::FMUL:
7379 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
7380 case ISD::FMA:
7381 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
7382 case ISD::FDIV:
7383 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
7384 case ISD::FNEG:
7385 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7386 case ISD::FCEIL:
7387 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
7388 case ISD::FFLOOR:
7389 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
7390 case ISD::FNEARBYINT:
7391 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
7392 case ISD::FRINT:
7393 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
7394 case ISD::FROUND:
7395 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
7396 case ISD::FROUNDEVEN:
7397 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
7398 case ISD::FTRUNC:
7399 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
7400 case ISD::FSQRT:
7401 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
7402 case ISD::FABS:
7403 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
7404 case ISD::FP_ROUND:
7405 case ISD::STRICT_FP_ROUND:
7406 return LowerFP_ROUND(Op, DAG);
7407 case ISD::FP_EXTEND:
7408 case ISD::STRICT_FP_EXTEND:
7409 return LowerFP_EXTEND(Op, DAG);
7410 case ISD::FRAMEADDR:
7411 return LowerFRAMEADDR(Op, DAG);
7412 case ISD::SPONENTRY:
7413 return LowerSPONENTRY(Op, DAG);
7414 case ISD::RETURNADDR:
7415 return LowerRETURNADDR(Op, DAG);
7416 case ISD::ADDROFRETURNADDR:
7417 return LowerADDROFRETURNADDR(Op, DAG);
7418 case ISD::CONCAT_VECTORS:
7419 return LowerCONCAT_VECTORS(Op, DAG);
7420 case ISD::INSERT_VECTOR_ELT:
7421 return LowerINSERT_VECTOR_ELT(Op, DAG);
7422 case ISD::EXTRACT_VECTOR_ELT:
7423 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7424 case ISD::BUILD_VECTOR:
7425 return LowerBUILD_VECTOR(Op, DAG);
7426 case ISD::ZERO_EXTEND_VECTOR_INREG:
7427 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
7428 case ISD::VECTOR_SHUFFLE:
7429 return LowerVECTOR_SHUFFLE(Op, DAG);
7430 case ISD::SPLAT_VECTOR:
7431 return LowerSPLAT_VECTOR(Op, DAG);
7432 case ISD::EXTRACT_SUBVECTOR:
7433 return LowerEXTRACT_SUBVECTOR(Op, DAG);
7434 case ISD::INSERT_SUBVECTOR:
7435 return LowerINSERT_SUBVECTOR(Op, DAG);
7436 case ISD::SDIV:
7437 case ISD::UDIV:
7438 return LowerDIV(Op, DAG);
7439 case ISD::SMIN:
7440 case ISD::UMIN:
7441 case ISD::SMAX:
7442 case ISD::UMAX:
7443 return LowerMinMax(Op, DAG);
7444 case ISD::SRA:
7445 case ISD::SRL:
7446 case ISD::SHL:
7447 return LowerVectorSRA_SRL_SHL(Op, DAG);
7448 case ISD::SHL_PARTS:
7449 case ISD::SRL_PARTS:
7450 case ISD::SRA_PARTS:
7451 return LowerShiftParts(Op, DAG);
7452 case ISD::CTPOP:
7453 case ISD::PARITY:
7454 return LowerCTPOP_PARITY(Op, DAG);
7455 case ISD::FCOPYSIGN:
7456 return LowerFCOPYSIGN(Op, DAG);
7457 case ISD::OR:
7458 return LowerVectorOR(Op, DAG);
7459 case ISD::XOR:
7460 return LowerXOR(Op, DAG);
7461 case ISD::PREFETCH:
7462 return LowerPREFETCH(Op, DAG);
7463 case ISD::SINT_TO_FP:
7464 case ISD::UINT_TO_FP:
7465 case ISD::STRICT_SINT_TO_FP:
7466 case ISD::STRICT_UINT_TO_FP:
7467 return LowerINT_TO_FP(Op, DAG);
7468 case ISD::FP_TO_SINT:
7469 case ISD::FP_TO_UINT:
7470 case ISD::STRICT_FP_TO_SINT:
7471 case ISD::STRICT_FP_TO_UINT:
7472 return LowerFP_TO_INT(Op, DAG);
7473 case ISD::FP_TO_SINT_SAT:
7474 case ISD::FP_TO_UINT_SAT:
7475 return LowerFP_TO_INT_SAT(Op, DAG);
7476 case ISD::FSINCOS:
7477 return LowerFSINCOS(Op, DAG);
7478 case ISD::GET_ROUNDING:
7479 return LowerGET_ROUNDING(Op, DAG);
7480 case ISD::SET_ROUNDING:
7481 return LowerSET_ROUNDING(Op, DAG);
7482 case ISD::GET_FPMODE:
7483 return LowerGET_FPMODE(Op, DAG);
7484 case ISD::SET_FPMODE:
7485 return LowerSET_FPMODE(Op, DAG);
7486 case ISD::RESET_FPMODE:
7487 return LowerRESET_FPMODE(Op, DAG);
7488 case ISD::MUL:
7489 return LowerMUL(Op, DAG);
7490 case ISD::MULHS:
7491 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
7492 case ISD::MULHU:
7493 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
7494 case ISD::INTRINSIC_W_CHAIN:
7495 return LowerINTRINSIC_W_CHAIN(Op, DAG);
7496 case ISD::INTRINSIC_WO_CHAIN:
7497 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7498 case ISD::INTRINSIC_VOID:
7499 return LowerINTRINSIC_VOID(Op, DAG);
7500 case ISD::ATOMIC_STORE:
7501 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
7502 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
7503 return LowerStore128(Op, DAG);
7504 }
7505 return SDValue();
7506 case ISD::STORE:
7507 return LowerSTORE(Op, DAG);
7508 case ISD::MSTORE:
7509 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
7510 case ISD::MGATHER:
7511 return LowerMGATHER(Op, DAG);
7512 case ISD::MSCATTER:
7513 return LowerMSCATTER(Op, DAG);
7514 case ISD::VECREDUCE_SEQ_FADD:
7515 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
7516 case ISD::VECREDUCE_ADD:
7517 case ISD::VECREDUCE_AND:
7518 case ISD::VECREDUCE_OR:
7519 case ISD::VECREDUCE_XOR:
7520 case ISD::VECREDUCE_SMAX:
7521 case ISD::VECREDUCE_SMIN:
7522 case ISD::VECREDUCE_UMAX:
7523 case ISD::VECREDUCE_UMIN:
7524 case ISD::VECREDUCE_FADD:
7525 case ISD::VECREDUCE_FMAX:
7526 case ISD::VECREDUCE_FMIN:
7527 case ISD::VECREDUCE_FMAXIMUM:
7528 case ISD::VECREDUCE_FMINIMUM:
7529 return LowerVECREDUCE(Op, DAG);
7530 case ISD::ATOMIC_LOAD_AND:
7531 return LowerATOMIC_LOAD_AND(Op, DAG);
7532 case ISD::DYNAMIC_STACKALLOC:
7533 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7534 case ISD::VSCALE:
7535 return LowerVSCALE(Op, DAG);
7536 case ISD::VECTOR_COMPRESS:
7537 return LowerVECTOR_COMPRESS(Op, DAG);
7538 case ISD::ANY_EXTEND:
7539 case ISD::SIGN_EXTEND:
7540 case ISD::ZERO_EXTEND:
7541 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
7542 case ISD::SIGN_EXTEND_INREG: {
7543 // Only custom lower when ExtraVT has a legal byte based element type.
7544 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
7545 EVT ExtraEltVT = ExtraVT.getVectorElementType();
7546 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
7547 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
7548 return SDValue();
7549
7550 return LowerToPredicatedOp(Op, DAG,
7551 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
7552 }
7553 case ISD::TRUNCATE:
7554 return LowerTRUNCATE(Op, DAG);
7555 case ISD::MLOAD:
7556 return LowerMLOAD(Op, DAG);
7557 case ISD::LOAD:
7558 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
7559 !Subtarget->isNeonAvailable()))
7560 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
7561 return LowerLOAD(Op, DAG);
7562 case ISD::ADD:
7563 case ISD::AND:
7564 case ISD::SUB:
7565 return LowerToScalableOp(Op, DAG);
7566 case ISD::FMAXIMUM:
7567 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
7568 case ISD::FMAXNUM:
7569 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
7570 case ISD::FMINIMUM:
7571 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
7572 case ISD::FMINNUM:
7573 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
7574 case ISD::VSELECT:
7575 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
7576 case ISD::ABS:
7577 return LowerABS(Op, DAG);
7578 case ISD::ABDS:
7579 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
7580 case ISD::ABDU:
7581 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
7582 case ISD::AVGFLOORS:
7583 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
7584 case ISD::AVGFLOORU:
7585 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
7586 case ISD::AVGCEILS:
7587 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
7588 case ISD::AVGCEILU:
7589 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
7590 case ISD::BITREVERSE:
7591 return LowerBitreverse(Op, DAG);
7592 case ISD::BSWAP:
7593 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
7594 case ISD::CTLZ:
7595 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
7596 case ISD::CTTZ:
7597 return LowerCTTZ(Op, DAG);
7598 case ISD::VECTOR_SPLICE:
7599 return LowerVECTOR_SPLICE(Op, DAG);
7600 case ISD::VECTOR_DEINTERLEAVE:
7601 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
7602 case ISD::VECTOR_INTERLEAVE:
7603 return LowerVECTOR_INTERLEAVE(Op, DAG);
7604 case ISD::LRINT:
7605 case ISD::LLRINT:
7606 if (Op.getValueType().isVector())
7607 return LowerVectorXRINT(Op, DAG);
7608 [[fallthrough]];
7609 case ISD::LROUND:
7610 case ISD::LLROUND: {
7611 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
7612 Op.getOperand(0).getValueType() == MVT::bf16) &&
7613 "Expected custom lowering of rounding operations only for f16");
7614 SDLoc DL(Op);
7615 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
7616 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
7617 }
7618 case ISD::STRICT_LROUND:
7619 case ISD::STRICT_LLROUND:
7620 case ISD::STRICT_LRINT:
7621 case ISD::STRICT_LLRINT: {
7622 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
7623 Op.getOperand(1).getValueType() == MVT::bf16) &&
7624 "Expected custom lowering of rounding operations only for f16");
7625 SDLoc DL(Op);
7626 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
7627 {Op.getOperand(0), Op.getOperand(1)});
7628 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
7629 {Ext.getValue(1), Ext.getValue(0)});
7630 }
7631 case ISD::WRITE_REGISTER: {
7632 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
7633 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
7634 SDLoc DL(Op);
7635
7636 SDValue Chain = Op.getOperand(0);
7637 SDValue SysRegName = Op.getOperand(1);
7638 std::pair<SDValue, SDValue> Pair =
7639 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
7640
7641 // chain = MSRR(chain, sysregname, lo, hi)
7642 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
7643 SysRegName, Pair.first, Pair.second);
7644
7645 return Result;
7646 }
7647 case ISD::FSHL:
7648 case ISD::FSHR:
7649 return LowerFunnelShift(Op, DAG);
7650 case ISD::FLDEXP:
7651 return LowerFLDEXP(Op, DAG);
7652 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
7653 return LowerVECTOR_HISTOGRAM(Op, DAG);
7654 }
7655}
7656
7657 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
7658 return !Subtarget->useSVEForFixedLengthVectors();
7659}
7660
7661 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
7662 EVT VT, bool OverrideNEON) const {
7663 if (!VT.isFixedLengthVector() || !VT.isSimple())
7664 return false;
7665
7666 // Don't use SVE for vectors we cannot scalarize if required.
7667 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7668 // Fixed length predicates should be promoted to i8.
7669 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
7670 case MVT::i1:
7671 default:
7672 return false;
7673 case MVT::i8:
7674 case MVT::i16:
7675 case MVT::i32:
7676 case MVT::i64:
7677 case MVT::f16:
7678 case MVT::f32:
7679 case MVT::f64:
7680 break;
7681 }
7682
7683 // NEON-sized vectors can be emulated using SVE instructions.
7684 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7685 return Subtarget->isSVEorStreamingSVEAvailable();
7686
7687 // Ensure NEON MVTs only belong to a single register class.
7688 if (VT.getFixedSizeInBits() <= 128)
7689 return false;
7690
7691 // Ensure wider than NEON code generation is enabled.
7692 if (!Subtarget->useSVEForFixedLengthVectors())
7693 return false;
7694
7695 // Don't use SVE for types that don't fit.
7696 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7697 return false;
7698
7699 // TODO: Perhaps an artificial restriction, but worth having whilst getting
7700 // the base fixed length SVE support in place.
7701 if (!VT.isPow2VectorType())
7702 return false;
7703
7704 return true;
7705}
7706
7707//===----------------------------------------------------------------------===//
7708// Calling Convention Implementation
7709//===----------------------------------------------------------------------===//
7710
7711static unsigned getIntrinsicID(const SDNode *N) {
7712 unsigned Opcode = N->getOpcode();
7713 switch (Opcode) {
7714 default:
7715 return Intrinsic::not_intrinsic;
7716 case ISD::INTRINSIC_WO_CHAIN: {
7717 unsigned IID = N->getConstantOperandVal(0);
7718 if (IID < Intrinsic::num_intrinsics)
7719 return IID;
7720 return Intrinsic::not_intrinsic;
7721 }
7722 }
7723}
7724
7725 bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
7726 SDValue N1) const {
7727 if (!N0.hasOneUse())
7728 return false;
7729
7730 unsigned IID = getIntrinsicID(N1.getNode());
7731 // Avoid reassociating expressions that can be lowered to smlal/umlal.
7732 if (IID == Intrinsic::aarch64_neon_umull ||
7733 N1.getOpcode() == AArch64ISD::UMULL ||
7734 IID == Intrinsic::aarch64_neon_smull ||
7735 N1.getOpcode() == AArch64ISD::SMULL)
7736 return N0.getOpcode() != ISD::ADD;
7737
7738 return true;
7739}
7740
7741/// Selects the correct CCAssignFn for a given CallingConvention value.
7742 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
7743 bool IsVarArg) const {
7744 switch (CC) {
7745 default:
7746 report_fatal_error("Unsupported calling convention.");
7747 case CallingConv::GHC:
7748 return CC_AArch64_GHC;
7749 case CallingConv::PreserveNone:
7750 // The VarArg implementation makes assumptions about register
7751 // argument passing that do not hold for preserve_none, so we
7752 // instead fall back to C argument passing.
7753 // The non-vararg case is handled in the CC function itself.
7754 if (!IsVarArg)
7755 return CC_AArch64_Preserve_None;
7756 [[fallthrough]];
7757 case CallingConv::C:
7758 case CallingConv::Fast:
7762 case CallingConv::Swift:
7763 case CallingConv::SwiftTail:
7764 case CallingConv::Tail:
7765 case CallingConv::GRAAL:
7766 if (Subtarget->isTargetWindows()) {
7767 if (IsVarArg) {
7768 if (Subtarget->isWindowsArm64EC())
7769 return CC_AArch64_Arm64EC_VarArg;
7770 return CC_AArch64_Win64_VarArg;
7771 }
7772 return CC_AArch64_Win64PCS;
7773 }
7774 if (!Subtarget->isTargetDarwin())
7775 return CC_AArch64_AAPCS;
7776 if (!IsVarArg)
7777 return CC_AArch64_DarwinPCS;
7778 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
7779 : CC_AArch64_DarwinPCS_VarArg;
7780 case CallingConv::Win64:
7781 if (IsVarArg) {
7782 if (Subtarget->isWindowsArm64EC())
7783 return CC_AArch64_Arm64EC_VarArg;
7784 return CC_AArch64_Win64_VarArg;
7785 }
7786 return CC_AArch64_Win64PCS;
7788 if (Subtarget->isWindowsArm64EC())
7796 return CC_AArch64_AAPCS;
7801 }
7802}
7803
7804CCAssignFn *
7805 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
7806 switch (CC) {
7807 default:
7808 return RetCC_AArch64_AAPCS;
7809 case CallingConv::ARM64EC_Thunk_X64:
7810 return RetCC_AArch64_Arm64EC_Thunk;
7811 case CallingConv::CFGuard_Check:
7812 if (Subtarget->isWindowsArm64EC())
7813 return RetCC_AArch64_Arm64EC_CFGuard_Check;
7814 return RetCC_AArch64_AAPCS;
7815 }
7816}
7817
7818static bool isPassedInFPR(EVT VT) {
7819 return VT.isFixedLengthVector() ||
7820 (VT.isFloatingPoint() && !VT.isScalableVector());
7821}
7822
7823SDValue AArch64TargetLowering::LowerFormalArguments(
7824 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
7825 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
7826 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
7827 MachineFunction &MF = DAG.getMachineFunction();
7828 const Function &F = MF.getFunction();
7829 MachineFrameInfo &MFI = MF.getFrameInfo();
7830 bool IsWin64 =
7831 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
7832 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
7833 (isVarArg && Subtarget->isWindowsArm64EC());
7834 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7835
7836 SmallVector<ISD::OutputArg, 4> Outs;
7837 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
7838 DAG.getTargetLoweringInfo());
7839 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
7840 FuncInfo->setIsSVECC(true);
7841
7842 // Assign locations to all of the incoming arguments.
7843 SmallVector<CCValAssign, 16> ArgLocs;
7844 DenseMap<unsigned, SDValue> CopiedRegs;
7845 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
7846
7847 // At this point, Ins[].VT may already be promoted to i32. To correctly
7848 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
7849 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
7850 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
7851 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
7852 // LocVT.
7853 unsigned NumArgs = Ins.size();
7854 Function::const_arg_iterator CurOrigArg = F.arg_begin();
7855 unsigned CurArgIdx = 0;
7856 for (unsigned i = 0; i != NumArgs; ++i) {
7857 MVT ValVT = Ins[i].VT;
7858 if (Ins[i].isOrigArg()) {
7859 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
7860 CurArgIdx = Ins[i].getOrigArgIndex();
7861
7862 // Get type of the original argument.
7863 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
7864 /*AllowUnknown*/ true);
7865 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
7866 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7867 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7868 ValVT = MVT::i8;
7869 else if (ActualMVT == MVT::i16)
7870 ValVT = MVT::i16;
7871 }
7872 bool UseVarArgCC = false;
7873 if (IsWin64)
7874 UseVarArgCC = isVarArg;
7875 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
7876 bool Res =
7877 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
7878 assert(!Res && "Call operand has unhandled type");
7879 (void)Res;
7880 }
7881
7882 SMEAttrs Attrs(MF.getFunction());
7883 bool IsLocallyStreaming =
7884 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
7885 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
7886 SDValue Glue = Chain.getValue(1);
7887
7888 SmallVector<SDValue, 16> ArgValues;
7889 unsigned ExtraArgLocs = 0;
7890 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
7891 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7892
7893 if (Ins[i].Flags.isByVal()) {
7894 // Byval is used for HFAs in the PCS, but the system should work in a
7895 // non-compliant manner for larger structs.
7896 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7897 int Size = Ins[i].Flags.getByValSize();
7898 unsigned NumRegs = (Size + 7) / 8;
7899
7900 // FIXME: This works on big-endian for composite byvals, which are the common
7901 // case. It should also work for fundamental types.
7902 unsigned FrameIdx =
7903 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
7904 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
7905 InVals.push_back(FrameIdxN);
7906
7907 continue;
7908 }
7909
7910 if (Ins[i].Flags.isSwiftAsync())
7911 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
7912
7913 SDValue ArgValue;
7914 if (VA.isRegLoc()) {
7915 // Arguments stored in registers.
7916 EVT RegVT = VA.getLocVT();
7917 const TargetRegisterClass *RC;
7918
7919 if (RegVT == MVT::i32)
7920 RC = &AArch64::GPR32RegClass;
7921 else if (RegVT == MVT::i64)
7922 RC = &AArch64::GPR64RegClass;
7923 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
7924 RC = &AArch64::FPR16RegClass;
7925 else if (RegVT == MVT::f32)
7926 RC = &AArch64::FPR32RegClass;
7927 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
7928 RC = &AArch64::FPR64RegClass;
7929 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
7930 RC = &AArch64::FPR128RegClass;
7931 else if (RegVT.isScalableVector() &&
7932 RegVT.getVectorElementType() == MVT::i1) {
7933 FuncInfo->setIsSVECC(true);
7934 RC = &AArch64::PPRRegClass;
7935 } else if (RegVT == MVT::aarch64svcount) {
7936 FuncInfo->setIsSVECC(true);
7937 RC = &AArch64::PPRRegClass;
7938 } else if (RegVT.isScalableVector()) {
7939 FuncInfo->setIsSVECC(true);
7940 RC = &AArch64::ZPRRegClass;
7941 } else
7942 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
7943
7944 // Transform the arguments in physical registers into virtual ones.
7945 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
7946
7947 if (IsLocallyStreaming) {
7948 // LocallyStreamingFunctions must insert the SMSTART in the correct
7949 // position, so we use Glue to ensure no instructions can be scheduled
7950 // between the chain of:
7951 // t0: ch,glue = EntryNode
7952 // t1: res,ch,glue = CopyFromReg
7953 // ...
7954 // tn: res,ch,glue = CopyFromReg t(n-1), ..
7955 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
7956 // ^^^^^^
7957 // This will be the new Chain/Root node.
7958 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
7959 Glue = ArgValue.getValue(2);
7960 if (isPassedInFPR(ArgValue.getValueType())) {
7961 ArgValue =
7962 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
7963 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
7964 {ArgValue, Glue});
7965 Glue = ArgValue.getValue(1);
7966 }
7967 } else
7968 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
7969
7970 // If this is an 8, 16 or 32-bit value, it is really passed promoted
7971 // to 64 bits. Insert an assert[sz]ext to capture this, then
7972 // truncate to the right size.
7973 switch (VA.getLocInfo()) {
7974 default:
7975 llvm_unreachable("Unknown loc info!");
7976 case CCValAssign::Full:
7977 break;
7978 case CCValAssign::Indirect:
7979 assert(
7980 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7981 "Indirect arguments should be scalable on most subtargets");
7982 break;
7983 case CCValAssign::BCvt:
7984 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
7985 break;
7986 case CCValAssign::AExt:
7987 case CCValAssign::SExt:
7988 case CCValAssign::ZExt:
7989 break;
7990 case CCValAssign::AExtUpper:
7991 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
7992 DAG.getConstant(32, DL, RegVT));
7993 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
7994 break;
7995 }
7996 } else { // VA.isRegLoc()
7997 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7998 unsigned ArgOffset = VA.getLocMemOffset();
7999 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8000 ? VA.getLocVT().getSizeInBits()
8001 : VA.getValVT().getSizeInBits()) / 8;
8002
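// On big-endian targets a sub-8-byte argument is stored in the high bytes of
// its 8-byte stack slot, so the load offset is biased by 8 - ArgSize below.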
8003 uint32_t BEAlign = 0;
8004 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8005 !Ins[i].Flags.isInConsecutiveRegs())
8006 BEAlign = 8 - ArgSize;
8007
8008 SDValue FIN;
8009 MachinePointerInfo PtrInfo;
8010 if (StackViaX4) {
8011 // In both the ARM64EC varargs convention and the thunk convention,
8012 // arguments on the stack are accessed relative to x4, not sp. In
8013 // the thunk convention, there's an additional offset of 32 bytes
8014 // to account for the shadow store.
8015 unsigned ObjOffset = ArgOffset + BEAlign;
8016 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8017 ObjOffset += 32;
8018 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8019 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8020 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8021 DAG.getConstant(ObjOffset, DL, MVT::i64));
8022 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
8023 } else {
8024 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8025
8026 // Create load nodes to retrieve arguments from the stack.
8027 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8028 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8029 }
8030
8031 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
8032 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
8033 MVT MemVT = VA.getValVT();
8034
8035 switch (VA.getLocInfo()) {
8036 default:
8037 break;
8038 case CCValAssign::Trunc:
8039 case CCValAssign::BCvt:
8040 MemVT = VA.getLocVT();
8041 break;
8042 case CCValAssign::Indirect:
8043 assert((VA.getValVT().isScalableVT() ||
8044 Subtarget->isWindowsArm64EC()) &&
8045 "Indirect arguments should be scalable on most subtargets");
8046 MemVT = VA.getLocVT();
8047 break;
8048 case CCValAssign::SExt:
8049 ExtType = ISD::SEXTLOAD;
8050 break;
8051 case CCValAssign::ZExt:
8052 ExtType = ISD::ZEXTLOAD;
8053 break;
8054 case CCValAssign::AExt:
8055 ExtType = ISD::EXTLOAD;
8056 break;
8057 }
8058
8059 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8060 MemVT);
8061 }
8062
8063 if (VA.getLocInfo() == CCValAssign::Indirect) {
8064 assert((VA.getValVT().isScalableVT() ||
8065 Subtarget->isWindowsArm64EC()) &&
8066 "Indirect arguments should be scalable on most subtargets");
8067
8068 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
8069 unsigned NumParts = 1;
8070 if (Ins[i].Flags.isInConsecutiveRegs()) {
8071 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8072 ++NumParts;
8073 }
8074
8075 MVT PartLoad = VA.getValVT();
8076 SDValue Ptr = ArgValue;
8077
8078 // Ensure we generate all loads for each tuple part, whilst updating the
8079 // pointer after each load correctly using vscale.
8080 while (NumParts > 0) {
8081 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8082 InVals.push_back(ArgValue);
8083 NumParts--;
8084 if (NumParts > 0) {
8085 SDValue BytesIncrement;
8086 if (PartLoad.isScalableVector()) {
8087 BytesIncrement = DAG.getVScale(
8088 DL, Ptr.getValueType(),
8089 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8090 } else {
8091 BytesIncrement = DAG.getConstant(
8092 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8093 Ptr.getValueType());
8094 }
8095 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8096 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8097 ExtraArgLocs++;
8098 i++;
8099 }
8100 }
8101 } else {
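// On ILP32, pointer arguments occupy only the low 32 bits of their 64-bit
// location; the AssertZext below records that the upper bits are zero.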
8102 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8103 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8104 ArgValue, DAG.getValueType(MVT::i32));
8105
8106 // i1 arguments are zero-extended to i8 by the caller. Emit a
8107 // hint to reflect this.
8108 if (Ins[i].isOrigArg()) {
8109 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
8110 if (OrigArg->getType()->isIntegerTy(1)) {
8111 if (!Ins[i].Flags.isZExt()) {
8112 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8113 ArgValue.getValueType(), ArgValue);
8114 }
8115 }
8116 }
8117
8118 InVals.push_back(ArgValue);
8119 }
8120 }
8121 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8122
8123 // Insert the SMSTART if this is a locally streaming function and
8124 // make sure it is Glued to the last CopyFromReg value.
8125 if (IsLocallyStreaming) {
8126 SDValue PStateSM;
8127 if (Attrs.hasStreamingCompatibleInterface()) {
8128 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
8131 FuncInfo->setPStateSMReg(Reg);
8132 Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
8133 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8135 } else
8136 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8138
8139 // Ensure that the SMSTART happens after the CopyWithChain such that its
8140 // chain result is used.
8141 for (unsigned I=0; I<InVals.size(); ++I) {
8144 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
8145 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
8146 InVals[I].getValueType());
8147 }
8148 }
8149
8150 // varargs
8151 if (isVarArg) {
8152 if (!Subtarget->isTargetDarwin() || IsWin64) {
8153 // The AAPCS variadic function ABI is identical to the non-variadic
8154 // one. As a result there may be more arguments in registers and we should
8155 // save them for future reference.
8156 // Win64 variadic functions also pass arguments in registers, but all float
8157 // arguments are passed in integer registers.
8158 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8159 }
8160
8161 // This will point to the next argument passed via stack.
8162 unsigned VarArgsOffset = CCInfo.getStackSize();
8163 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
8164 VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
8165 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8166 FuncInfo->setVarArgsStackIndex(
8167 MFI.CreateFixedObject(4, VarArgsOffset, true));
8168
8169 if (MFI.hasMustTailInVarArgFunc()) {
8170 SmallVector<MVT, 2> RegParmTypes;
8171 RegParmTypes.push_back(MVT::i64);
8172 RegParmTypes.push_back(MVT::f128);
8173 // Compute the set of forwarded registers. The rest are scratch.
8174 SmallVectorImpl<ForwardedRegister> &Forwards =
8175 FuncInfo->getForwardedMustTailRegParms();
8176 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8177 CC_AArch64_AAPCS);
8178
8179 // Conservatively forward X8, since it might be used for aggregate return.
8180 if (!CCInfo.isAllocated(AArch64::X8)) {
8181 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
8182 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8183 }
8184 }
8185 }
8186
8187 // On Windows, InReg pointers must be returned, so record the pointer in a
8188 // virtual register at the start of the function so it can be returned in the
8189 // epilogue.
8190 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8191 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8192 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8193 Ins[I].Flags.isInReg()) &&
8194 Ins[I].Flags.isSRet()) {
8195 assert(!FuncInfo->getSRetReturnReg());
8196
8197 MVT PtrTy = getPointerTy(DAG.getDataLayout());
8198 Register Reg =
8199 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
8200 FuncInfo->setSRetReturnReg(Reg);
8201
8202 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
8203 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
8204 break;
8205 }
8206 }
8207 }
8208
8209 unsigned StackArgSize = CCInfo.getStackSize();
8210 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8211 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
8212 // This is a non-standard ABI so by fiat I say we're allowed to make full
8213 // use of the stack area to be popped, which must be aligned to 16 bytes in
8214 // any case:
8215 StackArgSize = alignTo(StackArgSize, 16);
8216
8217 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
8218 // a multiple of 16.
8219 FuncInfo->setArgumentStackToRestore(StackArgSize);
8220
8221 // This realignment carries over to the available bytes below. Our own
8222 // callers will guarantee the space is free by giving an aligned value to
8223 // CALLSEQ_START.
8224 }
8225 // Even if we're not expected to free up the space, it's useful to know how
8226 // much is there while considering tail calls (because we can reuse it).
8227 FuncInfo->setBytesInStackArgArea(StackArgSize);
8228
8229 if (Subtarget->hasCustomCallingConv())
8230 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
8231
8232 // Create a 16-byte TPIDR2 object. The dynamic buffer
8233 // will be expanded and stored in the static object later using a pseudonode.
8234 if (SMEAttrs(MF.getFunction()).hasZAState()) {
8235 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8236 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
8237 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8238 DAG.getConstant(1, DL, MVT::i32));
8239
8240 SDValue Buffer;
8241 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8243 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
8244 } else {
8245 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8246 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
8247 DAG.getVTList(MVT::i64, MVT::Other),
8248 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8249 MFI.CreateVariableSizedObject(Align(16), nullptr);
8250 }
8251 Chain = DAG.getNode(
8252 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
8253 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)});
8254 } else if (SMEAttrs(MF.getFunction()).hasAgnosticZAInterface()) {
8255 // Call __arm_sme_state_size().
8256 SDValue BufferSize =
8258 DAG.getVTList(MVT::i64, MVT::Other), Chain);
8259 Chain = BufferSize.getValue(1);
8260
8261 SDValue Buffer;
8262 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8263 Buffer =
8265 DAG.getVTList(MVT::i64, MVT::Other), {Chain, BufferSize});
8266 } else {
8267 // Allocate space dynamically.
8268 Buffer = DAG.getNode(
8269 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8270 {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
8271 MFI.CreateVariableSizedObject(Align(16), nullptr);
8272 }
8273
8274 // Copy the value to a virtual register, and save that in FuncInfo.
8275 Register BufferPtr =
8276 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8277 FuncInfo->setSMESaveBufferAddr(BufferPtr);
8278 Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
8279 }
8280
8281 if (CallConv == CallingConv::PreserveNone) {
8282 for (const ISD::InputArg &I : Ins) {
8283 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
8284 I.Flags.isSwiftAsync()) {
8287 MF.getFunction(),
8288 "Swift attributes can't be used with preserve_none",
8289 DL.getDebugLoc()));
8290 break;
8291 }
8292 }
8293 }
8294
8295 return Chain;
8296}
8297
8298void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
8299 SelectionDAG &DAG,
8300 const SDLoc &DL,
8301 SDValue &Chain) const {
8303 MachineFrameInfo &MFI = MF.getFrameInfo();
8305 auto PtrVT = getPointerTy(DAG.getDataLayout());
8306 Function &F = MF.getFunction();
8307 bool IsWin64 =
8308 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8309
8311
8313 unsigned NumGPRArgRegs = GPRArgRegs.size();
8314 if (Subtarget->isWindowsArm64EC()) {
8315 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
8316 // functions.
8317 NumGPRArgRegs = 4;
8318 }
8319 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
8320
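// Each unallocated GPR argument register takes one 8-byte slot in the
// register save area.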
8321 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
8322 int GPRIdx = 0;
8323 if (GPRSaveSize != 0) {
8324 if (IsWin64) {
8325 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
8326 if (GPRSaveSize & 15)
8327 // The extra size here, if triggered, will always be 8.
8328 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
8329 } else
8330 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
8331
8332 SDValue FIN;
8333 if (Subtarget->isWindowsArm64EC()) {
8334 // With the Arm64EC ABI, we reserve the save area as usual, but we
8335 // compute its address relative to x4. For a normal AArch64->AArch64
8336 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
8337 // different address.
8338 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8339 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8340 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
8341 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
8342 } else {
8343 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
8344 }
8345
8346 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
8347 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
8348 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8349 SDValue Store =
8350 DAG.getStore(Val.getValue(1), DL, Val, FIN,
8351 IsWin64 ? MachinePointerInfo::getFixedStack(
8352 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
8353 : MachinePointerInfo::getStack(MF, i * 8));
8354 MemOps.push_back(Store);
8355 FIN =
8356 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
8357 }
8358 }
8359 FuncInfo->setVarArgsGPRIndex(GPRIdx);
8360 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
8361
8362 if (Subtarget->hasFPARMv8() && !IsWin64) {
8364 const unsigned NumFPRArgRegs = FPRArgRegs.size();
8365 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
8366
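// Vararg FPRs are spilled as full 16-byte Q registers so the complete
// register contents are available to va_arg.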
8367 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
8368 int FPRIdx = 0;
8369 if (FPRSaveSize != 0) {
8370 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
8371
8372 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
8373
8374 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
8375 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
8376 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
8377
8378 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
8379 MachinePointerInfo::getStack(MF, i * 16));
8380 MemOps.push_back(Store);
8381 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
8382 DAG.getConstant(16, DL, PtrVT));
8383 }
8384 }
8385 FuncInfo->setVarArgsFPRIndex(FPRIdx);
8386 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
8387 }
8388
8389 if (!MemOps.empty()) {
8390 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
8391 }
8392}
8393
8394/// LowerCallResult - Lower the result values of a call into the
8395/// appropriate copies out of appropriate physical registers.
8396SDValue AArch64TargetLowering::LowerCallResult(
8397 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
8398 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
8399 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
8400 SDValue ThisVal, bool RequiresSMChange) const {
8401 DenseMap<unsigned, SDValue> CopiedRegs;
8402 // Copy all of the result registers out of their specified physreg.
8403 for (unsigned i = 0; i != RVLocs.size(); ++i) {
8404 CCValAssign VA = RVLocs[i];
8405
8406 // Pass 'this' value directly from the argument to return value, to avoid
8407 // reg unit interference
8408 if (i == 0 && isThisReturn) {
8409 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
8410 "unexpected return calling convention register assignment");
8411 InVals.push_back(ThisVal);
8412 continue;
8413 }
8414
8415 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
8416 // allows one use of a physreg per block.
8417 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
8418 if (!Val) {
8419 Val =
8420 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
8421 Chain = Val.getValue(1);
8422 InGlue = Val.getValue(2);
8423 CopiedRegs[VA.getLocReg()] = Val;
8424 }
8425
8426 switch (VA.getLocInfo()) {
8427 default:
8428 llvm_unreachable("Unknown loc info!");
8429 case CCValAssign::Full:
8430 break;
8431 case CCValAssign::BCvt:
8432 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
8433 break;
8434 case CCValAssign::AExtUpper:
8435 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
8436 DAG.getConstant(32, DL, VA.getLocVT()));
8437 [[fallthrough]];
8438 case CCValAssign::AExt:
8439 [[fallthrough]];
8440 case CCValAssign::ZExt:
8441 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
8442 break;
8443 }
8444
8445 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
8447 Val);
8448
8449 InVals.push_back(Val);
8450 }
8451
8452 return Chain;
8453}
8454
8455/// Return true if the calling convention is one that we can guarantee TCO for.
8456static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
8457 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
8458 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
8459 }
8460
8461/// Return true if we might ever do TCO for calls with this calling convention.
8462 static bool mayTailCallThisCC(CallingConv::ID CC) {
8463 switch (CC) {
8464 case CallingConv::C:
8469 case CallingConv::Swift:
8471 case CallingConv::Tail:
8472 case CallingConv::Fast:
8473 return true;
8474 default:
8475 return false;
8476 }
8477}
8478
8479 /// Return true if the call convention supports varargs.
8480 /// Currently only those that pass varargs like the C
8481 /// calling convention does are eligible.
8482/// Calling conventions listed in this function must also
8483/// be properly handled in AArch64Subtarget::isCallingConvWin64
8484 static bool callConvSupportsVarArgs(CallingConv::ID CC) {
8485 switch (CC) {
8486 case CallingConv::C:
8488 return true;
8489 default:
8490 return false;
8491 }
8492}
8493
8494 static void analyzeCallOperands(const AArch64TargetLowering &TLI,
8495 const AArch64Subtarget *Subtarget,
8496 const TargetLowering::CallLoweringInfo &CLI,
8497 CCState &CCInfo) {
8498 const SelectionDAG &DAG = CLI.DAG;
8499 CallingConv::ID CalleeCC = CLI.CallConv;
8500 bool IsVarArg = CLI.IsVarArg;
8501 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8502 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
8503
8504 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
8505 // for the shadow store.
8506 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
8507 CCInfo.AllocateStack(32, Align(16));
8508
8509 unsigned NumArgs = Outs.size();
8510 for (unsigned i = 0; i != NumArgs; ++i) {
8511 MVT ArgVT = Outs[i].VT;
8512 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
8513
8514 bool UseVarArgCC = false;
8515 if (IsVarArg) {
8516 // On Windows, the fixed arguments in a vararg call are passed in GPRs
8517 // too, so use the vararg CC to force them to integer registers.
8518 if (IsCalleeWin64) {
8519 UseVarArgCC = true;
8520 } else {
8521 UseVarArgCC = !Outs[i].IsFixed;
8522 }
8523 }
8524
8525 if (!UseVarArgCC) {
8526 // Get type of the original argument.
8527 EVT ActualVT =
8528 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
8529 /*AllowUnknown*/ true);
8530 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
8531 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8532 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8533 ArgVT = MVT::i8;
8534 else if (ActualMVT == MVT::i16)
8535 ArgVT = MVT::i16;
8536 }
8537
8538 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
8539 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
8540 assert(!Res && "Call operand has unhandled type");
8541 (void)Res;
8542 }
8543}
8544
8545bool AArch64TargetLowering::isEligibleForTailCallOptimization(
8546 const CallLoweringInfo &CLI) const {
8547 CallingConv::ID CalleeCC = CLI.CallConv;
8548 if (!mayTailCallThisCC(CalleeCC))
8549 return false;
8550
8551 SDValue Callee = CLI.Callee;
8552 bool IsVarArg = CLI.IsVarArg;
8553 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8554 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8555 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8556 const SelectionDAG &DAG = CLI.DAG;
8557 MachineFunction &MF = DAG.getMachineFunction();
8558 const Function &CallerF = MF.getFunction();
8559 CallingConv::ID CallerCC = CallerF.getCallingConv();
8560
8561 // SME Streaming functions are not eligible for TCO as they may require
8562 // the streaming mode or ZA to be restored after returning from the call.
8563 SMEAttrs CallerAttrs(MF.getFunction());
8564 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
8565 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
8566 CallerAttrs.requiresLazySave(CalleeAttrs) ||
8567 CallerAttrs.requiresPreservingAllZAState(CalleeAttrs) ||
8568 CallerAttrs.hasStreamingBody())
8569 return false;
8570
8571 // Functions using the C or Fast calling convention that have an SVE signature
8572 // preserve more registers and should assume the SVE_VectorCall CC.
8573 // The check for matching callee-saved regs will determine whether it is
8574 // eligible for TCO.
8575 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
8576 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
8577 CallerCC = CallingConv::AArch64_SVE_VectorCall;
8578
8579 bool CCMatch = CallerCC == CalleeCC;
8580
8581 // When using the Windows calling convention on a non-windows OS, we want
8582 // to back up and restore X18 in such functions; we can't do a tail call
8583 // from those functions.
8584 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
8585 CalleeCC != CallingConv::Win64)
8586 return false;
8587
8588 // Byval parameters hand the function a pointer directly into the stack area
8589 // we want to reuse during a tail call. Working around this *is* possible (see
8590 // X86) but less efficient and uglier in LowerCall.
8591 for (Function::const_arg_iterator i = CallerF.arg_begin(),
8592 e = CallerF.arg_end();
8593 i != e; ++i) {
8594 if (i->hasByValAttr())
8595 return false;
8596
8597 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
8598 // In this case, it is necessary to save/restore X0 in the callee. Tail
8599 // call opt interferes with this. So we disable tail call opt when the
8600 // caller has an argument with "inreg" attribute.
8601
8602 // FIXME: Check whether the callee also has an "inreg" argument.
8603 if (i->hasInRegAttr())
8604 return false;
8605 }
8606
8607 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
8608 return CCMatch;
8609
8610 // Externally-defined functions with weak linkage should not be
8611 // tail-called on AArch64 when the OS does not support dynamic
8612 // pre-emption of symbols, as the AAELF spec requires normal calls
8613 // to undefined weak functions to be replaced with a NOP or jump to the
8614 // next instruction. The behaviour of branch instructions in this
8615 // situation (as used for tail calls) is implementation-defined, so we
8616 // cannot rely on the linker replacing the tail call with a return.
8617 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8618 const GlobalValue *GV = G->getGlobal();
8619 const Triple &TT = getTargetMachine().getTargetTriple();
8620 if (GV->hasExternalWeakLinkage() &&
8621 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
8622 return false;
8623 }
8624
8625 // Now we search for cases where we can use a tail call without changing the
8626 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
8627 // concept.
8628
8629 // I want anyone implementing a new calling convention to think long and hard
8630 // about this assert.
8631 if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
8632 report_fatal_error("Unsupported variadic calling convention");
8633
8634 LLVMContext &C = *DAG.getContext();
8635 // Check that the call results are passed in the same way.
8636 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
8637 CCAssignFnForCall(CalleeCC, IsVarArg),
8638 CCAssignFnForCall(CallerCC, IsVarArg)))
8639 return false;
8640 // The callee has to preserve all registers the caller needs to preserve.
8641 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8642 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
8643 if (!CCMatch) {
8644 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
8645 if (Subtarget->hasCustomCallingConv()) {
8646 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
8647 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
8648 }
8649 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
8650 return false;
8651 }
8652
8653 // Nothing more to check if the callee is taking no arguments
8654 if (Outs.empty())
8655 return true;
8656
8656
8657 SmallVector<CCValAssign, 16> ArgLocs;
8658 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
8659
8660 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
8661
8662 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
8663 // When we are musttail, additional checks have been done and we can safely ignore this check.
8664 // At least two cases here: if caller is fastcc then we can't have any
8665 // memory arguments (we'd be expected to clean up the stack afterwards). If
8666 // caller is C then we could potentially use its argument area.
8667
8668 // FIXME: for now we take the most conservative of these in both cases:
8669 // disallow all variadic memory operands.
8670 for (const CCValAssign &ArgLoc : ArgLocs)
8671 if (!ArgLoc.isRegLoc())
8672 return false;
8673 }
8674
8675 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8676
8677 // If any of the arguments is passed indirectly, it must be SVE, so the
8678 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
8679 // allocate space on the stack. That is why we explicitly determine here
8680 // that such a call cannot be a tail call.
8681 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
8682 assert((A.getLocInfo() != CCValAssign::Indirect ||
8683 A.getValVT().isScalableVector() ||
8684 Subtarget->isWindowsArm64EC()) &&
8685 "Expected value to be scalable");
8686 return A.getLocInfo() == CCValAssign::Indirect;
8687 }))
8688 return false;
8689
8690 // If the stack arguments for this call do not fit into our own save area then
8691 // the call cannot be made tail.
8692 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
8693 return false;
8694
8695 const MachineRegisterInfo &MRI = MF.getRegInfo();
8696 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
8697 return false;
8698
8699 return true;
8700}
8701
8702SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
8703 SelectionDAG &DAG,
8704 MachineFrameInfo &MFI,
8705 int ClobberedFI) const {
8706 SmallVector<SDValue, 8> ArgChains;
8707 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
8708 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
8709
8710 // Include the original chain at the beginning of the list. When this is
8711 // used by target LowerCall hooks, this helps legalize find the
8712 // CALLSEQ_BEGIN node.
8713 ArgChains.push_back(Chain);
8714
8715 // Add a chain value for each stack argument corresponding
8716 for (SDNode *U : DAG.getEntryNode().getNode()->users())
8717 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
8718 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
8719 if (FI->getIndex() < 0) {
8720 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
8721 int64_t InLastByte = InFirstByte;
8722 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
8723
8724 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
8725 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
8726 ArgChains.push_back(SDValue(L, 1));
8727 }
8728
8729 // Build a tokenfactor for all the chains.
8730 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
8731}
8732
8733bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
8734 bool TailCallOpt) const {
8735 return (CallCC == CallingConv::Fast && TailCallOpt) ||
8736 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
8737}
8738
8739// Check if the value is zero-extended from i1 to i8
8740static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
8741 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
8742 if (SizeInBits < 8)
8743 return false;
8744
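// The value is considered zero-extended from i1 when bits 1-7 of the low
// byte (mask 0xFE) are known to be zero.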
8745 APInt RequiredZero(SizeInBits, 0xFE);
8746 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
8747 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
8748 return ZExtBool;
8749}
8750
8751// The FORM_TRANSPOSED_REG_TUPLE pseudo should only be used if the
8752// input operands are copy nodes where the source register is in a
8753// StridedOrContiguous class. For example:
8754//
8755// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
8756// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
8757// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
8758// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
8759// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
8760// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
8761// %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
8762//
8764 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
8765
8766 const TargetRegisterClass *RegClass = nullptr;
8767 switch (MI.getOpcode()) {
8768 case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO:
8769 RegClass = &AArch64::ZPR2StridedOrContiguousRegClass;
8770 break;
8771 case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO:
8772 RegClass = &AArch64::ZPR4StridedOrContiguousRegClass;
8773 break;
8774 default:
8775 llvm_unreachable("Unexpected opcode.");
8776 }
8777
8779 for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
8780 MachineOperand &MO = MI.getOperand(I);
8781 assert(MO.isReg() && "Unexpected operand to FORM_TRANSPOSED_REG_TUPLE");
8782
8783 MachineOperand *Def = MRI.getOneDef(MO.getReg());
8784 if (!Def || !Def->getParent()->isCopy())
8785 return false;
8786
8787 const MachineOperand &CopySrc = Def->getParent()->getOperand(1);
8788 unsigned OpSubReg = CopySrc.getSubReg();
8790 SubReg = OpSubReg;
8791
8792 MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg());
8793 if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg ||
8794 MRI.getRegClass(CopySrcOp->getReg()) != RegClass)
8795 return false;
8796 }
8797
8798 return true;
8799}
8800
8801void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
8802 SDNode *Node) const {
8803 // Live-in physreg copies that are glued to SMSTART are applied as
8804 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
8805 // register allocator to pass call args in callee saved regs, without extra
8806 // copies to avoid these fake clobbers of actually-preserved GPRs.
8807 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
8808 MI.getOpcode() == AArch64::MSRpstatePseudo) {
8809 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
8810 if (MachineOperand &MO = MI.getOperand(I);
8811 MO.isReg() && MO.isImplicit() && MO.isDef() &&
8812 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
8813 AArch64::GPR64RegClass.contains(MO.getReg())))
8814 MI.removeOperand(I);
8815
8816 // The SVE vector length can change when entering/leaving streaming mode.
8817 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
8818 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
8819 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
8820 /*IsImplicit=*/true));
8821 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
8822 /*IsImplicit=*/true));
8823 }
8824 }
8825
8826 if (MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
8827 MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
8828 // If input values to the FORM_TRANSPOSED_REG_TUPLE pseudo aren't copies
8829 // from a StridedOrContiguous class, fall back on REG_SEQUENCE node.
8831 return;
8832
8833 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
8834 MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
8835 TII->get(TargetOpcode::REG_SEQUENCE),
8836 MI.getOperand(0).getReg());
8837
8838 for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
8839 MIB.add(MI.getOperand(I));
8840 MIB.addImm(AArch64::zsub0 + (I - 1));
8841 }
8842
8843 MI.eraseFromParent();
8844 return;
8845 }
8846
8847 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
8848 // have nothing to do with VG, were it not that they are used to materialise a
8849 // frame-address. If they contain a frame-index to a scalable vector, this
8850 // will likely require an ADDVL instruction to materialise the address, thus
8851 // reading VG.
8852 const MachineFunction &MF = *MI.getMF();
8854 (MI.getOpcode() == AArch64::ADDXri ||
8855 MI.getOpcode() == AArch64::SUBXri)) {
8856 const MachineOperand &MO = MI.getOperand(1);
8857 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
8859 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
8860 /*IsImplicit=*/true));
8861 }
8862}
8863
8864 SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
8865 bool Enable, SDValue Chain,
8866 SDValue InGlue,
8867 unsigned Condition,
8868 SDValue PStateSM) const {
8869 MachineFunction &MF = DAG.getMachineFunction();
8870 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8871 FuncInfo->setHasStreamingModeChanges(true);
8872
8873 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8874 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
8875 SDValue MSROp =
8876 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
8877 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
8878 SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
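// For a conditional SMSTART/SMSTOP the current PSTATE.SM value is passed as
// an extra operand so the pseudo expansion can test it at run time.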
8879 if (Condition != AArch64SME::Always) {
8880 assert(PStateSM && "PStateSM should be defined");
8881 Ops.push_back(PStateSM);
8882 }
8883 Ops.push_back(RegMask);
8884
8885 if (InGlue)
8886 Ops.push_back(InGlue);
8887
8888 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
8889 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
8890}
8891
8892// Emit a call to __arm_sme_save or __arm_sme_restore.
8894 SelectionDAG &DAG,
8896 SDValue Chain, bool IsSave) {
8899 FuncInfo->setSMESaveBufferUsed();
8900
8903 Entry.Ty = PointerType::getUnqual(*DAG.getContext());
8904 Entry.Node =
8905 DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64);
8906 Args.push_back(Entry);
8907
8908 SDValue Callee =
8909 DAG.getExternalSymbol(IsSave ? "__arm_sme_save" : "__arm_sme_restore",
8910 TLI.getPointerTy(DAG.getDataLayout()));
8911 auto *RetTy = Type::getVoidTy(*DAG.getContext());
8913 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8915 Callee, std::move(Args));
8916 return TLI.LowerCallTo(CLI).second;
8917}
8918
8919static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
8920 const SMEAttrs &CalleeAttrs) {
8921 if (!CallerAttrs.hasStreamingCompatibleInterface() ||
8922 CallerAttrs.hasStreamingBody())
8923 return AArch64SME::Always;
8924 if (CalleeAttrs.hasNonStreamingInterface())
8926 if (CalleeAttrs.hasStreamingInterface())
8928
8929 llvm_unreachable("Unsupported attributes");
8930}
8931
8932/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
8933/// and add input and output parameter nodes.
8934SDValue
8935AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
8936 SmallVectorImpl<SDValue> &InVals) const {
8937 SelectionDAG &DAG = CLI.DAG;
8938 SDLoc &DL = CLI.DL;
8939 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8940 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8942 SDValue Chain = CLI.Chain;
8943 SDValue Callee = CLI.Callee;
8944 bool &IsTailCall = CLI.IsTailCall;
8945 CallingConv::ID &CallConv = CLI.CallConv;
8946 bool IsVarArg = CLI.IsVarArg;
8947
8950 bool IsThisReturn = false;
8951
8953 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8954 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
8955 bool IsSibCall = false;
8956 bool GuardWithBTI = false;
8957
8958 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
8959 !Subtarget->noBTIAtReturnTwice()) {
8960 GuardWithBTI = FuncInfo->branchTargetEnforcement();
8961 }
8962
8963 // Analyze operands of the call, assigning locations to each operand.
8964 SmallVector<CCValAssign, 16> ArgLocs;
8965 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
8966
8967 if (IsVarArg) {
8968 unsigned NumArgs = Outs.size();
8969
8970 for (unsigned i = 0; i != NumArgs; ++i) {
8971 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
8972 report_fatal_error("Passing SVE types to variadic functions is "
8973 "currently not supported");
8974 }
8975 }
8976
8977 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
8978
8979 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8980 // Assign locations to each value returned by this call.
8981 SmallVector<CCValAssign, 16> RVLocs;
8982 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
8983 *DAG.getContext());
8984 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
8985
8986 // Check callee args/returns for SVE registers and set calling convention
8987 // accordingly.
8988 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
8989 auto HasSVERegLoc = [](CCValAssign &Loc) {
8990 if (!Loc.isRegLoc())
8991 return false;
8992 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
8993 AArch64::PPRRegClass.contains(Loc.getLocReg());
8994 };
8995 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
8996 CallConv = CallingConv::AArch64_SVE_VectorCall;
8997 }
8998
8999 if (IsTailCall) {
9000 // Check if it's really possible to do a tail call.
9001 IsTailCall = isEligibleForTailCallOptimization(CLI);
9002
9003 // A sibling call is one where we're under the usual C ABI and not planning
9004 // to change that but can still do a tail call:
9005 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
9006 CallConv != CallingConv::SwiftTail)
9007 IsSibCall = true;
9008
9009 if (IsTailCall)
9010 ++NumTailCalls;
9011 }
9012
9013 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9014 report_fatal_error("failed to perform tail call elimination on a call "
9015 "site marked musttail");
9016
9017 // Get a count of how many bytes are to be pushed on the stack.
9018 unsigned NumBytes = CCInfo.getStackSize();
9019
9020 if (IsSibCall) {
9021 // Since we're not changing the ABI to make this a tail call, the memory
9022 // operands are already available in the caller's incoming argument space.
9023 NumBytes = 0;
9024 }
9025
9026 // FPDiff is the byte offset of the call's argument area from the callee's.
9027 // Stores to callee stack arguments will be placed in FixedStackSlots offset
9028 // by this amount for a tail call. In a sibling call it must be 0 because the
9029 // caller will deallocate the entire stack and the callee still expects its
9030 // arguments to begin at SP+0. Completely unused for non-tail calls.
9031 int FPDiff = 0;
9032
9033 if (IsTailCall && !IsSibCall) {
9034 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9035
9036 // Since callee will pop argument stack as a tail call, we must keep the
9037 // popped size 16-byte aligned.
9038 NumBytes = alignTo(NumBytes, 16);
9039
9040 // FPDiff will be negative if this tail call requires more space than we
9041 // would automatically have in our incoming argument space. Positive if we
9042 // can actually shrink the stack.
9043 FPDiff = NumReusableBytes - NumBytes;
9044
9045 // Update the required reserved area if this is the tail call requiring the
9046 // most argument stack space.
9047 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9048 FuncInfo->setTailCallReservedStack(-FPDiff);
9049
9050 // The stack pointer must be 16-byte aligned at all times it's used for a
9051 // memory operation, which in practice means at *all* times and in
9052 // particular across call boundaries. Therefore our own arguments started at
9053 // a 16-byte aligned SP and the delta applied for the tail call should
9054 // satisfy the same constraint.
9055 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9056 }
9057
9058 // Determine whether we need any streaming mode changes.
9059 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
9060 if (CLI.CB)
9061 CalleeAttrs = SMEAttrs(*CLI.CB);
9062 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9063 CalleeAttrs = SMEAttrs(ES->getSymbol());
9064
9065 auto DescribeCallsite =
9067 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9068 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9069 R << ore::NV("Callee", ES->getSymbol());
9070 else if (CLI.CB && CLI.CB->getCalledFunction())
9071 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9072 else
9073 R << "unknown callee";
9074 R << "'";
9075 return R;
9076 };
9077
9078 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
9079 bool RequiresSaveAllZA =
9080 CallerAttrs.requiresPreservingAllZAState(CalleeAttrs);
9081 if (RequiresLazySave) {
9082 const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
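// The 16-byte TPIDR2 block holds the save-buffer pointer in bytes 0-7 and the
// number of ZA save slices as a 16-bit value at offset 8; the slice count
// (RDSVL #1, i.e. SVL in bytes) is written below via a truncating i16 store.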
9083 MachinePointerInfo MPI =
9085 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
9086 TPIDR2.FrameIndex,
9088 SDValue NumZaSaveSlicesAddr =
9089 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
9090 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
9091 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
9092 DAG.getConstant(1, DL, MVT::i32));
9093 Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
9094 MPI, MVT::i16);
9095 Chain = DAG.getNode(
9096 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
9097 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9098 TPIDR2ObjAddr);
9099 OptimizationRemarkEmitter ORE(&MF.getFunction());
9100 ORE.emit([&]() {
9101 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9102 CLI.CB)
9103 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9104 &MF.getFunction());
9105 return DescribeCallsite(R) << " sets up a lazy save for ZA";
9106 });
9107 } else if (RequiresSaveAllZA) {
9108 assert(!CalleeAttrs.hasSharedZAInterface() &&
9109 "Cannot share state that may not exist");
9110 Chain = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain,
9111 /*IsSave=*/true);
9112 }
9113
9114 SDValue PStateSM;
9115 bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
9116 if (RequiresSMChange) {
9117 if (CallerAttrs.hasStreamingInterfaceOrBody())
9118 PStateSM = DAG.getConstant(1, DL, MVT::i64);
9119 else if (CallerAttrs.hasNonStreamingInterface())
9120 PStateSM = DAG.getConstant(0, DL, MVT::i64);
9121 else
9122 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
9123 OptimizationRemarkEmitter ORE(&MF.getFunction());
9124 ORE.emit([&]() {
9125 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9126 CLI.CB)
9127 : OptimizationRemarkAnalysis("sme", "SMETransition",
9128 &MF.getFunction());
9129 DescribeCallsite(R) << " requires a streaming mode transition";
9130 return R;
9131 });
9132 }
9133
9134 SDValue ZTFrameIdx;
9135 MachineFrameInfo &MFI = MF.getFrameInfo();
9136 bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs);
9137
9138 // If the caller has ZT0 state which will not be preserved by the callee,
9139 // spill ZT0 before the call.
9140 if (ShouldPreserveZT0) {
9141 unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
9142 ZTFrameIdx = DAG.getFrameIndex(
9143 ZTObj,
9145
9146 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
9147 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9148 }
9149
9150 // If the caller shares ZT0 but the callee does not share ZA, we need to stop
9151 // PSTATE.ZA before the call if there is no lazy-save active.
9152 bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs);
9153 assert((!DisableZA || !RequiresLazySave) &&
9154 "Lazy-save should have PSTATE.SM=1 on entry to the function");
9155
9156 if (DisableZA)
9157 Chain = DAG.getNode(
9158 AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
9159 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
9160 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
9161
9162 // Adjust the stack pointer for the new arguments...
9163 // These operations are automatically eliminated by the prolog/epilog pass
9164 if (!IsSibCall)
9165 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
9166
9167 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
9168 getPointerTy(DAG.getDataLayout()));
9169
9170 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
9171 SmallSet<unsigned, 8> RegsUsed;
9172 SmallVector<SDValue, 8> MemOpChains;
9173 auto PtrVT = getPointerTy(DAG.getDataLayout());
9174
9175 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
9176 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
9177 for (const auto &F : Forwards) {
9178 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
9179 RegsToPass.emplace_back(F.PReg, Val);
9180 }
9181 }
9182
9183 // Walk the register/memloc assignments, inserting copies/loads.
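// Note that ExtraArgLocs accounts for the tuple parts of an indirectly passed
// argument: those consume several Outs/OutVals entries but only a single
// CCValAssign, so ArgLocs is indexed with i - ExtraArgLocs below.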
9184 unsigned ExtraArgLocs = 0;
9185 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
9186 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
9187 SDValue Arg = OutVals[i];
9188 ISD::ArgFlagsTy Flags = Outs[i].Flags;
9189
9190 // Promote the value if needed.
9191 switch (VA.getLocInfo()) {
9192 default:
9193 llvm_unreachable("Unknown loc info!");
9194 case CCValAssign::Full:
9195 break;
9196 case CCValAssign::SExt:
9197 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
9198 break;
9199 case CCValAssign::ZExt:
9200 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9201 break;
9202 case CCValAssign::AExt:
9203 if (Outs[i].ArgVT == MVT::i1) {
9204 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
9205 //
9206 // Check if we actually have to do this, because the value may
9207 // already be zero-extended.
9208 //
9209 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
9210 // and rely on DAGCombiner to fold this, because the following
9211 // (anyext i32) is combined with (zext i8) in DAG.getNode:
9212 //
9213 // (ext (zext x)) -> (zext x)
9214 //
9215 // This will give us (zext i32), which we cannot remove, so
9216 // try to check this beforehand.
9217 if (!checkZExtBool(Arg, DAG)) {
9218 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9219 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
9220 }
9221 }
9222 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9223 break;
9224 case CCValAssign::AExtUpper:
9225 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9226 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9227 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9228 DAG.getConstant(32, DL, VA.getLocVT()));
9229 break;
9230 case CCValAssign::BCvt:
9231 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
9232 break;
9233 case CCValAssign::Trunc:
9234 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9235 break;
9236 case CCValAssign::FPExt:
9237 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
9238 break;
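// Indirect arguments (typically scalable vectors, or large values on Arm64EC)
// are passed by pointer: the value is spilled to a stack slot below and the
// address of that slot is what is actually passed. Tuple parts living in
// consecutive registers are stored piecewise at vscale-scaled offsets.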
9239 case CCValAssign::Indirect:
9240 bool isScalable = VA.getValVT().isScalableVT();
9241 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
9242 "Indirect arguments should be scalable on most subtargets");
9243
9244 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
9245 uint64_t PartSize = StoreSize;
9246 unsigned NumParts = 1;
9247 if (Outs[i].Flags.isInConsecutiveRegs()) {
9248 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
9249 ++NumParts;
9250 StoreSize *= NumParts;
9251 }
9252
9253 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
9254 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
9255 MachineFrameInfo &MFI = MF.getFrameInfo();
9256 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
9257 if (isScalable)
9258 MFI.setStackID(FI, TargetStackID::ScalableVector);
9259
9260 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
9261 SDValue Ptr = DAG.getFrameIndex(
9262 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9263 SDValue SpillSlot = Ptr;
9264
9265 // Ensure we generate all stores for each tuple part, whilst updating the
9266 // pointer after each store correctly using vscale.
9267 while (NumParts) {
9268 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
9269 MemOpChains.push_back(Store);
9270
9271 NumParts--;
9272 if (NumParts > 0) {
9273 SDValue BytesIncrement;
9274 if (isScalable) {
9275 BytesIncrement = DAG.getVScale(
9276 DL, Ptr.getValueType(),
9277 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
9278 } else {
9279 BytesIncrement = DAG.getConstant(
9280 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
9281 Ptr.getValueType());
9282 }
9283 MPI = MachinePointerInfo(MPI.getAddrSpace());
9284 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9285 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
9286 ExtraArgLocs++;
9287 i++;
9288 }
9289 }
9290
9291 Arg = SpillSlot;
9292 break;
9293 }
9294
9295 if (VA.isRegLoc()) {
9296 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
9297 Outs[0].VT == MVT::i64) {
9298 assert(VA.getLocVT() == MVT::i64 &&
9299 "unexpected calling convention register assignment");
9300 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
9301 "unexpected use of 'returned'");
9302 IsThisReturn = true;
9303 }
9304 if (RegsUsed.count(VA.getLocReg())) {
9305 // If this register has already been used then we're trying to pack
9306 // parts of an [N x i32] into an X-register. The extension type will
9307 // take care of putting the two halves in the right place but we have to
9308 // combine them.
9309 SDValue &Bits =
9310 llvm::find_if(RegsToPass,
9311 [=](const std::pair<unsigned, SDValue> &Elt) {
9312 return Elt.first == VA.getLocReg();
9313 })
9314 ->second;
9315 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9316 // Call site info is used for function's parameter entry value
9317 // tracking. For now we track only the simple cases, where the parameter
9318 // is transferred in a whole register.
9319 llvm::erase_if(CSInfo.ArgRegPairs,
9320 [&VA](MachineFunction::ArgRegPair ArgReg) {
9321 return ArgReg.Reg == VA.getLocReg();
9322 });
9323 } else {
9324 // Add an extra level of indirection for streaming mode changes by
9325 // using a pseudo copy node that cannot be rematerialised by the simple
9326 // register coalescer between a smstart/smstop and the call.
9327 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
9328 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9329 Arg.getValueType(), Arg);
9330 RegsToPass.emplace_back(VA.getLocReg(), Arg);
9331 RegsUsed.insert(VA.getLocReg());
9332 const TargetOptions &Options = DAG.getTarget().Options;
9333 if (Options.EmitCallSiteInfo)
9334 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
9335 }
9336 } else {
9337 assert(VA.isMemLoc());
9338
9339 SDValue DstAddr;
9340 MachinePointerInfo DstInfo;
9341
9342 // FIXME: This works on big-endian for composite byvals, which are the
9343 // common case. It should also work for fundamental types.
9344 uint32_t BEAlign = 0;
9345 unsigned OpSize;
9346 if (VA.getLocInfo() == CCValAssign::Indirect ||
9347 VA.getValVT().isScalableVT())
9348 OpSize = VA.getLocVT().getFixedSizeInBits();
9349 else
9350 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
9351 : VA.getValVT().getSizeInBits();
9352 OpSize = (OpSize + 7) / 8;
9353 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
9354 !Flags.isInConsecutiveRegs()) {
9355 if (OpSize < 8)
9356 BEAlign = 8 - OpSize;
9357 }
9358 unsigned LocMemOffset = VA.getLocMemOffset();
9359 int32_t Offset = LocMemOffset + BEAlign;
9360 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
9361 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
9362
9363 if (IsTailCall) {
9364 Offset = Offset + FPDiff;
9365 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
9366
9367 DstAddr = DAG.getFrameIndex(FI, PtrVT);
9368 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
9369
9370 // Make sure any stack arguments overlapping with where we're storing
9371 // are loaded before this eventual operation. Otherwise they'll be
9372 // clobbered.
9373 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
9374 } else {
9375 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
9376
9377 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
9378 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
9379 }
9380
9381 if (Outs[i].Flags.isByVal()) {
9382 SDValue SizeNode =
9383 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
9384 SDValue Cpy = DAG.getMemcpy(
9385 Chain, DL, DstAddr, Arg, SizeNode,
9386 Outs[i].Flags.getNonZeroByValAlign(),
9387 /*isVol = */ false, /*AlwaysInline = */ false,
9388 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
9389
9390 MemOpChains.push_back(Cpy);
9391 } else {
9392 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
9393 // promoted to a legal register type i32, we should truncate Arg back to
9394 // i1/i8/i16.
9395 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
9396 VA.getValVT() == MVT::i16)
9397 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
9398
9399 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
9400 MemOpChains.push_back(Store);
9401 }
9402 }
9403 }
9404
9405 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
9406 SDValue ParamPtr = StackPtr;
9407 if (IsTailCall) {
9408 // Create a dummy object at the top of the stack that can be used to get
9409 // the SP after the epilogue
9410 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
9411 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
9412 }
9413
9414 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
9415 // describing the argument list. x4 contains the address of the
9416 // first stack parameter. x5 contains the size in bytes of all parameters
9417 // passed on the stack.
9418 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
9419 RegsToPass.emplace_back(AArch64::X5,
9420 DAG.getConstant(NumBytes, DL, MVT::i64));
9421 }
9422
9423 if (!MemOpChains.empty())
9424 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
9425
9426 SDValue InGlue;
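// When the call requires a streaming-mode change, the transition is bracketed
// by VG_SAVE/VG_RESTORE pseudos (except on Darwin without SVE); as the names
// suggest, these mark where the vector granule needs to be saved and restored
// around the mode switch, since its value can differ between streaming and
// non-streaming mode.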
9427 if (RequiresSMChange) {
9428 if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
9429 Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL,
9430 DAG.getVTList(MVT::Other, MVT::Glue), Chain);
9431 InGlue = Chain.getValue(1);
9432 }
9433
9434 SDValue NewChain = changeStreamingMode(
9435 DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
9436 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
9437 Chain = NewChain.getValue(0);
9438 InGlue = NewChain.getValue(1);
9439 }
9440
9441 // Build a sequence of copy-to-reg nodes chained together with token chain
9442 // and flag operands which copy the outgoing args into the appropriate regs.
9443 for (auto &RegToPass : RegsToPass) {
9444 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
9445 RegToPass.second, InGlue);
9446 InGlue = Chain.getValue(1);
9447 }
9448
9449 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
9450 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
9451 // node so that legalize doesn't hack it.
9452 const GlobalValue *CalledGlobal = nullptr;
9453 unsigned OpFlags = 0;
9454 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9455 CalledGlobal = G->getGlobal();
9456 OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
9457 getTargetMachine());
9458 if (OpFlags & AArch64II::MO_GOT) {
9459 Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
9460 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9461 } else {
9462 const GlobalValue *GV = G->getGlobal();
9463 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
9464 }
9465 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
9466 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
9467 Subtarget->isTargetMachO()) ||
9468 MF.getFunction().getParent()->getRtLibUseGOT();
9469 const char *Sym = S->getSymbol();
9470 if (UseGot) {
9471 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
9472 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9473 } else {
9474 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
9475 }
9476 }
9477
9478 // We don't usually want to end the call-sequence here because we would tidy
9479 // the frame up *after* the call; however, in the ABI-changing tail-call case
9480 // we've carefully laid out the parameters so that when sp is reset they'll be
9481 // in the correct location.
9482 if (IsTailCall && !IsSibCall) {
9483 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
9484 InGlue = Chain.getValue(1);
9485 }
9486
9487 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
9488
9489 std::vector<SDValue> Ops;
9490 Ops.push_back(Chain);
9491 Ops.push_back(Callee);
9492
9493 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
9494 // be expanded to the call, directly followed by a special marker sequence and
9495 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
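// For example (illustrative only), on AArch64 the marker sequence emitted
// after the call is a "mov x29, x29" immediately followed by a call to the
// retainRV/claimRV runtime function named by the bundle.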
9496 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
9497 assert(!IsTailCall &&
9498 "tail calls cannot be marked with clang.arc.attachedcall");
9499 Opc = AArch64ISD::CALL_RVMARKER;
9500
9501 // Add a target global address for the retainRV/claimRV runtime function
9502 // just before the call target.
9503 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
9504 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
9505 Ops.insert(Ops.begin() + 1, GA);
9506 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9507 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
9508 } else if (GuardWithBTI) {
9509 Opc = AArch64ISD::CALL_BTI;
9510 }
9511
9512 if (IsTailCall) {
9513 // Each tail call may have to adjust the stack by a different amount, so
9514 // this information must travel along with the operation for eventual
9515 // consumption by emitEpilogue.
9516 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
9517 }
9518
9519 if (CLI.PAI) {
9520 const uint64_t Key = CLI.PAI->Key;
9521 assert((Key == AArch64PACKey::IA || Key == AArch64PACKey::IB) &&
9522 "Invalid auth call key");
9523
9524 // Split the discriminator into address/integer components.
9525 SDValue AddrDisc, IntDisc;
9526 std::tie(IntDisc, AddrDisc) =
9527 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
9528
9529 if (Opc == AArch64ISD::CALL_RVMARKER)
9530 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
9531 else
9532 Opc = AArch64ISD::AUTH_CALL;
9533 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
9534 Ops.push_back(IntDisc);
9535 Ops.push_back(AddrDisc);
9536 }
9537
9538 // Add argument registers to the end of the list so that they are known live
9539 // into the call.
9540 for (auto &RegToPass : RegsToPass)
9541 Ops.push_back(DAG.getRegister(RegToPass.first,
9542 RegToPass.second.getValueType()));
9543
9544 // Add a register mask operand representing the call-preserved registers.
9545 const uint32_t *Mask;
9546 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9547 if (IsThisReturn) {
9548 // For 'this' returns, use the X0-preserving mask if applicable
9549 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
9550 if (!Mask) {
9551 IsThisReturn = false;
9552 Mask = TRI->getCallPreservedMask(MF, CallConv);
9553 }
9554 } else
9555 Mask = TRI->getCallPreservedMask(MF, CallConv);
9556
9557 if (Subtarget->hasCustomCallingConv())
9558 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
9559
9560 if (TRI->isAnyArgRegReserved(MF))
9561 TRI->emitReservedArgRegCallError(MF);
9562
9563 assert(Mask && "Missing call preserved mask for calling convention");
9564 Ops.push_back(DAG.getRegisterMask(Mask));
9565
9566 if (InGlue.getNode())
9567 Ops.push_back(InGlue);
9568
9569 // If we're doing a tail call, use a TC_RETURN here rather than an
9570 // actual call instruction.
9571 if (IsTailCall) {
9572 MF.getFrameInfo().setHasTailCall();
9573 SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
9574 if (IsCFICall)
9575 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9576
9577 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
9578 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
9579 if (CalledGlobal)
9580 DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
9581 return Ret;
9582 }
9583
9584 // Returns a chain and a flag for retval copy to use.
9585 Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
9586 if (IsCFICall)
9587 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9588
9589 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
9590 InGlue = Chain.getValue(1);
9591 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
9592 if (CalledGlobal)
9593 DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
9594
9595 uint64_t CalleePopBytes =
9596 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
9597
9598 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
9599 InGlue = Chain.getValue(1);
9600
9601 // Handle result values, copying them out of physregs into vregs that we
9602 // return.
9603 SDValue Result = LowerCallResult(
9604 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
9605 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
9606
9607 if (!Ins.empty())
9608 InGlue = Result.getValue(Result->getNumValues() - 1);
9609
9610 if (RequiresSMChange) {
9611 assert(PStateSM && "Expected a PStateSM to be set");
9612 Result = changeStreamingMode(
9613 DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
9614 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
9615
9616 if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
9617 InGlue = Result.getValue(1);
9618 Result =
9619 DAG.getNode(AArch64ISD::VG_RESTORE, DL,
9620 DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue});
9621 }
9622 }
9623
9624 if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
9625 // Unconditionally resume ZA.
9626 Result = DAG.getNode(
9627 AArch64ISD::SMSTART, DL, MVT::Other, Result,
9628 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
9629 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
9630
9631 if (ShouldPreserveZT0)
9632 Result =
9633 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
9634 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9635
9636 if (RequiresLazySave) {
9637 // Conditionally restore the lazy save using a pseudo node.
9638 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9639 SDValue RegMask = DAG.getRegisterMask(
9640 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
9641 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
9642 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
9643 SDValue TPIDR2_EL0 = DAG.getNode(
9644 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
9645 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
9646
9647 // Copy the address of the TPIDR2 block into X0 before 'calling' the
9648 // RESTORE_ZA pseudo.
9649 SDValue Glue;
9650 SDValue TPIDR2Block = DAG.getFrameIndex(
9651 TPIDR2.FrameIndex,
9652 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9653 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
9654 Result =
9655 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
9656 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
9657 RestoreRoutine, RegMask, Result.getValue(1)});
9658
9659 // Finally reset the TPIDR2_EL0 register to 0.
9660 Result = DAG.getNode(
9661 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
9662 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9663 DAG.getConstant(0, DL, MVT::i64));
9664 TPIDR2.Uses++;
9665 } else if (RequiresSaveAllZA) {
9666 Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
9667 /*IsSave=*/false);
9668 }
9669
9670 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
9671 RequiresSaveAllZA) {
9672 for (unsigned I = 0; I < InVals.size(); ++I) {
9673 // The smstart/smstop is chained as part of the call, but when the
9674 // resulting chain is discarded (which happens when the call is not part
9675 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
9676 // smstart/smstop is chained to the result value. We can do that by doing
9677 // a vreg -> vreg copy.
9678 Register Reg = MF.getRegInfo().createVirtualRegister(
9679 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
9680 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
9681 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
9682 InVals[I].getValueType());
9683 }
9684 }
9685
9686 if (CallConv == CallingConv::PreserveNone) {
9687 for (const ISD::OutputArg &O : Outs) {
9688 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
9689 O.Flags.isSwiftAsync()) {
9690 MF.getFunction().getContext().diagnose(
9691 DiagnosticInfoUnsupported(
9692 MF.getFunction(),
9693 "Swift attributes can't be used with preserve_none",
9694 DL.getDebugLoc()));
9695 break;
9696 }
9697 }
9698 }
9699
9700 return Result;
9701}
9702
9703bool AArch64TargetLowering::CanLowerReturn(
9704 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
9705 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
9706 const Type *RetTy) const {
9707 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9708 SmallVector<CCValAssign, 16> RVLocs;
9709 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
9710 return CCInfo.CheckReturn(Outs, RetCC);
9711}
9712
9713SDValue
9714AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
9715 bool isVarArg,
9716 const SmallVectorImpl<ISD::OutputArg> &Outs,
9717 const SmallVectorImpl<SDValue> &OutVals,
9718 const SDLoc &DL, SelectionDAG &DAG) const {
9719 auto &MF = DAG.getMachineFunction();
9720 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9721
9722 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9723 SmallVector<CCValAssign, 16> RVLocs;
9724 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
9725 CCInfo.AnalyzeReturn(Outs, RetCC);
9726
9727 // Copy the result values into the output registers.
9728 SDValue Glue;
9729 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
9730 SmallSet<unsigned, 4> RegsUsed;
9731 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
9732 ++i, ++realRVLocIdx) {
9733 CCValAssign &VA = RVLocs[i];
9734 assert(VA.isRegLoc() && "Can only return in registers!");
9735 SDValue Arg = OutVals[realRVLocIdx];
9736
9737 switch (VA.getLocInfo()) {
9738 default:
9739 llvm_unreachable("Unknown loc info!");
9740 case CCValAssign::Full:
9741 if (Outs[i].ArgVT == MVT::i1) {
9742 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
9743 // value. This is strictly redundant on Darwin (which uses "zeroext
9744 // i1"), but will be optimised out before ISel.
9745 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9746 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9747 }
9748 break;
9749 case CCValAssign::BCvt:
9750 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
9751 break;
9752 case CCValAssign::AExt:
9753 case CCValAssign::ZExt:
9754 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9755 break;
9756 case CCValAssign::AExtUpper:
9757 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9758 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9759 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9760 DAG.getConstant(32, DL, VA.getLocVT()));
9761 break;
9762 }
9763
9764 if (RegsUsed.count(VA.getLocReg())) {
9765 SDValue &Bits =
9766 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
9767 return Elt.first == VA.getLocReg();
9768 })->second;
9769 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9770 } else {
9771 RetVals.emplace_back(VA.getLocReg(), Arg);
9772 RegsUsed.insert(VA.getLocReg());
9773 }
9774 }
9775
9776 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9777
9778 // Emit SMSTOP before returning from a locally streaming function
9779 SMEAttrs FuncAttrs(MF.getFunction());
9780 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
9781 if (FuncAttrs.hasStreamingCompatibleInterface()) {
9782 Register Reg = FuncInfo->getPStateSMReg();
9783 assert(Reg.isValid() && "PStateSM Register is invalid");
9784 SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
9785 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9786 /*Glue*/ SDValue(),
9787 AArch64SME::IfCallerIsNonStreaming, PStateSM);
9788 } else
9789 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9790 /*Glue*/ SDValue(), AArch64SME::Always);
9791 Glue = Chain.getValue(1);
9792 }
9793
9794 SmallVector<SDValue, 4> RetOps(1, Chain);
9795 for (auto &RetVal : RetVals) {
9796 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
9797 isPassedInFPR(RetVal.second.getValueType()))
9798 RetVal.second = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9799 RetVal.second.getValueType(), RetVal.second);
9800 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
9801 Glue = Chain.getValue(1);
9802 RetOps.push_back(
9803 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
9804 }
9805
9806 // Windows AArch64 ABIs require that for returning structs by value we copy
9807 // the sret argument into X0 for the return.
9808 // We saved the argument into a virtual register in the entry block,
9809 // so now we copy the value out and into X0.
9810 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
9811 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
9812 getPointerTy(MF.getDataLayout()));
9813
9814 unsigned RetValReg = AArch64::X0;
9815 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
9816 RetValReg = AArch64::X8;
9817 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
9818 Glue = Chain.getValue(1);
9819
9820 RetOps.push_back(
9821 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
9822 }
9823
9824 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
9825 if (I) {
9826 for (; *I; ++I) {
9827 if (AArch64::GPR64RegClass.contains(*I))
9828 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
9829 else if (AArch64::FPR64RegClass.contains(*I))
9830 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
9831 else
9832 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
9833 }
9834 }
9835
9836 RetOps[0] = Chain; // Update chain.
9837
9838 // Add the glue if we have it.
9839 if (Glue.getNode())
9840 RetOps.push_back(Glue);
9841
9842 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9843 // ARM64EC entry thunks use a special return sequence: instead of a regular
9844 // "ret" instruction, they need to explicitly call the emulator.
9845 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9846 SDValue Arm64ECRetDest =
9847 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
9848 Arm64ECRetDest =
9849 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
9850 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
9851 MachinePointerInfo());
9852 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
9853 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
9854 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
9855 }
9856
9857 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
9858}
9859
9860//===----------------------------------------------------------------------===//
9861// Other Lowering Code
9862//===----------------------------------------------------------------------===//
9863
9864SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
9865 SelectionDAG &DAG,
9866 unsigned Flag) const {
9867 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
9868 N->getOffset(), Flag);
9869}
9870
9871SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
9872 SelectionDAG &DAG,
9873 unsigned Flag) const {
9874 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
9875}
9876
9877SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
9878 SelectionDAG &DAG,
9879 unsigned Flag) const {
9880 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
9881 N->getOffset(), Flag);
9882}
9883
9884SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
9885 SelectionDAG &DAG,
9886 unsigned Flag) const {
9887 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
9888}
9889
9890SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
9891 SelectionDAG &DAG,
9892 unsigned Flag) const {
9893 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
9894}
9895
9896// (loadGOT sym)
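// This typically expands to a GOT access of the form (illustrative):
//   adrp x0, :got:sym
//   ldr  x0, [x0, :got_lo12:sym]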
9897template <class NodeTy>
9898SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
9899 unsigned Flags) const {
9900 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
9901 SDLoc DL(N);
9902 EVT Ty = getPointerTy(DAG.getDataLayout());
9903 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
9904 // FIXME: Once remat is capable of dealing with instructions with register
9905 // operands, expand this into two nodes instead of using a wrapper node.
9906 if (DAG.getMachineFunction()
9907 .getInfo<AArch64FunctionInfo>()
9908 ->hasELFSignedGOT())
9909 return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
9910 0);
9911 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
9912}
9913
9914// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
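// i.e. the large-code-model materialisation of an absolute address, roughly:
//   movz x0, #:abs_g3:sym
//   movk x0, #:abs_g2_nc:sym
//   movk x0, #:abs_g1_nc:sym
//   movk x0, #:abs_g0_nc:sym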
9915template <class NodeTy>
9916SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
9917 unsigned Flags) const {
9918 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
9919 SDLoc DL(N);
9920 EVT Ty = getPointerTy(DAG.getDataLayout());
9921 const unsigned char MO_NC = AArch64II::MO_NC;
9922 return DAG.getNode(
9923 AArch64ISD::WrapperLarge, DL, Ty,
9924 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
9925 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
9926 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
9927 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
9928}
9929
9930// (addlow (adrp %hi(sym)) %lo(sym))
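// i.e. the usual small-code-model pair, roughly:
//   adrp x0, sym
//   add  x0, x0, :lo12:sym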
9931template <class NodeTy>
9932SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
9933 unsigned Flags) const {
9934 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
9935 SDLoc DL(N);
9936 EVT Ty = getPointerTy(DAG.getDataLayout());
9937 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
9938 SDValue Lo = getTargetNode(N, Ty, DAG,
9939 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
9940 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
9941 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
9942}
9943
9944// (adr sym)
9945template <class NodeTy>
9946SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
9947 unsigned Flags) const {
9948 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
9949 SDLoc DL(N);
9950 EVT Ty = getPointerTy(DAG.getDataLayout());
9951 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
9952 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
9953}
9954
9955SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
9956 SelectionDAG &DAG) const {
9957 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
9958 const GlobalValue *GV = GN->getGlobal();
9959 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
9960
9961 if (OpFlags != AArch64II::MO_NO_FLAG)
9962 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
9963 "unexpected offset in global node");
9964
9965 // This also catches the large code model case for Darwin, and tiny code
9966 // model with got relocations.
9967 if ((OpFlags & AArch64II::MO_GOT) != 0) {
9968 return getGOT(GN, DAG, OpFlags);
9969 }
9970
9971 SDValue Result;
9972 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9973 !getTargetMachine().isPositionIndependent()) {
9974 Result = getAddrLarge(GN, DAG, OpFlags);
9975 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
9976 Result = getAddrTiny(GN, DAG, OpFlags);
9977 } else {
9978 Result = getAddr(GN, DAG, OpFlags);
9979 }
9980 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9981 SDLoc DL(GN);
9982 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
9983 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
9984 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
9985 return Result;
9986}
9987
9988/// Convert a TLS address reference into the correct sequence of loads
9989/// and calls to compute the variable's address (for Darwin, currently) and
9990/// return an SDValue containing the final node.
9991
9992/// Darwin only has one TLS scheme which must be capable of dealing with the
9993/// fully general situation, in the worst case. This means:
9994/// + "extern __thread" declaration.
9995/// + Defined in a possibly unknown dynamic library.
9996///
9997/// The general system is that each __thread variable has a [3 x i64] descriptor
9998/// which contains information used by the runtime to calculate the address. The
9999/// only part of this the compiler needs to know about is the first xword, which
10000/// contains a function pointer that must be called with the address of the
10001/// entire descriptor in "x0".
10002///
10003/// Since this descriptor may be in a different unit, in general even the
10004/// descriptor must be accessed via an indirect load. The "ideal" code sequence
10005/// is:
10006/// adrp x0, _var@TLVPPAGE
10007/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
10008/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
10009/// ; the function pointer
10010/// blr x1 ; Uses descriptor address in x0
10011/// ; Address of _var is now in x0.
10012///
10013/// If the address of _var's descriptor *is* known to the linker, then it can
10014/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10015/// a slight efficiency gain.
10016SDValue
10017AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
10018 SelectionDAG &DAG) const {
10019 assert(Subtarget->isTargetDarwin() &&
10020 "This function expects a Darwin target");
10021
10022 SDLoc DL(Op);
10023 MVT PtrVT = getPointerTy(DAG.getDataLayout());
10024 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10025 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
10026
10027 SDValue TLVPAddr =
10028 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10029 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
10030
10031 // The first entry in the descriptor is a function pointer that we must call
10032 // to obtain the address of the variable.
10033 SDValue Chain = DAG.getEntryNode();
10034 SDValue FuncTLVGet = DAG.getLoad(
10035 PtrMemVT, DL, Chain, DescAddr,
10036 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
10037 Align(PtrMemVT.getSizeInBits() / 8),
10038 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
10039 Chain = FuncTLVGet.getValue(1);
10040
10041 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
10042 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
10043
10044 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10045 MFI.setAdjustsStack(true);
10046
10047 // TLS calls preserve all registers except those that absolutely must be
10048 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
10049 // silly).
10050 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10051 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
10052 if (Subtarget->hasCustomCallingConv())
10053 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10054
10055 // Finally, we can make the call. This is just a degenerate version of a
10056 // normal AArch64 call node: x0 takes the address of the descriptor, and
10057 // returns the address of the variable in this thread.
10058 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
10059
10060 unsigned Opcode = AArch64ISD::CALL;
10061 SmallVector<SDValue, 8> Ops;
10062 Ops.push_back(Chain);
10063 Ops.push_back(FuncTLVGet);
10064
10065 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
10066 if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
10067 Opcode = AArch64ISD::AUTH_CALL;
10068 Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
10069 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
10070 Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
10071 }
10072
10073 Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
10074 Ops.push_back(DAG.getRegisterMask(Mask));
10075 Ops.push_back(Chain.getValue(1));
10076 Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
10077 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
10078}
10079
10080/// Convert a thread-local variable reference into a sequence of instructions to
10081/// compute the variable's address for the local exec TLS model of ELF targets.
10082/// The sequence depends on the maximum TLS area size.
10083SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
10084 SDValue ThreadBase,
10085 const SDLoc &DL,
10086 SelectionDAG &DAG) const {
10087 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10088 SDValue TPOff, Addr;
10089
10090 switch (DAG.getTarget().Options.TLSSize) {
10091 default:
10092 llvm_unreachable("Unexpected TLS size");
10093
10094 case 12: {
10095 // mrs x0, TPIDR_EL0
10096 // add x0, x0, :tprel_lo12:a
10097 SDValue Var = DAG.getTargetGlobalAddress(
10098 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10099 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10100 Var,
10101 DAG.getTargetConstant(0, DL, MVT::i32)),
10102 0);
10103 }
10104
10105 case 24: {
10106 // mrs x0, TPIDR_EL0
10107 // add x0, x0, :tprel_hi12:a
10108 // add x0, x0, :tprel_lo12_nc:a
10109 SDValue HiVar = DAG.getTargetGlobalAddress(
10110 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10111 SDValue LoVar = DAG.getTargetGlobalAddress(
10112 GV, DL, PtrVT, 0,
10113 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10114 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10115 HiVar,
10116 DAG.getTargetConstant(0, DL, MVT::i32)),
10117 0);
10118 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
10119 LoVar,
10120 DAG.getTargetConstant(0, DL, MVT::i32)),
10121 0);
10122 }
10123
10124 case 32: {
10125 // mrs x1, TPIDR_EL0
10126 // movz x0, #:tprel_g1:a
10127 // movk x0, #:tprel_g0_nc:a
10128 // add x0, x1, x0
10129 SDValue HiVar = DAG.getTargetGlobalAddress(
10130 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
10131 SDValue LoVar = DAG.getTargetGlobalAddress(
10132 GV, DL, PtrVT, 0,
10133 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10134 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10135 DAG.getTargetConstant(16, DL, MVT::i32)),
10136 0);
10137 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10138 DAG.getTargetConstant(0, DL, MVT::i32)),
10139 0);
10140 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10141 }
10142
10143 case 48: {
10144 // mrs x1, TPIDR_EL0
10145 // movz x0, #:tprel_g2:a
10146 // movk x0, #:tprel_g1_nc:a
10147 // movk x0, #:tprel_g0_nc:a
10148 // add x0, x1, x0
10149 SDValue HiVar = DAG.getTargetGlobalAddress(
10150 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
10151 SDValue MiVar = DAG.getTargetGlobalAddress(
10152 GV, DL, PtrVT, 0,
10153 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
10154 SDValue LoVar = DAG.getTargetGlobalAddress(
10155 GV, DL, PtrVT, 0,
10156 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10157 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10158 DAG.getTargetConstant(32, DL, MVT::i32)),
10159 0);
10160 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
10161 DAG.getTargetConstant(16, DL, MVT::i32)),
10162 0);
10163 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10164 DAG.getTargetConstant(0, DL, MVT::i32)),
10165 0);
10166 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10167 }
10168 }
10169}
10170
10171/// When accessing thread-local variables under either the general-dynamic or
10172/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10173/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
10174/// is a function pointer to carry out the resolution.
10175///
10176/// The sequence is:
10177/// adrp x0, :tlsdesc:var
10178/// ldr x1, [x0, #:tlsdesc_lo12:var]
10179/// add x0, x0, #:tlsdesc_lo12:var
10180/// .tlsdesccall var
10181/// blr x1
10182/// (TPIDR_EL0 offset now in x0)
10183///
10184/// The above sequence must be produced unscheduled, to enable the linker to
10185/// optimize/relax this sequence.
10186/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10187/// above sequence, and expanded really late in the compilation flow, to ensure
10188/// the sequence is produced as per above.
10189SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10190 const SDLoc &DL,
10191 SelectionDAG &DAG) const {
10192 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10193
10194 SDValue Chain = DAG.getEntryNode();
10195 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10196
10197 unsigned Opcode =
10198 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
10199 ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
10200 : AArch64ISD::TLSDESC_CALLSEQ;
10201 Chain = DAG.getNode(Opcode, DL, NodeTys, {Chain, SymAddr});
10202 SDValue Glue = Chain.getValue(1);
10203
10204 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
10205}
10206
10207SDValue
10208AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
10209 SelectionDAG &DAG) const {
10210 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
10211
10212 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10213 AArch64FunctionInfo *MFI =
10214 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10215
10216 TLSModel::Model Model = MFI->hasELFSignedGOT()
10217 ? TLSModel::GeneralDynamic
10218 : getTargetMachine().getTLSModel(GA->getGlobal());
10219
10220 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
10221 if (Model == TLSModel::LocalDynamic)
10222 Model = TLSModel::GeneralDynamic;
10223 }
10224
10225 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10226 Model != TLSModel::LocalExec)
10227 report_fatal_error("ELF TLS only supported in small memory model or "
10228 "in local exec TLS model");
10229 // Different choices can be made for the maximum size of the TLS area for a
10230 // module. For the small address model, the default TLS size is 16MiB and the
10231 // maximum TLS size is 4GiB.
10232 // FIXME: add tiny and large code model support for TLS access models other
10233 // than local exec. We currently generate the same code as small for tiny,
10234 // which may be larger than needed.
10235
10236 SDValue TPOff;
10237 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10238 SDLoc DL(Op);
10239 const GlobalValue *GV = GA->getGlobal();
10240
10241 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
10242
10243 if (Model == TLSModel::LocalExec) {
10244 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
10245 } else if (Model == TLSModel::InitialExec) {
10246 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10247 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
10248 } else if (Model == TLSModel::LocalDynamic) {
10249 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
10250 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
10251 // the beginning of the module's TLS region, followed by a DTPREL offset
10252 // calculation.
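// Roughly, the code below produces (illustrative):
//   <TLS descriptor call sequence for _TLS_MODULE_BASE_>  // module TLS offset
//   add x0, x0, :dtprel_hi12:var
//   add x0, x0, :dtprel_lo12_nc:var
//   mrs x1, TPIDR_EL0
//   add x0, x1, x0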
10253
10254 // These accesses will need deduplicating if there's more than one.
10255 MFI->incNumLocalDynamicTLSAccesses();
10256
10257 // The call needs a relocation too for linker relaxation. It doesn't make
10258 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10259 // the address.
10260 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
10261 AArch64II::MO_TLS);
10262
10263 // Now we can calculate the offset from TPIDR_EL0 to this module's
10264 // thread-local area.
10265 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10266
10267 // Now use :dtprel_whatever: operations to calculate this variable's offset
10268 // in its thread-storage area.
10269 SDValue HiVar = DAG.getTargetGlobalAddress(
10270 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10271 SDValue LoVar = DAG.getTargetGlobalAddress(
10272 GV, DL, MVT::i64, 0,
10273 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10274
10275 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
10276 DAG.getTargetConstant(0, DL, MVT::i32)),
10277 0);
10278 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
10279 DAG.getTargetConstant(0, DL, MVT::i32)),
10280 0);
10281 } else if (Model == TLSModel::GeneralDynamic) {
10282 // The call needs a relocation too for linker relaxation. It doesn't make
10283 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10284 // the address.
10285 SDValue SymAddr =
10286 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10287
10288 // Finally we can make a call to calculate the offset from tpidr_el0.
10289 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10290 } else
10291 llvm_unreachable("Unsupported ELF TLS access model");
10292
10293 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10294}
10295
10296SDValue
10297AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
10298 SelectionDAG &DAG) const {
10299 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
10300
10301 SDValue Chain = DAG.getEntryNode();
10302 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10303 SDLoc DL(Op);
10304
10305 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
10306
10307 // Load the ThreadLocalStoragePointer from the TEB
10308 // A pointer to the TLS array is located at offset 0x58 from the TEB.
10309 SDValue TLSArray =
10310 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
10311 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
10312 Chain = TLSArray.getValue(1);
10313
10314 // Load the TLS index from the C runtime.
10315 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
10316 // This also does the same as LOADgot, but using a generic i32 load,
10317 // while LOADgot only loads i64.
10318 SDValue TLSIndexHi =
10319 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
10320 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
10321 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10322 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
10323 SDValue TLSIndex =
10324 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
10325 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
10326 Chain = TLSIndex.getValue(1);
10327
10328 // The pointer to the thread's TLS data area is at the TLS index, scaled by
10329 // 8, as an offset into the TLSArray.
10330 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
10331 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
10332 DAG.getConstant(3, DL, PtrVT));
10333 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
10334 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
10335 MachinePointerInfo());
10336 Chain = TLS.getValue(1);
10337
10338 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10339 const GlobalValue *GV = GA->getGlobal();
10340 SDValue TGAHi = DAG.getTargetGlobalAddress(
10341 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10342 SDValue TGALo = DAG.getTargetGlobalAddress(
10343 GV, DL, PtrVT, 0,
10344 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10345
10346 // Add the offset from the start of the .tls section (section base).
10347 SDValue Addr =
10348 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
10349 DAG.getTargetConstant(0, DL, MVT::i32)),
10350 0);
10351 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
10352 return Addr;
10353}
10354
10355SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
10356 SelectionDAG &DAG) const {
10357 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10358 if (DAG.getTarget().useEmulatedTLS())
10359 return LowerToTLSEmulatedModel(GA, DAG);
10360
10361 if (Subtarget->isTargetDarwin())
10362 return LowerDarwinGlobalTLSAddress(Op, DAG);
10363 if (Subtarget->isTargetELF())
10364 return LowerELFGlobalTLSAddress(Op, DAG);
10365 if (Subtarget->isTargetWindows())
10366 return LowerWindowsGlobalTLSAddress(Op, DAG);
10367
10368 llvm_unreachable("Unexpected platform trying to use TLS");
10369}
10370
10371//===----------------------------------------------------------------------===//
10372// PtrAuthGlobalAddress lowering
10373//
10374// We have 3 lowering alternatives to choose from:
10375// - MOVaddrPAC: similar to MOVaddr, with added PAC.
10376// If the GV doesn't need a GOT load (i.e., is locally defined)
10377// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
10378//
10379// - LOADgotPAC: similar to LOADgot, with added PAC.
10380// If the GV needs a GOT load, materialize the pointer using the usual
10381// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
10382// section is assumed to be read-only (for example, via relro mechanism). See
10383// LowerMOVaddrPAC.
10384//
10385// - LOADauthptrstatic: similar to LOADgot, but use a
10386// special stub slot instead of a GOT slot.
10387// Load a signed pointer for symbol 'sym' from a stub slot named
10388// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
10389// resolving. This usually lowers to adrp+ldr, but also emits an entry into
10390// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
10391//
10392// All 3 are pseudos that are expand late to longer sequences: this lets us
10393// provide integrity guarantees on the to-be-signed intermediate values.
10394//
10395// LOADauthptrstatic is undesirable because it requires a large section filled
10396// with often similarly-signed pointers, making it a good harvesting target.
10397// Thus, it's only used for ptrauth references to extern_weak to avoid null
10398// checks.
10399
10400static SDValue LowerPtrAuthGlobalAddressStatically(
10401 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
10402 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
10403 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
10404 assert(TGN->getGlobal()->hasExternalWeakLinkage());
10405
10406 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
10407 // offset alone as a pointer if the symbol wasn't available, which would
10408 // probably break null checks in users. Ptrauth complicates things further:
10409 // error out.
10410 if (TGN->getOffset() != 0)
10411 report_fatal_error(
10412 "unsupported non-zero offset in weak ptrauth global reference");
10413
10414 if (!isNullConstant(AddrDiscriminator))
10415 report_fatal_error("unsupported weak addr-div ptrauth global");
10416
10417 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10418 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
10419 {TGA, Key, Discriminator}),
10420 0);
10421}
10422
10423SDValue
10424AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
10425 SelectionDAG &DAG) const {
10426 SDValue Ptr = Op.getOperand(0);
10427 uint64_t KeyC = Op.getConstantOperandVal(1);
10428 SDValue AddrDiscriminator = Op.getOperand(2);
10429 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
10430 EVT VT = Op.getValueType();
10431 SDLoc DL(Op);
10432
10433 if (KeyC > AArch64PACKey::LAST)
10434 report_fatal_error("key in ptrauth global out of range [0, " +
10435 Twine((int)AArch64PACKey::LAST) + "]");
10436
10437 // Blend only works if the integer discriminator is 16-bit wide.
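// Blended discriminators pack a small integer alongside an address
// discriminator; only 16 bits are available for the integer part, hence the
// range check below.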
10438 if (!isUInt<16>(DiscriminatorC))
10439 report_fatal_error(
10440 "constant discriminator in ptrauth global out of range [0, 0xffff]");
10441
10442 // Choosing between 3 lowering alternatives is target-specific.
10443 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
10444 report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
10445
10446 int64_t PtrOffsetC = 0;
10447 if (Ptr.getOpcode() == ISD::ADD) {
10448 PtrOffsetC = Ptr.getConstantOperandVal(1);
10449 Ptr = Ptr.getOperand(0);
10450 }
10451 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
10452 const GlobalValue *PtrGV = PtrN->getGlobal();
10453
10454 // Classify the reference to determine whether it needs a GOT load.
10455 const unsigned OpFlags =
10456 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
10457 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
10458 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
10459 "unsupported non-GOT op flags on ptrauth global reference");
10460
10461 // Fold any offset into the GV; our pseudos expect it there.
10462 PtrOffsetC += PtrN->getOffset();
10463 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
10464 /*TargetFlags=*/0);
10465 assert(PtrN->getTargetFlags() == 0 &&
10466 "unsupported target flags on ptrauth global");
10467
10468 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10469 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
10470 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
10471 ? AddrDiscriminator
10472 : DAG.getRegister(AArch64::XZR, MVT::i64);
10473
10474 // No GOT load needed -> MOVaddrPAC
10475 if (!NeedsGOTLoad) {
10476 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
10477 return SDValue(
10478 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
10479 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10480 0);
10481 }
10482
10483 // GOT load -> LOADgotPAC
10484 // Note that we disallow extern_weak refs to avoid null checks later.
10485 if (!PtrGV->hasExternalWeakLinkage())
10486 return SDValue(
10487 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
10488 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10489 0);
10490
10491 // extern_weak ref -> LOADauthptrstatic
10492 return LowerPtrAuthGlobalAddressStatically(
10493 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
10494 DAG);
10495}
10496
10497// Looks through \param Val to determine the bit that can be used to
10498// check the sign of the value. It returns the unextended value and
10499// the sign bit position.
10500std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
10501 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
10502 return {Val.getOperand(0),
10503 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
10504 1};
10505
10506 if (Val.getOpcode() == ISD::SIGN_EXTEND)
10507 return {Val.getOperand(0),
10508 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
10509
10510 return {Val, Val.getValueSizeInBits() - 1};
10511}
10512
10513SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
10514 SDValue Chain = Op.getOperand(0);
10515 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
10516 SDValue LHS = Op.getOperand(2);
10517 SDValue RHS = Op.getOperand(3);
10518 SDValue Dest = Op.getOperand(4);
10519 SDLoc dl(Op);
10520
10520 MachineFunction &MF = DAG.getMachineFunction();
10521 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
10523 // will not be produced, as they are conditional branch instructions that do
10524 // not set flags.
10525 bool ProduceNonFlagSettingCondBr =
10526 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
10527
10528 // Handle f128 first, since lowering it will result in comparing the return
10529 // value of a libcall against zero, which is just what the rest of LowerBR_CC
10530 // is expecting to deal with.
10531 if (LHS.getValueType() == MVT::f128) {
10532 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
10533
10534 // If softenSetCCOperands returned a scalar, we need to compare the result
10535 // against zero to select between true and false values.
10536 if (!RHS.getNode()) {
10537 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10538 CC = ISD::SETNE;
10539 }
10540 }
10541
10542 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
10543 // instruction.
10544 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
10545 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10546 // Only lower legal XALUO ops.
10547 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
10548 return SDValue();
10549
10550 // The actual operation with overflow check.
10551 AArch64CC::CondCode OFCC;
10552 SDValue Value, Overflow;
10553 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
10554
10555 if (CC == ISD::SETNE)
10556 OFCC = getInvertedCondCode(OFCC);
10557 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
10558
10559 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
10560 Overflow);
10561 }
10562
10563 if (LHS.getValueType().isInteger()) {
10564 assert((LHS.getValueType() == RHS.getValueType()) &&
10565 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
10566
10567 // If the RHS of the comparison is zero, we can potentially fold this
10568 // to a specialized branch.
10569 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
10570 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
10571 if (CC == ISD::SETEQ) {
10572 // See if we can use a TBZ to fold in an AND as well.
10573 // TBZ has a smaller branch displacement than CBZ. If the offset is
10574 // out of bounds, a late MI-layer pass rewrites branches.
10575 // 403.gcc is an example that hits this case.
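// For instance, a branch on (and x, #(1 << n)) == 0 can be emitted as a
// single "tbz x, #n, dest" instead of an and+cbz pair.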
10576 if (LHS.getOpcode() == ISD::AND &&
10577 isa<ConstantSDNode>(LHS.getOperand(1)) &&
10578 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
10579 SDValue Test = LHS.getOperand(0);
10580 uint64_t Mask = LHS.getConstantOperandVal(1);
10581 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
10582 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
10583 Dest);
10584 }
10585
10586 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
10587 } else if (CC == ISD::SETNE) {
10588 // See if we can use a TBZ to fold in an AND as well.
10589 // TBZ has a smaller branch displacement than CBZ. If the offset is
10590 // out of bounds, a late MI-layer pass rewrites branches.
10591 // 403.gcc is an example that hits this case.
10592 if (LHS.getOpcode() == ISD::AND &&
10593 isa<ConstantSDNode>(LHS.getOperand(1)) &&
10594 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
10595 SDValue Test = LHS.getOperand(0);
10596 uint64_t Mask = LHS.getConstantOperandVal(1);
10597 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
10598 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
10599 Dest);
10600 }
10601
10602 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
10603 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
10604 // Don't combine AND since emitComparison converts the AND to an ANDS
10605 // (a.k.a. TST) and the test in the test bit and branch instruction
10606 // becomes redundant. This would also increase register pressure.
10607 uint64_t SignBitPos;
10608 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
10609 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
10610 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
10611 }
10612 }
10613 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
10614 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
10615 // Don't combine AND since emitComparison converts the AND to an ANDS
10616 // (a.k.a. TST) and the test in the test bit and branch instruction
10617 // becomes redundant. This would also increase register pressure.
10618 uint64_t SignBitPos;
10619 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
10620 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
10621 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
10622 }
10623
10624 SDValue CCVal;
10625 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
10626 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
10627 Cmp);
10628 }
10629
10630 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
10631 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
10632
10633 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10634 // clean. Some of them require two branches to implement.
10635 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
10636 AArch64CC::CondCode CC1, CC2;
10637 changeFPCCToAArch64CC(CC, CC1, CC2);
10638 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10639 SDValue BR1 =
10640 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
10641 if (CC2 != AArch64CC::AL) {
10642 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
10643 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
10644 Cmp);
10645 }
10646
10647 return BR1;
10648}
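// Illustrative sketch (annotation, not part of the original source): the
// TBZ/CBZ folds above collapse common branch patterns into one instruction.
// Roughly, assuming %x lives in w0:
//   br (icmp eq (and i32 %x, 4), 0), <dest>   ->  tbz  w0, #2, <dest>
//   br (icmp eq i32 %x, 0), <dest>            ->  cbz  w0, <dest>
//   br (icmp slt i32 %x, 0), <dest>           ->  tbnz w0, #31, <dest>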
10649
10650SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
10651 SelectionDAG &DAG) const {
10652 if (!Subtarget->isNeonAvailable() &&
10653 !Subtarget->useSVEForFixedLengthVectors())
10654 return SDValue();
10655
10656 EVT VT = Op.getValueType();
10657 EVT IntVT = VT.changeTypeToInteger();
10658 SDLoc DL(Op);
10659
10660 SDValue In1 = Op.getOperand(0);
10661 SDValue In2 = Op.getOperand(1);
10662 EVT SrcVT = In2.getValueType();
10663
10664 if (!SrcVT.bitsEq(VT))
10665 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
10666
10667 if (VT.isScalableVector())
10668 IntVT =
10669 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
10670
10671 if (VT.isFixedLengthVector() &&
10672 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
10673 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
10674
10675 In1 = convertToScalableVector(DAG, ContainerVT, In1);
10676 In2 = convertToScalableVector(DAG, ContainerVT, In2);
10677
10678 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
10679 return convertFromScalableVector(DAG, VT, Res);
10680 }
10681
10682 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
10683 if (VT.isScalableVector())
10684 return getSVESafeBitCast(VT, Op, DAG);
10685
10686 return DAG.getBitcast(VT, Op);
10687 };
10688
10689 SDValue VecVal1, VecVal2;
10690 EVT VecVT;
10691 auto SetVecVal = [&](int Idx = -1) {
10692 if (!VT.isVector()) {
10693 VecVal1 =
10694 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
10695 VecVal2 =
10696 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
10697 } else {
10698 VecVal1 = BitCast(VecVT, In1, DAG);
10699 VecVal2 = BitCast(VecVT, In2, DAG);
10700 }
10701 };
10702 if (VT.isVector()) {
10703 VecVT = IntVT;
10704 SetVecVal();
10705 } else if (VT == MVT::f64) {
10706 VecVT = MVT::v2i64;
10707 SetVecVal(AArch64::dsub);
10708 } else if (VT == MVT::f32) {
10709 VecVT = MVT::v4i32;
10710 SetVecVal(AArch64::ssub);
10711 } else if (VT == MVT::f16 || VT == MVT::bf16) {
10712 VecVT = MVT::v8i16;
10713 SetVecVal(AArch64::hsub);
10714 } else {
10715 llvm_unreachable("Invalid type for copysign!");
10716 }
10717
10718 unsigned BitWidth = In1.getScalarValueSizeInBits();
10719 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
10720
10721 // We want to materialize a mask with every bit but the high bit set, but the
10722 // AdvSIMD immediate moves cannot materialize that in a single instruction for
10723 // 64-bit elements. Instead, materialize all bits set and then negate that.
10724 if (VT == MVT::f64 || VT == MVT::v2f64) {
10725 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
10726 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
10727 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
10728 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
10729 }
10730
10731 SDValue BSP =
10732 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
10733 if (VT == MVT::f16 || VT == MVT::bf16)
10734 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
10735 if (VT == MVT::f32)
10736 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
10737 if (VT == MVT::f64)
10738 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
10739
10740 return BitCast(VT, BSP, DAG);
10741}
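// Illustrative sketch (assumed scheduling/register choices): for a scalar f64
// copysign(x, y) with x in d0 and y in d1, the BSP lowering above corresponds
// roughly to:
//   movi v2.2d, #0xffffffffffffffff   // all ones
//   fneg v2.2d, v2.2d                 // clear sign bits -> ~sign-mask
//   bif  v0.8b, v1.8b, v2.8b          // keep magnitude of d0, take sign of d1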
10742
10743SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
10744 SelectionDAG &DAG) const {
10745 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
10746 Attribute::NoImplicitFloat))
10747 return SDValue();
10748
10749 EVT VT = Op.getValueType();
10750 if (VT.isScalableVector() ||
10751 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
10752 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
10753
10754 if (!Subtarget->isNeonAvailable())
10755 return SDValue();
10756
10757 bool IsParity = Op.getOpcode() == ISD::PARITY;
10758 SDValue Val = Op.getOperand(0);
10759 SDLoc DL(Op);
10760
10761 // For i32, the general parity expansion using EORs is more efficient than
10762 // going through the floating-point/SIMD registers.
10763 if (VT == MVT::i32 && IsParity)
10764 return SDValue();
10765
10766 // There is no popcount instruction for the general-purpose registers, but it can
10767 // be more efficiently lowered to the following sequence that uses
10768 // AdvSIMD registers/instructions as long as the copies to/from
10769 // the AdvSIMD registers are cheap.
10770 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
10771 // CNT V0.8B, V0.8B // 8xbyte pop-counts
10772 // ADDV B0, V0.8B // sum 8xbyte pop-counts
10773 // FMOV X0, D0 // copy result back to integer reg
10774 if (VT == MVT::i32 || VT == MVT::i64) {
10775 if (VT == MVT::i32)
10776 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
10777 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
10778
10779 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
10780 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
10781 if (VT == MVT::i32)
10782 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV,
10783 DAG.getConstant(0, DL, MVT::i64));
10784 AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
10785 if (IsParity)
10786 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
10787 return AddV;
10788 } else if (VT == MVT::i128) {
10789 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
10790
10791 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
10792 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
10793 AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
10794 if (IsParity)
10795 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
10796 return AddV;
10797 }
10798
10799 assert(!IsParity && "ISD::PARITY of vector types not supported");
10800
10801 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
10802 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
10803 "Unexpected type for custom ctpop lowering");
10804
10805 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
10806 Val = DAG.getBitcast(VT8Bit, Val);
10807 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
10808
10809 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
10810 VT.getVectorNumElements() >= 2) {
10811 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
10812 SDValue Zeros = DAG.getConstant(0, DL, DT);
10813 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
10814
10815 if (VT == MVT::v2i64) {
10816 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10817 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
10818 } else if (VT == MVT::v2i32) {
10819 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10820 } else if (VT == MVT::v4i32) {
10821 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10822 } else {
10823 llvm_unreachable("Unexpected type for custom ctpop lowering");
10824 }
10825
10826 return Val;
10827 }
10828
10829 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
10830 unsigned EltSize = 8;
10831 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
10832 while (EltSize != VT.getScalarSizeInBits()) {
10833 EltSize *= 2;
10834 NumElts /= 2;
10835 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
10836 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
10837 }
10838
10839 return Val;
10840}
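// Illustrative sketch: without the dot-product path, a v4i32 ctpop is built
// from byte-wise CNT followed by pairwise widening adds, roughly:
//   cnt    v0.16b, v0.16b    // per-byte popcounts
//   uaddlp v0.8h,  v0.16b    // sum byte pairs into halfwords
//   uaddlp v0.4s,  v0.8h     // sum halfword pairs into words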
10841
10842SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
10843 EVT VT = Op.getValueType();
10844 assert(VT.isScalableVector() ||
10845 useSVEForFixedLengthVectorVT(
10846 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
10847
10848 SDLoc DL(Op);
10849 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
10850 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
10851}
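// Illustrative sketch: CTTZ(x) is rewritten as CTLZ(BITREVERSE(x)), which for
// SVE maps onto predicated rbit + clz, e.g. roughly:
//   rbit z0.s, p0/m, z0.s
//   clz  z0.s, p0/m, z0.s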
10852
10853SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
10854 SelectionDAG &DAG) const {
10855
10856 EVT VT = Op.getValueType();
10857 SDLoc DL(Op);
10858 unsigned Opcode = Op.getOpcode();
10859 ISD::CondCode CC;
10860 switch (Opcode) {
10861 default:
10862 llvm_unreachable("Wrong instruction");
10863 case ISD::SMAX:
10864 CC = ISD::SETGT;
10865 break;
10866 case ISD::SMIN:
10867 CC = ISD::SETLT;
10868 break;
10869 case ISD::UMAX:
10870 CC = ISD::SETUGT;
10871 break;
10872 case ISD::UMIN:
10873 CC = ISD::SETULT;
10874 break;
10875 }
10876
10877 if (VT.isScalableVector() ||
10878 useSVEForFixedLengthVectorVT(
10879 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
10880 switch (Opcode) {
10881 default:
10882 llvm_unreachable("Wrong instruction");
10883 case ISD::SMAX:
10884 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
10885 case ISD::SMIN:
10886 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
10887 case ISD::UMAX:
10888 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
10889 case ISD::UMIN:
10890 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
10891 }
10892 }
10893
10894 SDValue Op0 = Op.getOperand(0);
10895 SDValue Op1 = Op.getOperand(1);
10896 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
10897 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
10898}
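// Illustrative sketch: the setcc+select fallback above is the generic min/max
// expansion; expressed on scalars, an smax would select to something like:
//   cmp  w0, w1
//   csel w0, w0, w1, gt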
10899
10900SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
10901 SelectionDAG &DAG) const {
10902 EVT VT = Op.getValueType();
10903
10904 if (VT.isScalableVector() ||
10905 useSVEForFixedLengthVectorVT(
10906 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
10907 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
10908
10909 SDLoc DL(Op);
10910 SDValue REVB;
10911 MVT VST;
10912
10913 switch (VT.getSimpleVT().SimpleTy) {
10914 default:
10915 llvm_unreachable("Invalid type for bitreverse!");
10916
10917 case MVT::v2i32: {
10918 VST = MVT::v8i8;
10919 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
10920
10921 break;
10922 }
10923
10924 case MVT::v4i32: {
10925 VST = MVT::v16i8;
10926 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
10927
10928 break;
10929 }
10930
10931 case MVT::v1i64: {
10932 VST = MVT::v8i8;
10933 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
10934
10935 break;
10936 }
10937
10938 case MVT::v2i64: {
10939 VST = MVT::v16i8;
10940 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
10941
10942 break;
10943 }
10944 }
10945
10946 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
10947 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
10948}
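// Illustrative sketch: element-wise bit reversal is split into a byte reorder
// plus a per-byte RBIT, e.g. for v4i32 roughly:
//   rev32 v0.16b, v0.16b
//   rbit  v0.16b, v0.16b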
10949
10950 // Check whether N forms a continuous comparison sequence: ORs over XOR leaves.
10951static bool
10952isOrXorChain(SDValue N, unsigned &Num,
10953 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
10954 if (Num == MaxXors)
10955 return false;
10956
10957 // Skip the one-use zext
10958 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
10959 N = N->getOperand(0);
10960
10961 // The leaf node must be XOR
10962 if (N->getOpcode() == ISD::XOR) {
10963 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
10964 Num++;
10965 return true;
10966 }
10967
10968 // All the non-leaf nodes must be OR.
10969 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
10970 return false;
10971
10972 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
10973 isOrXorChain(N->getOperand(1), Num, WorkList))
10974 return true;
10975 return false;
10976}
10977
10978 // Transform chains of ORs and XORs, which are usually outlined by memcmp/bcmp.
10979 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
10980 SDValue LHS = N->getOperand(0);
10981 SDValue RHS = N->getOperand(1);
10982 SDLoc DL(N);
10983 EVT VT = N->getValueType(0);
10984 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
10985
10986 // Only handle integer compares.
10987 if (N->getOpcode() != ISD::SETCC)
10988 return SDValue();
10989
10990 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
10991 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
10992 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
10993 unsigned NumXors = 0;
10994 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
10995 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
10996 isOrXorChain(LHS, NumXors, WorkList)) {
10997 SDValue XOR0, XOR1;
10998 std::tie(XOR0, XOR1) = WorkList[0];
10999 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
11000 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11001 for (unsigned I = 1; I < WorkList.size(); I++) {
11002 std::tie(XOR0, XOR1) = WorkList[I];
11003 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11004 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11005 }
11006
11007 // Exit early by inverting the condition, which helps reduce indentation.
11008 return Cmp;
11009 }
11010
11011 return SDValue();
11012}
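// Illustrative sketch (assumed register assignment): an equality chain such as
//   (or (xor i64 %a0, %b0), (xor i64 %a1, %b1)) == 0
// produced by memcmp expansion becomes a conjunction that later folds to:
//   cmp  x0, x2
//   ccmp x1, x3, #0, eq
//   cset w0, eq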
11013
11014SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11015
11016 if (Op.getValueType().isVector())
11017 return LowerVSETCC(Op, DAG);
11018
11019 bool IsStrict = Op->isStrictFPOpcode();
11020 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11021 unsigned OpNo = IsStrict ? 1 : 0;
11022 SDValue Chain;
11023 if (IsStrict)
11024 Chain = Op.getOperand(0);
11025 SDValue LHS = Op.getOperand(OpNo + 0);
11026 SDValue RHS = Op.getOperand(OpNo + 1);
11027 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11028 SDLoc dl(Op);
11029
11030 // We chose ZeroOrOneBooleanContents, so use zero and one.
11031 EVT VT = Op.getValueType();
11032 SDValue TVal = DAG.getConstant(1, dl, VT);
11033 SDValue FVal = DAG.getConstant(0, dl, VT);
11034
11035 // Handle f128 first, since one possible outcome is a normal integer
11036 // comparison which gets picked up by the next if statement.
11037 if (LHS.getValueType() == MVT::f128) {
11038 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
11039 IsSignaling);
11040
11041 // If softenSetCCOperands returned a scalar, use it.
11042 if (!RHS.getNode()) {
11043 assert(LHS.getValueType() == Op.getValueType() &&
11044 "Unexpected setcc expansion!");
11045 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
11046 }
11047 }
11048
11049 if (LHS.getValueType().isInteger()) {
11050
11051 simplifySetCCIntoEq(CC, LHS, RHS, DAG, dl);
11052
11053 SDValue CCVal;
11054 SDValue Cmp = getAArch64Cmp(
11055 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
11056
11057 // Note that we inverted the condition above, so we reverse the order of
11058 // the true and false operands here. This will allow the setcc to be
11059 // matched to a single CSINC instruction.
11060 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
11061 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
11062 }
11063
11064 // Now we know we're dealing with FP values.
11065 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11066 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11067
11068 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11069 // and do the comparison.
11070 SDValue Cmp;
11071 if (IsStrict)
11072 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
11073 else
11074 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
11075
11076 AArch64CC::CondCode CC1, CC2;
11077 changeFPCCToAArch64CC(CC, CC1, CC2);
11078 SDValue Res;
11079 if (CC2 == AArch64CC::AL) {
11080 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
11081 CC2);
11082 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
11083
11084 // Note that we inverted the condition above, so we reverse the order of
11085 // the true and false operands here. This will allow the setcc to be
11086 // matched to a single CSINC instruction.
11087 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
11088 } else {
11089 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
11090 // totally clean. Some of them require two CSELs to implement. As is in
11091 // this case, we emit the first CSEL and then emit a second using the output
11092 // of the first as the RHS. We're effectively OR'ing the two CC's together.
11093
11094 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
11095 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
11096 SDValue CS1 =
11097 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
11098
11099 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
11100 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
11101 }
11102 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
11103}
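// Illustrative sketch: the inverted-condition CSEL built above is what lets an
// integer setcc select to a single CSINC, e.g. (i32)(a < b) becomes roughly:
//   cmp  w0, w1
//   cset w0, lt        // alias of csinc w0, wzr, wzr, ge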
11104
11105SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
11106 SelectionDAG &DAG) const {
11107
11108 SDValue LHS = Op.getOperand(0);
11109 SDValue RHS = Op.getOperand(1);
11110 EVT VT = LHS.getValueType();
11111 if (VT != MVT::i32 && VT != MVT::i64)
11112 return SDValue();
11113
11114 SDLoc DL(Op);
11115 SDValue Carry = Op.getOperand(2);
11116 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
11117 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
11118 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
11119 LHS, RHS, InvCarry);
11120
11121 EVT OpVT = Op.getValueType();
11122 SDValue TVal = DAG.getConstant(1, DL, OpVT);
11123 SDValue FVal = DAG.getConstant(0, DL, OpVT);
11124
11125 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
11126 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
11127 SDValue CCVal =
11128 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
11129 // Inputs are swapped because the condition is inverted. This will allow
11130 // matching with a single CSINC instruction.
11131 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
11132 Cmp.getValue(1));
11133}
11134
11135SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
11136 SDValue RHS, SDValue TVal,
11137 SDValue FVal, const SDLoc &dl,
11138 SelectionDAG &DAG) const {
11139 // Handle f128 first, because it will result in a comparison of some RTLIB
11140 // call result against zero.
11141 if (LHS.getValueType() == MVT::f128) {
11142 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
11143
11144 // If softenSetCCOperands returned a scalar, we need to compare the result
11145 // against zero to select between true and false values.
11146 if (!RHS.getNode()) {
11147 RHS = DAG.getConstant(0, dl, LHS.getValueType());
11148 CC = ISD::SETNE;
11149 }
11150 }
11151
11152 // Also handle f16, for which we need to do a f32 comparison.
11153 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
11154 LHS.getValueType() == MVT::bf16) {
11155 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
11156 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
11157 }
11158
11159 // Next, handle integers.
11160 if (LHS.getValueType().isInteger()) {
11161 assert((LHS.getValueType() == RHS.getValueType()) &&
11162 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11163
11164 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
11165 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
11166 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11167 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
11168 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
11169 // supported types.
11170 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
11171 CTVal->isOne() && CFVal->isAllOnes() &&
11172 LHS.getValueType() == TVal.getValueType()) {
11173 EVT VT = LHS.getValueType();
11174 SDValue Shift =
11175 DAG.getNode(ISD::SRA, dl, VT, LHS,
11176 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
11177 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
11178 }
11179
11180 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
11181 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
11182 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
11183 // Both require fewer instructions than compare and conditional select.
11184 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
11185 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
11186 LHS.getValueType() == RHS.getValueType()) {
11187 EVT VT = LHS.getValueType();
11188 SDValue Shift =
11189 DAG.getNode(ISD::SRA, dl, VT, LHS,
11190 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
11191
11192 if (CC == ISD::SETGT)
11193 Shift = DAG.getNOT(dl, Shift, VT);
11194
11195 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
11196 }
11197
11198 unsigned Opcode = AArch64ISD::CSEL;
11199
11200 // If both the TVal and the FVal are constants, see if we can swap them in
11201 // order to form a CSINV or CSINC out of them.
11202 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
11203 std::swap(TVal, FVal);
11204 std::swap(CTVal, CFVal);
11205 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11206 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
11207 std::swap(TVal, FVal);
11208 std::swap(CTVal, CFVal);
11209 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11210 } else if (TVal.getOpcode() == ISD::XOR) {
11211 // If TVal is a NOT we want to swap TVal and FVal so that we can match
11212 // with a CSINV rather than a CSEL.
11213 if (isAllOnesConstant(TVal.getOperand(1))) {
11214 std::swap(TVal, FVal);
11215 std::swap(CTVal, CFVal);
11216 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11217 }
11218 } else if (TVal.getOpcode() == ISD::SUB) {
11219 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
11220 // that we can match with a CSNEG rather than a CSEL.
11221 if (isNullConstant(TVal.getOperand(0))) {
11222 std::swap(TVal, FVal);
11223 std::swap(CTVal, CFVal);
11224 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11225 }
11226 } else if (CTVal && CFVal) {
11227 const int64_t TrueVal = CTVal->getSExtValue();
11228 const int64_t FalseVal = CFVal->getSExtValue();
11229 bool Swap = false;
11230
11231 // If both TVal and FVal are constants, see if FVal is the
11232 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
11233 // instead of a CSEL in that case.
11234 if (TrueVal == ~FalseVal) {
11235 Opcode = AArch64ISD::CSINV;
11236 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
11237 TrueVal == -FalseVal) {
11238 Opcode = AArch64ISD::CSNEG;
11239 } else if (TVal.getValueType() == MVT::i32) {
11240 // If our operands are only 32-bit wide, make sure we use 32-bit
11241 // arithmetic for the check whether we can use CSINC. This ensures that
11242 // the addition in the check will wrap around properly in case there is
11243 // an overflow (which would not be the case if we do the check with
11244 // 64-bit arithmetic).
11245 const uint32_t TrueVal32 = CTVal->getZExtValue();
11246 const uint32_t FalseVal32 = CFVal->getZExtValue();
11247
11248 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
11249 Opcode = AArch64ISD::CSINC;
11250
11251 if (TrueVal32 > FalseVal32) {
11252 Swap = true;
11253 }
11254 }
11255 } else {
11256 // 64-bit check whether we can use CSINC.
11257 const uint64_t TrueVal64 = TrueVal;
11258 const uint64_t FalseVal64 = FalseVal;
11259
11260 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
11261 Opcode = AArch64ISD::CSINC;
11262
11263 if (TrueVal > FalseVal) {
11264 Swap = true;
11265 }
11266 }
11267 }
11268
11269 // Swap TVal and FVal if necessary.
11270 if (Swap) {
11271 std::swap(TVal, FVal);
11272 std::swap(CTVal, CFVal);
11273 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11274 }
11275
11276 if (Opcode != AArch64ISD::CSEL) {
11277 // Drop FVal since we can get its value by simply inverting/negating
11278 // TVal.
11279 FVal = TVal;
11280 }
11281 }
11282
11283 // Avoid materializing a constant when possible by reusing a known value in
11284 // a register. However, don't perform this optimization if the known value
11285 // is one, zero or negative one in the case of a CSEL. We can always
11286 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
11287 // FVal, respectively.
11288 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
11289 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
11290 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
11291 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11292 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
11293 // "a != C ? x : a" to avoid materializing C.
11294 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
11295 TVal = LHS;
11296 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
11297 FVal = LHS;
11298 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
11299 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
11300 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
11301 // avoid materializing C.
11302 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11303 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
11304 Opcode = AArch64ISD::CSINV;
11305 TVal = LHS;
11306 FVal = DAG.getConstant(0, dl, FVal.getValueType());
11307 }
11308 }
11309
11310 SDValue CCVal;
11311 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
11312 EVT VT = TVal.getValueType();
11313 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
11314 }
11315
11316 // Now we know we're dealing with FP values.
11317 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
11318 LHS.getValueType() == MVT::f64);
11319 assert(LHS.getValueType() == RHS.getValueType());
11320 EVT VT = TVal.getValueType();
11321 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
11322
11323 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11324 // clean. Some of them require two CSELs to implement.
11325 AArch64CC::CondCode CC1, CC2;
11326 changeFPCCToAArch64CC(CC, CC1, CC2);
11327
11328 if (DAG.getTarget().Options.UnsafeFPMath) {
11329 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
11330 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
11331 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
11332 if (RHSVal && RHSVal->isZero()) {
11333 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
11334 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
11335
11336 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
11337 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
11338 TVal = LHS;
11339 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
11340 CFVal && CFVal->isZero() &&
11341 FVal.getValueType() == LHS.getValueType())
11342 FVal = LHS;
11343 }
11344 }
11345
11346 // Emit first, and possibly only, CSEL.
11347 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
11348 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
11349
11350 // If we need a second CSEL, emit it, using the output of the first as the
11351 // RHS. We're effectively OR'ing the two CC's together.
11352 if (CC2 != AArch64CC::AL) {
11353 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
11354 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
11355 }
11356
11357 // Otherwise, return the output of the first CSEL.
11358 return CS1;
11359}
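// Illustrative sketch (assumed encodings) of the constant/negation/inversion
// cases handled above, for a flag-setting compare on (a, b):
//   select c, 1, 0       ->  cset  Rd, c            (CSINC wzr form)
//   select c, x, ~x      ->  csinv Rd, x, x, c
//   select c, x, -x      ->  csneg Rd, x, x, c
//   select c, x+1, x     ->  csinc Rd, x, x, !c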
11360
11361SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
11362 SelectionDAG &DAG) const {
11363 EVT Ty = Op.getValueType();
11364 auto Idx = Op.getConstantOperandAPInt(2);
11365 int64_t IdxVal = Idx.getSExtValue();
11366 assert(Ty.isScalableVector() &&
11367 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
11368
11369 // We can use the splice instruction for certain index values where we are
11370 // able to efficiently generate the correct predicate. The index will be
11371 // inverted and used directly as the input to the ptrue instruction, i.e.
11372 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
11373 // splice predicate. However, we can only do this if we can guarantee that
11374 // there are enough elements in the vector, hence we check the index <= min
11375 // number of elements.
11376 std::optional<unsigned> PredPattern;
11377 if (Ty.isScalableVector() && IdxVal < 0 &&
11378 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
11379 std::nullopt) {
11380 SDLoc DL(Op);
11381
11382 // Create a predicate where all but the last -IdxVal elements are false.
11383 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
11384 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
11385 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
11386
11387 // Now splice the two inputs together using the predicate.
11388 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
11389 Op.getOperand(1));
11390 }
11391
11392 // We can select to an EXT instruction when indexing the first 256 bytes.
11393 unsigned BlockSize = AArch64::SVEBitsPerBlock;
11394 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
11395 return Op;
11396
11397 return SDValue();
11398}
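// Illustrative sketch (assumed): a splice with index -2 on <vscale x 4 x i32>
// builds its predicate from an inverted VL pattern, roughly:
//   ptrue  p0.s, vl2
//   rev    p0.s, p0.s                 // only the last two elements are active
//   splice z0.s, p0, z0.s, z1.s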
11399
11400SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
11401 SelectionDAG &DAG) const {
11402 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
11403 SDValue LHS = Op.getOperand(0);
11404 SDValue RHS = Op.getOperand(1);
11405 SDValue TVal = Op.getOperand(2);
11406 SDValue FVal = Op.getOperand(3);
11407 SDLoc DL(Op);
11408 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
11409}
11410
11411SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
11412 SelectionDAG &DAG) const {
11413 SDValue CCVal = Op->getOperand(0);
11414 SDValue TVal = Op->getOperand(1);
11415 SDValue FVal = Op->getOperand(2);
11416 SDLoc DL(Op);
11417
11418 EVT Ty = Op.getValueType();
11419 if (Ty == MVT::aarch64svcount) {
11420 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
11421 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
11422 SDValue Sel =
11423 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
11424 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
11425 }
11426
11427 if (Ty.isScalableVector()) {
11428 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
11429 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
11430 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
11431 }
11432
11433 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
11434 // FIXME: Ideally this would be the same as above using i1 types, however
11435 // for the moment we can't deal with fixed i1 vector types properly, so
11436 // instead extend the predicate to a result type sized integer vector.
11437 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
11438 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
11439 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
11440 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
11441 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
11442 }
11443
11444 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
11445 // instruction.
11446 if (ISD::isOverflowIntrOpRes(CCVal)) {
11447 // Only lower legal XALUO ops.
11448 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
11449 return SDValue();
11450
11451 AArch64CC::CondCode OFCC;
11452 SDValue Value, Overflow;
11453 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
11454 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
11455
11456 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
11457 CCVal, Overflow);
11458 }
11459
11460 // Lower it the same way as we would lower a SELECT_CC node.
11461 ISD::CondCode CC;
11462 SDValue LHS, RHS;
11463 if (CCVal.getOpcode() == ISD::SETCC) {
11464 LHS = CCVal.getOperand(0);
11465 RHS = CCVal.getOperand(1);
11466 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
11467 } else {
11468 LHS = CCVal;
11469 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
11470 CC = ISD::SETNE;
11471 }
11472
11473 // If we are lowering an f16/bf16 and do not have full FP16, convert it to an
11474 // f32 in order to use FCSELSrrr.
11475 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
11476 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
11477 DAG.getUNDEF(MVT::f32), TVal);
11478 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
11479 DAG.getUNDEF(MVT::f32), FVal);
11480 }
11481
11482 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
11483
11484 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
11485 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
11486 }
11487
11488 return Res;
11489}
11490
11491SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
11492 SelectionDAG &DAG) const {
11493 // Jump table entries as PC relative offsets. No additional tweaking
11494 // is necessary here. Just get the address of the jump table.
11495 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
11496
11497 CodeModel::Model CM = getTargetMachine().getCodeModel();
11498 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
11499 !Subtarget->isTargetMachO())
11500 return getAddrLarge(JT, DAG);
11501 if (CM == CodeModel::Tiny)
11502 return getAddrTiny(JT, DAG);
11503 return getAddr(JT, DAG);
11504}
11505
11506SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
11507 SelectionDAG &DAG) const {
11508 // Jump table entries as PC relative offsets. No additional tweaking
11509 // is necessary here. Just get the address of the jump table.
11510 SDLoc DL(Op);
11511 SDValue JT = Op.getOperand(1);
11512 SDValue Entry = Op.getOperand(2);
11513 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
11514
11515 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11516 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
11517
11518 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
11519 // sequence later, to guarantee the integrity of the intermediate values.
11520 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
11521 "aarch64-jump-table-hardening")) {
11522 CodeModel::Model CM = getTargetMachine().getCodeModel();
11523 if (Subtarget->isTargetMachO()) {
11524 if (CM != CodeModel::Small && CM != CodeModel::Large)
11525 report_fatal_error("Unsupported code-model for hardened jump-table");
11526 } else {
11527 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
11528 assert(Subtarget->isTargetELF() &&
11529 "jump table hardening only supported on MachO/ELF");
11530 if (CM != CodeModel::Small)
11531 report_fatal_error("Unsupported code-model for hardened jump-table");
11532 }
11533
11534 SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
11535 Entry, SDValue());
11536 SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
11537 DAG.getTargetJumpTable(JTI, MVT::i32),
11538 X16Copy.getValue(0), X16Copy.getValue(1));
11539 return SDValue(B, 0);
11540 }
11541
11542 SDNode *Dest =
11543 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
11544 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
11545 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
11546 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
11547}
11548
11549SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
11550 SDValue Chain = Op.getOperand(0);
11551 SDValue Dest = Op.getOperand(1);
11552
11553 // BR_JT is lowered to BRIND, but the latter lowering is specific to indirectbr.
11554 // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
11555 if (Dest->isMachineOpcode() &&
11556 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
11557 return SDValue();
11558
11559 const MachineFunction &MF = DAG.getMachineFunction();
11560 std::optional<uint16_t> BADisc =
11562 if (!BADisc)
11563 return SDValue();
11564
11565 SDLoc DL(Op);
11566
11567 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
11568 SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
11569 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
11570
11571 SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
11572 {Dest, Key, Disc, AddrDisc, Chain});
11573 return SDValue(BrA, 0);
11574}
11575
11576SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
11577 SelectionDAG &DAG) const {
11578 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
11579 CodeModel::Model CM = getTargetMachine().getCodeModel();
11580 if (CM == CodeModel::Large) {
11581 // Use the GOT for the large code model on iOS.
11582 if (Subtarget->isTargetMachO()) {
11583 return getGOT(CP, DAG);
11584 }
11586 return getAddrLarge(CP, DAG);
11587 } else if (CM == CodeModel::Tiny) {
11588 return getAddrTiny(CP, DAG);
11589 }
11590 return getAddr(CP, DAG);
11591}
11592
11593SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
11594 SelectionDAG &DAG) const {
11595 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
11596 const BlockAddress *BA = BAN->getBlockAddress();
11597
11598 if (std::optional<uint16_t> BADisc =
11600 *BA->getFunction())) {
11601 SDLoc DL(Op);
11602
11603 // This isn't cheap, but BRIND is rare.
11604 SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
11605
11606 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
11607
11608 SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
11609 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
11610
11611 SDNode *MOV =
11612 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
11613 {TargetBA, Key, AddrDisc, Disc});
11614 return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
11615 SDValue(MOV, 1));
11616 }
11617
11618 CodeModel::Model CM = getTargetMachine().getCodeModel();
11619 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
11621 return getAddrLarge(BAN, DAG);
11622 } else if (CM == CodeModel::Tiny) {
11623 return getAddrTiny(BAN, DAG);
11624 }
11625 return getAddr(BAN, DAG);
11626}
11627
11628SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
11629 SelectionDAG &DAG) const {
11630 AArch64FunctionInfo *FuncInfo =
11631 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11632
11633 SDLoc DL(Op);
11634 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
11635 getPointerTy(DAG.getDataLayout()));
11636 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
11637 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11638 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
11639 MachinePointerInfo(SV));
11640}
11641
11642SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
11643 SelectionDAG &DAG) const {
11644 MachineFunction &MF = DAG.getMachineFunction();
11645 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
11646
11647 SDLoc DL(Op);
11648 SDValue FR;
11649 if (Subtarget->isWindowsArm64EC()) {
11650 // With the Arm64EC ABI, we compute the address of the varargs save area
11651 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
11652 // but calls from an entry thunk can pass in a different address.
11653 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
11654 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
11655 uint64_t StackOffset;
11656 if (FuncInfo->getVarArgsGPRSize() > 0)
11657 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
11658 else
11659 StackOffset = FuncInfo->getVarArgsStackOffset();
11660 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
11661 DAG.getConstant(StackOffset, DL, MVT::i64));
11662 } else {
11663 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
11664 ? FuncInfo->getVarArgsGPRIndex()
11665 : FuncInfo->getVarArgsStackIndex(),
11666 getPointerTy(DAG.getDataLayout()));
11667 }
11668 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11669 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
11670 MachinePointerInfo(SV));
11671}
11672
11673SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
11674 SelectionDAG &DAG) const {
11675 // The layout of the va_list struct is specified in the AArch64 Procedure Call
11676 // Standard, section B.3.
11677 MachineFunction &MF = DAG.getMachineFunction();
11678 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
11679 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
11680 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
11681 auto PtrVT = getPointerTy(DAG.getDataLayout());
11682 SDLoc DL(Op);
11683
11684 SDValue Chain = Op.getOperand(0);
11685 SDValue VAList = Op.getOperand(1);
11686 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11687 SmallVector<SDValue, 4> MemOps;
11688
11689 // void *__stack at offset 0
11690 unsigned Offset = 0;
11691 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
11692 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
11693 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
11694 MachinePointerInfo(SV), Align(PtrSize)));
11695
11696 // void *__gr_top at offset 8 (4 on ILP32)
11697 Offset += PtrSize;
11698 int GPRSize = FuncInfo->getVarArgsGPRSize();
11699 if (GPRSize > 0) {
11700 SDValue GRTop, GRTopAddr;
11701
11702 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11703 DAG.getConstant(Offset, DL, PtrVT));
11704
11705 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
11706 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
11707 DAG.getSignedConstant(GPRSize, DL, PtrVT));
11708 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
11709
11710 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
11711 MachinePointerInfo(SV, Offset),
11712 Align(PtrSize)));
11713 }
11714
11715 // void *__vr_top at offset 16 (8 on ILP32)
11716 Offset += PtrSize;
11717 int FPRSize = FuncInfo->getVarArgsFPRSize();
11718 if (FPRSize > 0) {
11719 SDValue VRTop, VRTopAddr;
11720 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11721 DAG.getConstant(Offset, DL, PtrVT));
11722
11723 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
11724 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
11725 DAG.getSignedConstant(FPRSize, DL, PtrVT));
11726 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
11727
11728 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
11729 MachinePointerInfo(SV, Offset),
11730 Align(PtrSize)));
11731 }
11732
11733 // int __gr_offs at offset 24 (12 on ILP32)
11734 Offset += PtrSize;
11735 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11736 DAG.getConstant(Offset, DL, PtrVT));
11737 MemOps.push_back(
11738 DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
11739 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
11740
11741 // int __vr_offs at offset 28 (16 on ILP32)
11742 Offset += 4;
11743 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11744 DAG.getConstant(Offset, DL, PtrVT));
11745 MemOps.push_back(
11746 DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
11747 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
11748
11749 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
11750}
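// For reference, the stores above populate a va_list with the AAPCS64 B.3
// layout (field names are illustrative):
//   struct va_list {
//     void *__stack;    // offset 0
//     void *__gr_top;   // offset 8  (4 on ILP32)
//     void *__vr_top;   // offset 16 (8 on ILP32)
//     int   __gr_offs;  // offset 24 (12 on ILP32)
//     int   __vr_offs;  // offset 28 (16 on ILP32)
//   };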
11751
11752SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
11753 SelectionDAG &DAG) const {
11754 MachineFunction &MF = DAG.getMachineFunction();
11755 Function &F = MF.getFunction();
11756
11757 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
11758 return LowerWin64_VASTART(Op, DAG);
11759 else if (Subtarget->isTargetDarwin())
11760 return LowerDarwin_VASTART(Op, DAG);
11761 else
11762 return LowerAAPCS_VASTART(Op, DAG);
11763}
11764
11765SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
11766 SelectionDAG &DAG) const {
11767 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has a single
11768 // pointer.
11769 SDLoc DL(Op);
11770 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
11771 unsigned VaListSize =
11772 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
11773 ? PtrSize
11774 : Subtarget->isTargetILP32() ? 20 : 32;
11775 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
11776 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
11777
11778 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
11779 DAG.getConstant(VaListSize, DL, MVT::i32),
11780 Align(PtrSize), false, false, /*CI=*/nullptr,
11781 std::nullopt, MachinePointerInfo(DestSV),
11782 MachinePointerInfo(SrcSV));
11783}
11784
11785SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
11786 assert(Subtarget->isTargetDarwin() &&
11787 "automatic va_arg instruction only works on Darwin");
11788
11789 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11790 EVT VT = Op.getValueType();
11791 SDLoc DL(Op);
11792 SDValue Chain = Op.getOperand(0);
11793 SDValue Addr = Op.getOperand(1);
11794 MaybeAlign Align(Op.getConstantOperandVal(3));
11795 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
11796 auto PtrVT = getPointerTy(DAG.getDataLayout());
11797 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
11798 SDValue VAList =
11799 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
11800 Chain = VAList.getValue(1);
11801 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
11802
11803 if (VT.isScalableVector())
11804 report_fatal_error("Passing SVE types to variadic functions is "
11805 "currently not supported");
11806
11807 if (Align && *Align > MinSlotSize) {
11808 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11809 DAG.getConstant(Align->value() - 1, DL, PtrVT));
11810 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
11811 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
11812 }
11813
11814 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
11815 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
11816
11817 // Scalar integer and FP values smaller than 64 bits are implicitly extended
11818 // up to 64 bits. At the very least, we have to increase the striding of the
11819 // vaargs list to match this, and for FP values we need to introduce
11820 // FP_ROUND nodes as well.
11821 if (VT.isInteger() && !VT.isVector())
11822 ArgSize = std::max(ArgSize, MinSlotSize);
11823 bool NeedFPTrunc = false;
11824 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
11825 ArgSize = 8;
11826 NeedFPTrunc = true;
11827 }
11828
11829 // Increment the pointer, VAList, to the next vaarg
11830 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11831 DAG.getConstant(ArgSize, DL, PtrVT));
11832 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
11833
11834 // Store the incremented VAList to the legalized pointer
11835 SDValue APStore =
11836 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
11837
11838 // Load the actual argument out of the pointer VAList
11839 if (NeedFPTrunc) {
11840 // Load the value as an f64.
11841 SDValue WideFP =
11842 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
11843 // Round the value down to an f32.
11844 SDValue NarrowFP =
11845 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
11846 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
11847 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
11848 // Merge the rounded value with the chain output of the load.
11849 return DAG.getMergeValues(Ops, DL);
11850 }
11851
11852 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
11853}
11854
11855SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
11856 SelectionDAG &DAG) const {
11857 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
11858 MFI.setFrameAddressIsTaken(true);
11859
11860 EVT VT = Op.getValueType();
11861 SDLoc DL(Op);
11862 unsigned Depth = Op.getConstantOperandVal(0);
11863 SDValue FrameAddr =
11864 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
11865 while (Depth--)
11866 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
11867 MachinePointerInfo());
11868
11869 if (Subtarget->isTargetILP32())
11870 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
11871 DAG.getValueType(VT));
11872
11873 return FrameAddr;
11874}
11875
11876SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
11877 SelectionDAG &DAG) const {
11878 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
11879
11880 EVT VT = getPointerTy(DAG.getDataLayout());
11881 SDLoc DL(Op);
11882 int FI = MFI.CreateFixedObject(4, 0, false);
11883 return DAG.getFrameIndex(FI, VT);
11884}
11885
11886#define GET_REGISTER_MATCHER
11887#include "AArch64GenAsmMatcher.inc"
11888
11889// FIXME? Maybe this could be a TableGen attribute on some registers and
11890// this table could be generated automatically from RegInfo.
11891Register AArch64TargetLowering::
11892getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
11893 Register Reg = MatchRegisterName(RegName);
11894 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
11895 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
11896 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
11897 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
11898 !MRI->isReservedReg(MF, Reg))
11899 Reg = 0;
11900 }
11901 if (Reg)
11902 return Reg;
11903 report_fatal_error(Twine("Invalid register name \""
11904 + StringRef(RegName) + "\"."));
11905}
11906
11907SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
11908 SelectionDAG &DAG) const {
11910
11911 EVT VT = Op.getValueType();
11912 SDLoc DL(Op);
11913
11914 SDValue FrameAddr =
11915 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
11916 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
11917
11918 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
11919}
11920
11921SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
11922 SelectionDAG &DAG) const {
11924 MachineFrameInfo &MFI = MF.getFrameInfo();
11925 MFI.setReturnAddressIsTaken(true);
11926
11927 EVT VT = Op.getValueType();
11928 SDLoc DL(Op);
11929 unsigned Depth = Op.getConstantOperandVal(0);
11930 SDValue ReturnAddress;
11931 if (Depth) {
11932 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
11933 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
11934 ReturnAddress = DAG.getLoad(
11935 VT, DL, DAG.getEntryNode(),
11936 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
11937 } else {
11938 // Return LR, which contains the return address. Mark it an implicit
11939 // live-in.
11940 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
11941 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
11942 }
11943
11944 // The XPACLRI instruction assembles to a hint-space instruction before
11945 // Armv8.3-A, therefore it can be safely used on any pre-Armv8.3-A
11946 // architecture. On Armv8.3-A and onwards XPACI is available so use
11947 // that instead.
11948 SDNode *St;
11949 if (Subtarget->hasPAuth()) {
11950 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
11951 } else {
11952 // XPACLRI operates on LR therefore we must move the operand accordingly.
11953 SDValue Chain =
11954 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
11955 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
11956 }
11957 return SDValue(St, 0);
11958}
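// Illustrative sketch: with the return address in x0, the authentication bits
// are stripped either with "xpaci x0" (Armv8.3-A and later) or, via the
// hint-space form, by round-tripping through LR:
//   mov x30, x0 ; xpaclri ; mov x0, x30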
11959
11960/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which returns two
11961/// i64 values and takes a 2 x i64 value to shift plus a shift amount.
11962SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
11963 SelectionDAG &DAG) const {
11964 SDValue Lo, Hi;
11965 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
11966 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
11967}
11968
11969 bool AArch64TargetLowering::isOffsetFoldingLegal(
11970 const GlobalAddressSDNode *GA) const {
11971 // Offsets are folded in the DAG combine rather than here so that we can
11972 // intelligently choose an offset based on the uses.
11973 return false;
11974}
11975
11976 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
11977 bool OptForSize) const {
11978 bool IsLegal = false;
11979 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
11980 // 16-bit case when target has full fp16 support.
11981 // We encode bf16 bit patterns as if they were fp16. This results in very
11982 // strange looking assembly but should populate the register with appropriate
11983 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
11984 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
11985 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
11986 // FIXME: We should be able to handle f128 as well with a clever lowering.
11987 const APInt ImmInt = Imm.bitcastToAPInt();
11988 if (VT == MVT::f64)
11989 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
11990 else if (VT == MVT::f32)
11991 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
11992 else if (VT == MVT::f16 || VT == MVT::bf16)
11993 IsLegal =
11994 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
11995 Imm.isPosZero();
11996
11997 // If we can not materialize in immediate field for fmov, check if the
11998 // value can be encoded as the immediate operand of a logical instruction.
11999 // The immediate value will be created with either MOVZ, MOVN, or ORR.
12000 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
12001 // generate that fmov.
12002 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
12003 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
12004 // however the mov+fmov sequence is always better because of the reduced
12005 // cache pressure. The timings are still the same if you consider
12006 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
12007 // movw+movk is fused). So we limit the expansion to at most 2 instructions.
12008 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
12009 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
12010
12011 "Should be able to build any value with at most 4 moves");
12012 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
12013 IsLegal = Insn.size() <= Limit;
12014 }
12015
12016 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
12017 << " imm value: "; Imm.dump(););
12018 return IsLegal;
12019}
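// Illustrative sketch (assumed constants): values covered by the 8-bit FMOV
// encoding such as 1.0, 0.5 or 31.0 stay as a single "fmov d0, #1.0", while a
// value like 0.1 is materialized in a GPR and transferred, e.g.
//   mov x8, #<low16> ; movk x8, #<...>, lsl #16 ; ... ; fmov d0, x8
// rather than being loaded from a constant pool.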
12020
12021//===----------------------------------------------------------------------===//
12022// AArch64 Optimization Hooks
12023//===----------------------------------------------------------------------===//
12024
12025static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
12026 SDValue Operand, SelectionDAG &DAG,
12027 int &ExtraSteps) {
12028 EVT VT = Operand.getValueType();
12029 if ((ST->hasNEON() &&
12030 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
12031 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
12032 VT == MVT::v4f32)) ||
12033 (ST->hasSVE() &&
12034 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
12035 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
12036 // For the reciprocal estimates, convergence is quadratic, so the number
12037 // of digits is doubled after each iteration. In ARMv8, the accuracy of
12038 // the initial estimate is 2^-8. Thus the number of extra steps to refine
12039 // the result for float (23 mantissa bits) is 2 and for double (52
12040 // mantissa bits) is 3.
12041 constexpr unsigned AccurateBits = 8;
12042 unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
12043 ExtraSteps = DesiredBits <= AccurateBits
12044 ? 0
12045 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
12046 }
12047
12048 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
12049 }
12050
12051 return SDValue();
12052}
12053
12054SDValue
12055AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
12056 const DenormalMode &Mode) const {
12057 SDLoc DL(Op);
12058 EVT VT = Op.getValueType();
12059 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
12060 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
12061 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
12062}
12063
12064SDValue
12065AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
12066 SelectionDAG &DAG) const {
12067 return Op;
12068}
12069
12070SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
12071 SelectionDAG &DAG, int Enabled,
12072 int &ExtraSteps,
12073 bool &UseOneConst,
12074 bool Reciprocal) const {
12075 if (Enabled == ReciprocalEstimate::Enabled ||
12076 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
12077 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
12078 DAG, ExtraSteps)) {
12079 SDLoc DL(Operand);
12080 EVT VT = Operand.getValueType();
12081
12083
12084 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
12085 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
12086 for (int i = ExtraSteps; i > 0; --i) {
12087 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
12088 Flags);
12089 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
12090 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12091 }
12092 if (!Reciprocal)
12093 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
12094
12095 ExtraSteps = 0;
12096 return Estimate;
12097 }
12098
12099 return SDValue();
12100}
12101
12102SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
12103 SelectionDAG &DAG, int Enabled,
12104 int &ExtraSteps) const {
12106 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
12107 DAG, ExtraSteps)) {
12108 SDLoc DL(Operand);
12109 EVT VT = Operand.getValueType();
12110
12112
12113 // Newton reciprocal iteration: E * (2 - X * E)
12114 // AArch64 reciprocal iteration instruction: (2 - M * N)
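// Each iteration below forms the correction (2 - X * E) with FRECPS and
// multiplies it into the running estimate with an FMUL.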
12115 for (int i = ExtraSteps; i > 0; --i) {
12116 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
12117 Estimate, Flags);
12118 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12119 }
12120
12121 ExtraSteps = 0;
12122 return Estimate;
12123 }
12124
12125 return SDValue();
12126}
12127
12128//===----------------------------------------------------------------------===//
12129// AArch64 Inline Assembly Support
12130//===----------------------------------------------------------------------===//
12131
12132// Table of Constraints
12133 // TODO: This is the current set of constraints supported by ARM for the
12134 // compiler; not all of them may make sense.
12135//
12136// r - A general register
12137// w - An FP/SIMD register of some size in the range v0-v31
12138// x - An FP/SIMD register of some size in the range v0-v15
12139// I - Constant that can be used with an ADD instruction
12140// J - Constant that can be used with a SUB instruction
12141// K - Constant that can be used with a 32-bit logical instruction
12142// L - Constant that can be used with a 64-bit logical instruction
12143// M - Constant that can be used as a 32-bit MOV immediate
12144// N - Constant that can be used as a 64-bit MOV immediate
12145// Q - A memory reference with base register and no offset
12146// S - A symbolic address
12147// Y - Floating point constant zero
12148// Z - Integer constant zero
12149//
12150// Note that general register operands will be output using their 64-bit x
12151// register name, whatever the size of the variable, unless the asm operand
12152// is prefixed by the %w modifier. Floating-point and SIMD register operands
12153// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
12154// %q modifier.
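//
// For example (illustrative only):
//   asm("add %w0, %w1, %w2" : "=r"(Res) : "r"(A), "r"(B));
// uses the %w modifier to print the 32-bit w-register names for the "r"
// operands; without it the 64-bit x-register names would be printed.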
12155const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
12156 // At this point, we have to lower this constraint to something else, so we
12157 // lower it to an "r" or "w". However, by doing this we will force the result
12158 // to be in a register, while the X constraint is much more permissive.
12159 //
12160 // Although we are correct (we are free to emit anything, without
12161 // constraints), we might break use cases that would expect us to be more
12162 // efficient and emit something else.
12163 if (!Subtarget->hasFPARMv8())
12164 return "r";
12165
12166 if (ConstraintVT.isFloatingPoint())
12167 return "w";
12168
12169 if (ConstraintVT.isVector() &&
12170 (ConstraintVT.getSizeInBits() == 64 ||
12171 ConstraintVT.getSizeInBits() == 128))
12172 return "w";
12173
12174 return "r";
12175}
12176
12178
12179// Returns a {Reg, RegisterClass} tuple if the constraint is
12180// a specific predicate register.
12181//
12182// For some constraint like "{pn3}" the default path in
12183// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
12184// suitable register class for this register is "PPRorPNR", after which it
12185// determines that nxv16i1 is an appropriate type for the constraint, which is
12186// not what we want. The code here pre-empts this by matching the register
12187// explicitly.
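//
// For example, "{p0}" yields {AArch64::P0, &AArch64::PPRRegClass} and
// "{pn3}" yields {AArch64::PN3, &AArch64::PNRRegClass}.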
12188static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
12190 if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
12191 Constraint[1] != 'p')
12192 return std::nullopt;
12193
12194 Constraint = Constraint.substr(2, Constraint.size() - 3);
12195 bool IsPredicateAsCount = Constraint.starts_with("n");
12196 if (IsPredicateAsCount)
12197 Constraint = Constraint.drop_front(1);
12198
12199 unsigned V;
12200 if (Constraint.getAsInteger(10, V) || V > 31)
12201 return std::nullopt;
12202
12203 if (IsPredicateAsCount)
12204 return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
12205 else
12206 return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
12207}
12208
12209static std::optional<PredicateConstraint>
12212 .Case("Uph", PredicateConstraint::Uph)
12213 .Case("Upl", PredicateConstraint::Upl)
12214 .Case("Upa", PredicateConstraint::Upa)
12215 .Default(std::nullopt);
12216}
12217
12218static const TargetRegisterClass *
12220 if (VT != MVT::aarch64svcount &&
12221 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
12222 return nullptr;
12223
12224 switch (Constraint) {
12225 case PredicateConstraint::Uph:
12226 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
12227 : &AArch64::PPR_p8to15RegClass;
12228 case PredicateConstraint::Upl:
12229 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
12230 : &AArch64::PPR_3bRegClass;
12231 case PredicateConstraint::Upa:
12232 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
12233 : &AArch64::PPRRegClass;
12234 }
12235
12236 llvm_unreachable("Missing PredicateConstraint!");
12237}
12238
12240
12241static std::optional<ReducedGprConstraint>
12244 .Case("Uci", ReducedGprConstraint::Uci)
12245 .Case("Ucj", ReducedGprConstraint::Ucj)
12246 .Default(std::nullopt);
12247}
12248
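// Uci constrains the operand to w8-w11 (MatrixIndexGPR32_8_11) and Ucj to
// w12-w15 (MatrixIndexGPR32_12_15).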
12249static const TargetRegisterClass *
12251 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
12252 return nullptr;
12253
12254 switch (Constraint) {
12255 case ReducedGprConstraint::Uci:
12256 return &AArch64::MatrixIndexGPR32_8_11RegClass;
12257 case ReducedGprConstraint::Ucj:
12258 return &AArch64::MatrixIndexGPR32_12_15RegClass;
12259 }
12260
12261 llvm_unreachable("Missing ReducedGprConstraint!");
12262}
12263
12264 // The set of cc codes supported is from
12265// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
12268 .Case("{@cchi}", AArch64CC::HI)
12269 .Case("{@cccs}", AArch64CC::HS)
12270 .Case("{@cclo}", AArch64CC::LO)
12271 .Case("{@ccls}", AArch64CC::LS)
12272 .Case("{@cccc}", AArch64CC::LO)
12273 .Case("{@cceq}", AArch64CC::EQ)
12274 .Case("{@ccgt}", AArch64CC::GT)
12275 .Case("{@ccge}", AArch64CC::GE)
12276 .Case("{@cclt}", AArch64CC::LT)
12277 .Case("{@ccle}", AArch64CC::LE)
12278 .Case("{@cchs}", AArch64CC::HS)
12279 .Case("{@ccne}", AArch64CC::NE)
12280 .Case("{@ccvc}", AArch64CC::VC)
12281 .Case("{@ccpl}", AArch64CC::PL)
12282 .Case("{@ccvs}", AArch64CC::VS)
12283 .Case("{@ccmi}", AArch64CC::MI)
12285 return Cond;
12286}
12287
12288/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
12289/// WZR, invert(<cond>)'.
12291 SelectionDAG &DAG) {
12292 return DAG.getNode(
12293 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
12294 DAG.getConstant(0, DL, MVT::i32),
12295 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
12296}
12297
12298// Lower @cc flag output via getSETCC.
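// For example (illustrative only), with
//   asm("cmp %w1, %w2" : "=@cceq"(Res) : "r"(A), "r"(B));
// the "=@cceq" output is produced by reading NZCV after the asm node and
// materialising the EQ condition with a CSET (a CSINC of the inverted
// condition, see getSETCC above).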
12299SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
12300 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
12301 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
12302 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
12303 if (Cond == AArch64CC::Invalid)
12304 return SDValue();
12305 // The output variable should be a scalar integer.
12306 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
12307 OpInfo.ConstraintVT.getSizeInBits() < 8)
12308 report_fatal_error("Flag output operand is of invalid type");
12309
12310 // Get NZCV register. Only update chain when copyfrom is glued.
12311 if (Glue.getNode()) {
12312 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
12313 Chain = Glue.getValue(1);
12314 } else
12315 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
12316 // Extract CC code.
12317 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
12318
12320
12321 // Truncate or ZERO_EXTEND based on value types.
12322 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
12323 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
12324 else
12325 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
12326
12327 return Result;
12328}
12329
12330/// getConstraintType - Given a constraint letter, return the type of
12331/// constraint it is for this target.
12333AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
12334 if (Constraint.size() == 1) {
12335 switch (Constraint[0]) {
12336 default:
12337 break;
12338 case 'x':
12339 case 'w':
12340 case 'y':
12341 return C_RegisterClass;
12342 // An address with a single base register. Due to the way we
12343 // currently handle addresses it is the same as 'r'.
12344 case 'Q':
12345 return C_Memory;
12346 case 'I':
12347 case 'J':
12348 case 'K':
12349 case 'L':
12350 case 'M':
12351 case 'N':
12352 case 'Y':
12353 case 'Z':
12354 return C_Immediate;
12355 case 'z':
12356 case 'S': // A symbol or label reference with a constant offset
12357 return C_Other;
12358 }
12359 } else if (parsePredicateConstraint(Constraint))
12360 return C_RegisterClass;
12361 else if (parseReducedGprConstraint(Constraint))
12362 return C_RegisterClass;
12363 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
12364 return C_Other;
12365 return TargetLowering::getConstraintType(Constraint);
12366}
12367
12368/// Examine constraint type and operand type and determine a weight value.
12369/// This object must already have been set up with the operand type
12370/// and the current alternative constraint selected.
12372AArch64TargetLowering::getSingleConstraintMatchWeight(
12373 AsmOperandInfo &info, const char *constraint) const {
12375 Value *CallOperandVal = info.CallOperandVal;
12376 // If we don't have a value, we can't do a match,
12377 // but allow it at the lowest weight.
12378 if (!CallOperandVal)
12379 return CW_Default;
12380 Type *type = CallOperandVal->getType();
12381 // Look at the constraint type.
12382 switch (*constraint) {
12383 default:
12385 break;
12386 case 'x':
12387 case 'w':
12388 case 'y':
12389 if (type->isFloatingPointTy() || type->isVectorTy())
12390 weight = CW_Register;
12391 break;
12392 case 'z':
12393 weight = CW_Constant;
12394 break;
12395 case 'U':
12396 if (parsePredicateConstraint(constraint) ||
12397 parseReducedGprConstraint(constraint))
12398 weight = CW_Register;
12399 break;
12400 }
12401 return weight;
12402}
12403
12404std::pair<unsigned, const TargetRegisterClass *>
12405AArch64TargetLowering::getRegForInlineAsmConstraint(
12406 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
12407 if (Constraint.size() == 1) {
12408 switch (Constraint[0]) {
12409 case 'r':
12410 if (VT.isScalableVector())
12411 return std::make_pair(0U, nullptr);
12412 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
12413 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
12414 if (VT.getFixedSizeInBits() == 64)
12415 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
12416 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
12417 case 'w': {
12418 if (!Subtarget->hasFPARMv8())
12419 break;
12420 if (VT.isScalableVector()) {
12421 if (VT.getVectorElementType() != MVT::i1)
12422 return std::make_pair(0U, &AArch64::ZPRRegClass);
12423 return std::make_pair(0U, nullptr);
12424 }
12425 if (VT == MVT::Other)
12426 break;
12427 uint64_t VTSize = VT.getFixedSizeInBits();
12428 if (VTSize == 16)
12429 return std::make_pair(0U, &AArch64::FPR16RegClass);
12430 if (VTSize == 32)
12431 return std::make_pair(0U, &AArch64::FPR32RegClass);
12432 if (VTSize == 64)
12433 return std::make_pair(0U, &AArch64::FPR64RegClass);
12434 if (VTSize == 128)
12435 return std::make_pair(0U, &AArch64::FPR128RegClass);
12436 break;
12437 }
12438 // The instructions that this constraint is designed for can
12439 // only take 128-bit registers so just use that regclass.
12440 case 'x':
12441 if (!Subtarget->hasFPARMv8())
12442 break;
12443 if (VT.isScalableVector())
12444 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
12445 if (VT.getSizeInBits() == 128)
12446 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
12447 break;
12448 case 'y':
12449 if (!Subtarget->hasFPARMv8())
12450 break;
12451 if (VT.isScalableVector())
12452 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
12453 break;
12454 }
12455 } else {
12456 if (const auto P = parsePredicateRegAsConstraint(Constraint))
12457 return *P;
12458 if (const auto PC = parsePredicateConstraint(Constraint))
12459 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
12460 return std::make_pair(0U, RegClass);
12461
12462 if (const auto RGC = parseReducedGprConstraint(Constraint))
12463 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
12464 return std::make_pair(0U, RegClass);
12465 }
12466 if (StringRef("{cc}").equals_insensitive(Constraint) ||
12468 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
12469
12470 if (Constraint == "{za}") {
12471 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
12472 }
12473
12474 if (Constraint == "{zt0}") {
12475 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
12476 }
12477
12478 // Use the default implementation in TargetLowering to convert the register
12479 // constraint into a member of a register class.
12480 std::pair<unsigned, const TargetRegisterClass *> Res;
12482
12483 // Not found as a standard register?
12484 if (!Res.second) {
12485 unsigned Size = Constraint.size();
12486 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
12487 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
12488 int RegNo;
12489 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
12490 if (!Failed && RegNo >= 0 && RegNo <= 31) {
12491 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
12492 // By default we'll emit v0-v31 for this unless there's a modifier where
12493 // we'll emit the correct register as well.
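// For example, "{v7}" with a 64-bit value type selects d7 (FPR64);
// otherwise q7 (FPR128) is used.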
12494 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
12495 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
12496 Res.second = &AArch64::FPR64RegClass;
12497 } else {
12498 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
12499 Res.second = &AArch64::FPR128RegClass;
12500 }
12501 }
12502 }
12503 }
12504
12505 if (Res.second && !Subtarget->hasFPARMv8() &&
12506 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
12507 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
12508 return std::make_pair(0U, nullptr);
12509
12510 return Res;
12511}
12512
12514 llvm::Type *Ty,
12515 bool AllowUnknown) const {
12516 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
12517 return EVT(MVT::i64x8);
12518
12519 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
12520}
12521
12522/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
12523/// vector. If it is invalid, don't add anything to Ops.
12524void AArch64TargetLowering::LowerAsmOperandForConstraint(
12525 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
12526 SelectionDAG &DAG) const {
12527 SDValue Result;
12528
12529 // Currently only support length 1 constraints.
12530 if (Constraint.size() != 1)
12531 return;
12532
12533 char ConstraintLetter = Constraint[0];
12534 switch (ConstraintLetter) {
12535 default:
12536 break;
12537
12538 // This set of constraints deals with valid constants for various instructions.
12539 // Validate and return a target constant for them if we can.
12540 case 'z': {
12541 // 'z' maps to xzr or wzr so it needs an input of 0.
12542 if (!isNullConstant(Op))
12543 return;
12544
12545 if (Op.getValueType() == MVT::i64)
12546 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
12547 else
12548 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
12549 break;
12550 }
12551 case 'S':
12552 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
12553 // supported for PIC while "s" isn't, making "s" less useful. We implement
12554 // "S" but not "s".
12556 break;
12557
12558 case 'I':
12559 case 'J':
12560 case 'K':
12561 case 'L':
12562 case 'M':
12563 case 'N':
12564 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
12565 if (!C)
12566 return;
12567
12568 // Grab the value and do some validation.
12569 uint64_t CVal = C->getZExtValue();
12570 switch (ConstraintLetter) {
12571 // The I constraint applies only to simple ADD or SUB immediate operands:
12572 // i.e. 0 to 4095 with optional shift by 12
12573 // The J constraint applies only to ADD or SUB immediates that would be
12574 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
12575 // instruction [or vice versa], in other words -1 to -4095 with optional
12576 // left shift by 12.
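// For example, 0, 4095 and (4095 << 12) satisfy 'I', while -17 and
// -(4095 << 12) satisfy 'J'.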
12577 case 'I':
12578 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
12579 break;
12580 return;
12581 case 'J': {
12582 uint64_t NVal = -C->getSExtValue();
12583 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
12584 CVal = C->getSExtValue();
12585 break;
12586 }
12587 return;
12588 }
12589 // The K and L constraints apply *only* to logical immediates, including
12590 // what used to be the MOVI alias for ORR (though the MOVI alias has now
12591 // been removed and MOV should be used). So these constraints have to
12592 // distinguish between bit patterns that are valid 32-bit or 64-bit
12593 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
12594 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
12595 // versa.
12596 case 'K':
12597 if (AArch64_AM::isLogicalImmediate(CVal, 32))
12598 break;
12599 return;
12600 case 'L':
12601 if (AArch64_AM::isLogicalImmediate(CVal, 64))
12602 break;
12603 return;
12604 // The M and N constraints are a superset of K and L respectively, for use
12605 // with the MOV (immediate) alias. As well as the logical immediates they
12606 // also match 32 or 64-bit immediates that can be loaded either using a
12607 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
12608 // (M) or 64-bit 0x1234000000000000 (N) etc.
12609 // As a note some of this code is liberally stolen from the asm parser.
12610 case 'M': {
12611 if (!isUInt<32>(CVal))
12612 return;
12613 if (AArch64_AM::isLogicalImmediate(CVal, 32))
12614 break;
12615 if ((CVal & 0xFFFF) == CVal)
12616 break;
12617 if ((CVal & 0xFFFF0000ULL) == CVal)
12618 break;
12619 uint64_t NCVal = ~(uint32_t)CVal;
12620 if ((NCVal & 0xFFFFULL) == NCVal)
12621 break;
12622 if ((NCVal & 0xFFFF0000ULL) == NCVal)
12623 break;
12624 return;
12625 }
12626 case 'N': {
12627 if (AArch64_AM::isLogicalImmediate(CVal, 64))
12628 break;
12629 if ((CVal & 0xFFFFULL) == CVal)
12630 break;
12631 if ((CVal & 0xFFFF0000ULL) == CVal)
12632 break;
12633 if ((CVal & 0xFFFF00000000ULL) == CVal)
12634 break;
12635 if ((CVal & 0xFFFF000000000000ULL) == CVal)
12636 break;
12637 uint64_t NCVal = ~CVal;
12638 if ((NCVal & 0xFFFFULL) == NCVal)
12639 break;
12640 if ((NCVal & 0xFFFF0000ULL) == NCVal)
12641 break;
12642 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
12643 break;
12644 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
12645 break;
12646 return;
12647 }
12648 default:
12649 return;
12650 }
12651
12652 // All assembler immediates are 64-bit integers.
12653 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
12654 break;
12655 }
12656
12657 if (Result.getNode()) {
12658 Ops.push_back(Result);
12659 return;
12660 }
12661
12662 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
12663}
12664
12665//===----------------------------------------------------------------------===//
12666// AArch64 Advanced SIMD Support
12667//===----------------------------------------------------------------------===//
12668
12669/// WidenVector - Given a value in the V64 register class, produce the
12670/// equivalent value in the V128 register class.
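/// For example, a v2f32 value in a D register becomes the low half of a
/// v4f32 Q register, with the high lanes undef.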
12672 EVT VT = V64Reg.getValueType();
12673 unsigned NarrowSize = VT.getVectorNumElements();
12674 MVT EltTy = VT.getVectorElementType().getSimpleVT();
12675 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
12676 SDLoc DL(V64Reg);
12677
12678 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
12679 V64Reg, DAG.getConstant(0, DL, MVT::i64));
12680}
12681
12682/// getExtFactor - Determine the adjustment factor for the position when
12683/// generating an "extract from vector registers" instruction.
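/// E.g. 1 for i8 elements, 2 for i16/f16, 4 for i32/f32 and 8 for i64/f64.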
12684static unsigned getExtFactor(SDValue &V) {
12685 EVT EltType = V.getValueType().getVectorElementType();
12686 return EltType.getSizeInBits() / 8;
12687}
12688
12689// Check if a vector is built from one vector via extracted elements of
12690// another together with an AND mask, ensuring that all elements fit
12691// within range. This can be reconstructed using AND and NEON's TBL1.
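// That is, a BUILD_VECTOR whose element i is
// extract(Src, and(extract(Mask, i), Ci)) is rebuilt as
// tbl1(Src, and(Mask, build_vector(C0, C1, ...))).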
12693 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
12694 SDLoc dl(Op);
12695 EVT VT = Op.getValueType();
12696 assert(!VT.isScalableVector() &&
12697 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
12698
12699 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
12700 // directly to TBL1.
12701 if (VT != MVT::v16i8 && VT != MVT::v8i8)
12702 return SDValue();
12703
12704 unsigned NumElts = VT.getVectorNumElements();
12705 assert((NumElts == 8 || NumElts == 16) &&
12706 "Need to have exactly 8 or 16 elements in vector.");
12707
12708 SDValue SourceVec;
12709 SDValue MaskSourceVec;
12710 SmallVector<SDValue, 16> AndMaskConstants;
12711
12712 for (unsigned i = 0; i < NumElts; ++i) {
12713 SDValue V = Op.getOperand(i);
12714 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12715 return SDValue();
12716
12717 SDValue OperandSourceVec = V.getOperand(0);
12718 if (!SourceVec)
12719 SourceVec = OperandSourceVec;
12720 else if (SourceVec != OperandSourceVec)
12721 return SDValue();
12722
12723 // This only looks at shuffles with elements that are
12724 // a) truncated by a constant AND mask extracted from a mask vector, or
12725 // b) extracted directly from a mask vector.
12726 SDValue MaskSource = V.getOperand(1);
12727 if (MaskSource.getOpcode() == ISD::AND) {
12728 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
12729 return SDValue();
12730
12731 AndMaskConstants.push_back(MaskSource.getOperand(1));
12732 MaskSource = MaskSource->getOperand(0);
12733 } else if (!AndMaskConstants.empty()) {
12734 // Either all or no operands should have an AND mask.
12735 return SDValue();
12736 }
12737
12738 // An ANY_EXTEND may be inserted between the AND and the source vector
12739 // extraction. We don't care about that, so we can just skip it.
12740 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
12741 MaskSource = MaskSource.getOperand(0);
12742
12743 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12744 return SDValue();
12745
12746 SDValue MaskIdx = MaskSource.getOperand(1);
12747 if (!isa<ConstantSDNode>(MaskIdx) ||
12748 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
12749 return SDValue();
12750
12751 // We only apply this if all elements come from the same vector with the
12752 // same vector type.
12753 if (!MaskSourceVec) {
12754 MaskSourceVec = MaskSource->getOperand(0);
12755 if (MaskSourceVec.getValueType() != VT)
12756 return SDValue();
12757 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
12758 return SDValue();
12759 }
12760 }
12761
12762 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
12763 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
12764 // insert, we know that the index in the mask must be smaller than the number
12765 // of elements in the source, or we would have an out-of-bounds access.
12766 if (NumElts == 8)
12767 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
12768 DAG.getUNDEF(VT));
12769
12770 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
12771 if (!AndMaskConstants.empty())
12772 MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
12773 DAG.getBuildVector(VT, dl, AndMaskConstants));
12774
12775 return DAG.getNode(
12777 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
12778 MaskSourceVec);
12779}
12780
12781// Gather data to see if the operation can be modelled as a
12782// shuffle in combination with VEXTs.
12784 SelectionDAG &DAG) const {
12785 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
12786 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
12787 SDLoc dl(Op);
12788 EVT VT = Op.getValueType();
12789 assert(!VT.isScalableVector() &&
12790 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
12791 unsigned NumElts = VT.getVectorNumElements();
12792
12793 struct ShuffleSourceInfo {
12794 SDValue Vec;
12795 unsigned MinElt;
12796 unsigned MaxElt;
12797
12798 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
12799 // be compatible with the shuffle we intend to construct. As a result
12800 // ShuffleVec will be some sliding window into the original Vec.
12801 SDValue ShuffleVec;
12802
12803 // Code should guarantee that element i in Vec starts at element "WindowBase
12804 // + i * WindowScale" in ShuffleVec.
12805 int WindowBase;
12806 int WindowScale;
12807
12808 ShuffleSourceInfo(SDValue Vec)
12809 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
12810 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
12811
12812 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
12813 };
12814
12815 // First gather all vectors used as an immediate source for this BUILD_VECTOR
12816 // node.
12818 for (unsigned i = 0; i < NumElts; ++i) {
12819 SDValue V = Op.getOperand(i);
12820 if (V.isUndef())
12821 continue;
12822 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12823 !isa<ConstantSDNode>(V.getOperand(1)) ||
12824 V.getOperand(0).getValueType().isScalableVector()) {
12825 LLVM_DEBUG(
12826 dbgs() << "Reshuffle failed: "
12827 "a shuffle can only come from building a vector from "
12828 "various elements of other fixed-width vectors, provided "
12829 "their indices are constant\n");
12830 return SDValue();
12831 }
12832
12833 // Add this element source to the list if it's not already there.
12834 SDValue SourceVec = V.getOperand(0);
12835 auto Source = find(Sources, SourceVec);
12836 if (Source == Sources.end())
12837 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
12838
12839 // Update the minimum and maximum lane number seen.
12840 unsigned EltNo = V.getConstantOperandVal(1);
12841 Source->MinElt = std::min(Source->MinElt, EltNo);
12842 Source->MaxElt = std::max(Source->MaxElt, EltNo);
12843 }
12844
12845 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
12846 // better than moving to/from gpr registers for larger vectors.
12847 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
12848 // Construct a mask for the tbl. We may need to adjust the index for types
12849 // larger than i8.
12851 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
12852 for (unsigned I = 0; I < NumElts; ++I) {
12853 SDValue V = Op.getOperand(I);
12854 if (V.isUndef()) {
12855 for (unsigned OF = 0; OF < OutputFactor; OF++)
12856 Mask.push_back(-1);
12857 continue;
12858 }
12859 // Set the Mask lanes adjusted for the size of the input and output
12860 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
12861 // output element, adjusted in their positions per input and output types.
12862 unsigned Lane = V.getConstantOperandVal(1);
12863 for (unsigned S = 0; S < Sources.size(); S++) {
12864 if (V.getOperand(0) == Sources[S].Vec) {
12865 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
12866 unsigned InputBase = 16 * S + Lane * InputSize / 8;
12867 for (unsigned OF = 0; OF < OutputFactor; OF++)
12868 Mask.push_back(InputBase + OF);
12869 break;
12870 }
12871 }
12872 }
12873
12874 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
12875 // v16i8, and the TBLMask
12876 SmallVector<SDValue, 16> TBLOperands;
12877 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
12878 ? Intrinsic::aarch64_neon_tbl3
12879 : Intrinsic::aarch64_neon_tbl4,
12880 dl, MVT::i32));
12881 for (unsigned i = 0; i < Sources.size(); i++) {
12882 SDValue Src = Sources[i].Vec;
12883 EVT SrcVT = Src.getValueType();
12884 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
12885 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
12886 "Expected a legally typed vector");
12887 if (SrcVT.is64BitVector())
12888 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
12889 DAG.getUNDEF(MVT::v8i8));
12890 TBLOperands.push_back(Src);
12891 }
12892
12894 for (unsigned i = 0; i < Mask.size(); i++)
12895 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
12896 assert((Mask.size() == 8 || Mask.size() == 16) &&
12897 "Expected a v8i8 or v16i8 Mask");
12898 TBLOperands.push_back(
12899 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
12900
12901 SDValue Shuffle =
12903 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
12904 return DAG.getBitcast(VT, Shuffle);
12905 }
12906
12907 if (Sources.size() > 2) {
12908 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
12909 << "sensible when at most two source vectors are "
12910 << "involved\n");
12911 return SDValue();
12912 }
12913
12914 // Find out the smallest element size among the result and the two sources,
12915 // and use it as the element size to build the shuffle_vector.
12916 EVT SmallestEltTy = VT.getVectorElementType();
12917 for (auto &Source : Sources) {
12918 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
12919 if (SrcEltTy.bitsLT(SmallestEltTy)) {
12920 SmallestEltTy = SrcEltTy;
12921 }
12922 }
12923 unsigned ResMultiplier =
12924 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
12925 uint64_t VTSize = VT.getFixedSizeInBits();
12926 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
12927 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
12928
12929 // If the source vector is too wide or too narrow, we may nevertheless be able
12930 // to construct a compatible shuffle either by concatenating it with UNDEF or
12931 // extracting a suitable range of elements.
12932 for (auto &Src : Sources) {
12933 EVT SrcVT = Src.ShuffleVec.getValueType();
12934
12935 TypeSize SrcVTSize = SrcVT.getSizeInBits();
12936 if (SrcVTSize == TypeSize::getFixed(VTSize))
12937 continue;
12938
12939 // This stage of the search produces a source with the same element type as
12940 // the original, but with a total width matching the BUILD_VECTOR output.
12941 EVT EltVT = SrcVT.getVectorElementType();
12942 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
12943 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
12944
12945 if (SrcVTSize.getFixedValue() < VTSize) {
12946 assert(2 * SrcVTSize == VTSize);
12947 // We can pad out the smaller vector for free, so if it's part of a
12948 // shuffle...
12949 Src.ShuffleVec =
12950 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
12951 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
12952 continue;
12953 }
12954
12955 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
12956 LLVM_DEBUG(
12957 dbgs() << "Reshuffle failed: result vector too small to extract\n");
12958 return SDValue();
12959 }
12960
12961 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
12962 LLVM_DEBUG(
12963 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
12964 return SDValue();
12965 }
12966
12967 if (Src.MinElt >= NumSrcElts) {
12968 // The extraction can just take the second half
12969 Src.ShuffleVec =
12970 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12971 DAG.getConstant(NumSrcElts, dl, MVT::i64));
12972 Src.WindowBase = -NumSrcElts;
12973 } else if (Src.MaxElt < NumSrcElts) {
12974 // The extraction can just take the first half
12975 Src.ShuffleVec =
12976 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12977 DAG.getConstant(0, dl, MVT::i64));
12978 } else {
12979 // An actual VEXT is needed
12980 SDValue VEXTSrc1 =
12981 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12982 DAG.getConstant(0, dl, MVT::i64));
12983 SDValue VEXTSrc2 =
12984 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12985 DAG.getConstant(NumSrcElts, dl, MVT::i64));
12986 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
12987
12988 if (!SrcVT.is64BitVector()) {
12989 LLVM_DEBUG(
12990 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
12991 "for SVE vectors.");
12992 return SDValue();
12993 }
12994
12995 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
12996 VEXTSrc2,
12997 DAG.getConstant(Imm, dl, MVT::i32));
12998 Src.WindowBase = -Src.MinElt;
12999 }
13000 }
13001
13002 // Another possible incompatibility occurs from the vector element types. We
13003 // can fix this by bitcasting the source vectors to the same type we intend
13004 // for the shuffle.
13005 for (auto &Src : Sources) {
13006 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
13007 if (SrcEltTy == SmallestEltTy)
13008 continue;
13009 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
13010 if (DAG.getDataLayout().isBigEndian()) {
13011 Src.ShuffleVec =
13012 DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
13013 } else {
13014 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
13015 }
13016 Src.WindowScale =
13017 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13018 Src.WindowBase *= Src.WindowScale;
13019 }
13020
13021 // Final check before we try to actually produce a shuffle.
13022 LLVM_DEBUG({
13023 for (auto Src : Sources)
13024 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
13025 });
13026
13027 // The stars all align, so our next step is to produce the mask for the shuffle.
13028 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
13029 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
13030 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
13031 SDValue Entry = Op.getOperand(i);
13032 if (Entry.isUndef())
13033 continue;
13034
13035 auto Src = find(Sources, Entry.getOperand(0));
13036 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
13037
13038 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
13039 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
13040 // segment.
13041 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
13042 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
13043 VT.getScalarSizeInBits());
13044 int LanesDefined = BitsDefined / BitsPerShuffleLane;
13045
13046 // This source is expected to fill ResMultiplier lanes of the final shuffle,
13047 // starting at the appropriate offset.
13048 int *LaneMask = &Mask[i * ResMultiplier];
13049
13050 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
13051 ExtractBase += NumElts * (Src - Sources.begin());
13052 for (int j = 0; j < LanesDefined; ++j)
13053 LaneMask[j] = ExtractBase + j;
13054 }
13055
13056 // Final check before we try to produce nonsense...
13057 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
13058 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
13059 return SDValue();
13060 }
13061
13062 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
13063 for (unsigned i = 0; i < Sources.size(); ++i)
13064 ShuffleOps[i] = Sources[i].ShuffleVec;
13065
13066 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
13067 ShuffleOps[1], Mask);
13068 SDValue V;
13069 if (DAG.getDataLayout().isBigEndian()) {
13070 V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
13071 } else {
13072 V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
13073 }
13074
13075 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
13076 dbgs() << "Reshuffle, creating node: "; V.dump(););
13077
13078 return V;
13079}
13080
13081 // Check if an EXT instruction can handle the shuffle mask when the
13082// vector sources of the shuffle are the same.
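// For example, for v8i8 the mask <3, 4, 5, 6, 7, 0, 1, 2> is a singleton
// EXT mask with Imm = 3.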
13083static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
13084 unsigned NumElts = VT.getVectorNumElements();
13085
13086 // Assume that the first shuffle index is not UNDEF. Fail if it is.
13087 if (M[0] < 0)
13088 return false;
13089
13090 Imm = M[0];
13091
13092 // If this is a VEXT shuffle, the immediate value is the index of the first
13093 // element. The other shuffle indices must be the successive elements after
13094 // the first one.
13095 unsigned ExpectedElt = Imm;
13096 for (unsigned i = 1; i < NumElts; ++i) {
13097 // Increment the expected index. If it wraps around, just follow it
13098 // back to index zero and keep going.
13099 ++ExpectedElt;
13100 if (ExpectedElt == NumElts)
13101 ExpectedElt = 0;
13102
13103 if (M[i] < 0)
13104 continue; // ignore UNDEF indices
13105 if (ExpectedElt != static_cast<unsigned>(M[i]))
13106 return false;
13107 }
13108
13109 return true;
13110}
13111
13112// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13113// v4i32s. This is really a truncate, which we can construct out of (legal)
13114// concats and truncate nodes.
13116 if (V.getValueType() != MVT::v16i8)
13117 return SDValue();
13118 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
13119
13120 for (unsigned X = 0; X < 4; X++) {
13121 // Check the first item in each group is an extract from lane 0 of a v4i32
13122 // or v4i16.
13123 SDValue BaseExt = V.getOperand(X * 4);
13124 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13125 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
13126 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
13127 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
13128 BaseExt.getConstantOperandVal(1) != 0)
13129 return SDValue();
13130 SDValue Base = BaseExt.getOperand(0);
13131 // And check the other items are extracts from the same vector.
13132 for (unsigned Y = 1; Y < 4; Y++) {
13133 SDValue Ext = V.getOperand(X * 4 + Y);
13134 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13135 Ext.getOperand(0) != Base ||
13136 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
13137 Ext.getConstantOperandVal(1) != Y)
13138 return SDValue();
13139 }
13140 }
13141
13142 // Turn the buildvector into a series of truncates and concats, which will
13143 // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
13144 // concatenated together to produce 2 v8i16. These are both truncated and
13145 // concatenated together.
13146 SDLoc DL(V);
13147 SDValue Trunc[4] = {
13148 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
13149 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
13150 for (SDValue &V : Trunc)
13151 if (V.getValueType() == MVT::v4i32)
13152 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
13153 SDValue Concat0 =
13154 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
13155 SDValue Concat1 =
13156 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
13157 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
13158 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
13159 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
13160}
13161
13162 /// Check if a vector shuffle corresponds to a DUP instruction with a larger
13163/// element width than the vector lane type. If that is the case the function
13164/// returns true and writes the value of the DUP instruction lane operand into
13165/// DupLaneOp
13166static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
13167 unsigned &DupLaneOp) {
13168 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
13169 "Only possible block sizes for wide DUP are: 16, 32, 64");
13170
13171 if (BlockSize <= VT.getScalarSizeInBits())
13172 return false;
13173 if (BlockSize % VT.getScalarSizeInBits() != 0)
13174 return false;
13175 if (VT.getSizeInBits() % BlockSize != 0)
13176 return false;
13177
13178 size_t SingleVecNumElements = VT.getVectorNumElements();
13179 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
13180 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
13181
13182 // We are looking for masks like
13183 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
13184 // might be replaced by 'undefined'. BlockIndices will eventually contain
13185 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
13186 // for the above examples)
13187 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
13188 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
13189 for (size_t I = 0; I < NumEltsPerBlock; I++) {
13190 int Elt = M[BlockIndex * NumEltsPerBlock + I];
13191 if (Elt < 0)
13192 continue;
13193 // For now we don't support shuffles that use the second operand
13194 if ((unsigned)Elt >= SingleVecNumElements)
13195 return false;
13196 if (BlockElts[I] < 0)
13197 BlockElts[I] = Elt;
13198 else if (BlockElts[I] != Elt)
13199 return false;
13200 }
13201
13202 // We found a candidate block (possibly with some undefs). It must be a
13203 // sequence of consecutive integers starting with a value divisible by
13204 // NumEltsPerBlock with some values possibly replaced by undefs.
13205
13206 // Find first non-undef element
13207 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
13208 assert(FirstRealEltIter != BlockElts.end() &&
13209 "Shuffle with all-undefs must have been caught by previous cases, "
13210 "e.g. isSplat()");
13211 if (FirstRealEltIter == BlockElts.end()) {
13212 DupLaneOp = 0;
13213 return true;
13214 }
13215
13216 // Index of FirstRealElt in BlockElts
13217 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
13218
13219 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
13220 return false;
13221 // BlockElts[0] must have the following value if it isn't undef:
13222 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
13223
13224 // Check the first element
13225 if (Elt0 % NumEltsPerBlock != 0)
13226 return false;
13227 // Check that the sequence indeed consists of consecutive integers (modulo
13228 // undefs)
13229 for (size_t I = 0; I < NumEltsPerBlock; I++)
13230 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
13231 return false;
13232
13233 DupLaneOp = Elt0 / NumEltsPerBlock;
13234 return true;
13235}
13236
13237 // Check if an EXT instruction can handle the shuffle mask when the
13238// vector sources of the shuffle are different.
13239static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
13240 unsigned &Imm) {
13241 // Look for the first non-undef element.
13242 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
13243
13244 // Benefit from APInt to handle overflow when calculating the expected element.
13245 unsigned NumElts = VT.getVectorNumElements();
13246 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
13247 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
13248 /*implicitTrunc=*/true);
13249 // The following shuffle indices must be the successive elements after the
13250 // first real element.
13251 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
13252 return Elt != ExpectedElt++ && Elt != -1;
13253 });
13254 if (FoundWrongElt)
13255 return false;
13256
13257 // The index of an EXT is the first element if it is not UNDEF.
13258 // Watch out for the beginning UNDEFs. The EXT index should be the expected
13259 // value of the first element. E.g.
13260 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
13261 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
13262 // ExpectedElt is the last mask index plus 1.
13263 Imm = ExpectedElt.getZExtValue();
13264
13265 // There are two different cases that require reversing the input vectors.
13266 // For example, for vector <4 x i32> we have the following cases,
13267 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
13268 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
13269 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
13270 // reversing the two input vectors.
13271 if (Imm < NumElts)
13272 ReverseEXT = true;
13273 else
13274 Imm -= NumElts;
13275
13276 return true;
13277}
13278
13279/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
13280/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13281/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
13282static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13283 unsigned NumElts = VT.getVectorNumElements();
13284 if (NumElts % 2 != 0)
13285 return false;
13286 WhichResult = (M[0] == 0 ? 0 : 1);
13287 unsigned Idx = WhichResult * NumElts / 2;
13288 for (unsigned i = 0; i != NumElts; i += 2) {
13289 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
13290 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
13291 return false;
13292 Idx += 1;
13293 }
13294
13295 return true;
13296}
13297
13298/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
13299/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13300 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
13301static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13302 unsigned Half = VT.getVectorNumElements() / 2;
13303 WhichResult = (M[0] == 0 ? 0 : 1);
13304 for (unsigned j = 0; j != 2; ++j) {
13305 unsigned Idx = WhichResult;
13306 for (unsigned i = 0; i != Half; ++i) {
13307 int MIdx = M[i + j * Half];
13308 if (MIdx >= 0 && (unsigned)MIdx != Idx)
13309 return false;
13310 Idx += 2;
13311 }
13312 }
13313
13314 return true;
13315}
13316
13317/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
13318/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13319/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
13320static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13321 unsigned NumElts = VT.getVectorNumElements();
13322 if (NumElts % 2 != 0)
13323 return false;
13324 WhichResult = (M[0] == 0 ? 0 : 1);
13325 for (unsigned i = 0; i < NumElts; i += 2) {
13326 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
13327 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
13328 return false;
13329 }
13330 return true;
13331}
13332
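// Detect shuffles where every lane but one is an identity copy from a single
// input; such a shuffle can be implemented with an INS of the one anomalous
// lane. DstIsLeft reports whether the copied lanes come from the first input
// and Anomaly is the index of the mismatching lane.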
13333static bool isINSMask(ArrayRef<int> M, int NumInputElements,
13334 bool &DstIsLeft, int &Anomaly) {
13335 if (M.size() != static_cast<size_t>(NumInputElements))
13336 return false;
13337
13338 int NumLHSMatch = 0, NumRHSMatch = 0;
13339 int LastLHSMismatch = -1, LastRHSMismatch = -1;
13340
13341 for (int i = 0; i < NumInputElements; ++i) {
13342 if (M[i] == -1) {
13343 ++NumLHSMatch;
13344 ++NumRHSMatch;
13345 continue;
13346 }
13347
13348 if (M[i] == i)
13349 ++NumLHSMatch;
13350 else
13351 LastLHSMismatch = i;
13352
13353 if (M[i] == i + NumInputElements)
13354 ++NumRHSMatch;
13355 else
13356 LastRHSMismatch = i;
13357 }
13358
13359 if (NumLHSMatch == NumInputElements - 1) {
13360 DstIsLeft = true;
13361 Anomaly = LastLHSMismatch;
13362 return true;
13363 } else if (NumRHSMatch == NumInputElements - 1) {
13364 DstIsLeft = false;
13365 Anomaly = LastRHSMismatch;
13366 return true;
13367 }
13368
13369 return false;
13370}
13371
13372static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
13373 if (VT.getSizeInBits() != 128)
13374 return false;
13375
13376 unsigned NumElts = VT.getVectorNumElements();
13377
13378 for (int I = 0, E = NumElts / 2; I != E; I++) {
13379 if (Mask[I] != I)
13380 return false;
13381 }
13382
13383 int Offset = NumElts / 2;
13384 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
13385 if (Mask[I] != I + SplitLHS * Offset)
13386 return false;
13387 }
13388
13389 return true;
13390}
13391
13393 SDLoc DL(Op);
13394 EVT VT = Op.getValueType();
13395 SDValue V0 = Op.getOperand(0);
13396 SDValue V1 = Op.getOperand(1);
13397 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
13398
13401 return SDValue();
13402
13403 bool SplitV0 = V0.getValueSizeInBits() == 128;
13404
13405 if (!isConcatMask(Mask, VT, SplitV0))
13406 return SDValue();
13407
13408 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13409 if (SplitV0) {
13410 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
13411 DAG.getConstant(0, DL, MVT::i64));
13412 }
13413 if (V1.getValueSizeInBits() == 128) {
13414 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
13415 DAG.getConstant(0, DL, MVT::i64));
13416 }
13417 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
13418}
13419
13420/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
13421/// the specified operations to build the shuffle. ID is the perfect-shuffle
13422 /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
13423 /// table entry and LHS/RHS are the immediate inputs for this stage of the
13424 /// shuffle.
13426 SDValue V2, unsigned PFEntry, SDValue LHS,
13427 SDValue RHS, SelectionDAG &DAG,
13428 const SDLoc &dl) {
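// Each PFEntry packs the operation in bits [29:26], the LHS table entry in
// bits [25:13] and the RHS table entry (or the destination lane for
// OP_MOVLANE) in bits [12:0].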
13429 unsigned OpNum = (PFEntry >> 26) & 0x0F;
13430 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
13431 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
13432
13433 enum {
13434 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
13435 OP_VREV,
13436 OP_VDUP0,
13437 OP_VDUP1,
13438 OP_VDUP2,
13439 OP_VDUP3,
13440 OP_VEXT1,
13441 OP_VEXT2,
13442 OP_VEXT3,
13443 OP_VUZPL, // VUZP, left result
13444 OP_VUZPR, // VUZP, right result
13445 OP_VZIPL, // VZIP, left result
13446 OP_VZIPR, // VZIP, right result
13447 OP_VTRNL, // VTRN, left result
13448 OP_VTRNR, // VTRN, right result
13449 OP_MOVLANE // Move lane. RHSID is the lane to move into
13450 };
13451
13452 if (OpNum == OP_COPY) {
13453 if (LHSID == (1 * 9 + 2) * 9 + 3)
13454 return LHS;
13455 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
13456 return RHS;
13457 }
13458
13459 if (OpNum == OP_MOVLANE) {
13460 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
13461 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
13462 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
13463 Elt = 3 - Elt;
13464 while (Elt > 0) {
13465 ID /= 9;
13466 Elt--;
13467 }
13468 return (ID % 9 == 8) ? -1 : ID % 9;
13469 };
13470
13471 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
13472 // get the lane to move from the PFID, which is always from the
13473 // original vectors (V1 or V2).
13475 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
13476 EVT VT = OpLHS.getValueType();
13477 assert(RHSID < 8 && "Expected a lane index for RHSID!");
13478 unsigned ExtLane = 0;
13479 SDValue Input;
13480
13481 // OP_MOVLANE is either a D mov (if bit 0x4 is set) or an S mov. D movs
13482 // convert into a higher type.
13483 if (RHSID & 0x4) {
13484 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
13485 if (MaskElt == -1)
13486 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
13487 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
13488 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
13489 Input = MaskElt < 2 ? V1 : V2;
13490 if (VT.getScalarSizeInBits() == 16) {
13491 Input = DAG.getBitcast(MVT::v2f32, Input);
13492 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
13493 } else {
13494 assert(VT.getScalarSizeInBits() == 32 &&
13495 "Expected 16 or 32 bit shuffle elemements");
13496 Input = DAG.getBitcast(MVT::v2f64, Input);
13497 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
13498 }
13499 } else {
13500 int MaskElt = getPFIDLane(ID, RHSID);
13501 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
13502 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
13503 Input = MaskElt < 4 ? V1 : V2;
13504 // Be careful about creating illegal types. Use f16 instead of i16.
13505 if (VT == MVT::v4i16) {
13506 Input = DAG.getBitcast(MVT::v4f16, Input);
13507 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
13508 }
13509 }
13512 Input, DAG.getVectorIdxConstant(ExtLane, dl));
13513 SDValue Ins =
13514 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
13515 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
13516 return DAG.getBitcast(VT, Ins);
13517 }
13518
13519 SDValue OpLHS, OpRHS;
13520 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
13521 RHS, DAG, dl);
13522 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
13523 RHS, DAG, dl);
13524 EVT VT = OpLHS.getValueType();
13525
13526 switch (OpNum) {
13527 default:
13528 llvm_unreachable("Unknown shuffle opcode!");
13529 case OP_VREV:
13530 // VREV divides the vector in half and swaps within the half.
13531 if (VT.getVectorElementType() == MVT::i32 ||
13532 VT.getVectorElementType() == MVT::f32)
13533 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
13534 // vrev <4 x i16> -> REV32
13535 if (VT.getVectorElementType() == MVT::i16 ||
13536 VT.getVectorElementType() == MVT::f16 ||
13537 VT.getVectorElementType() == MVT::bf16)
13538 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
13539 // vrev <4 x i8> -> REV16
13540 assert(VT.getVectorElementType() == MVT::i8);
13541 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
13542 case OP_VDUP0:
13543 case OP_VDUP1:
13544 case OP_VDUP2:
13545 case OP_VDUP3: {
13546 EVT EltTy = VT.getVectorElementType();
13547 unsigned Opcode;
13548 if (EltTy == MVT::i8)
13549 Opcode = AArch64ISD::DUPLANE8;
13550 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
13551 Opcode = AArch64ISD::DUPLANE16;
13552 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
13553 Opcode = AArch64ISD::DUPLANE32;
13554 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
13555 Opcode = AArch64ISD::DUPLANE64;
13556 else
13557 llvm_unreachable("Invalid vector element type?");
13558
13559 if (VT.getSizeInBits() == 64)
13560 OpLHS = WidenVector(OpLHS, DAG);
13561 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
13562 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
13563 }
13564 case OP_VEXT1:
13565 case OP_VEXT2:
13566 case OP_VEXT3: {
13567 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
13568 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
13569 DAG.getConstant(Imm, dl, MVT::i32));
13570 }
13571 case OP_VUZPL:
13572 return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
13573 case OP_VUZPR:
13574 return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
13575 case OP_VZIPL:
13576 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
13577 case OP_VZIPR:
13578 return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
13579 case OP_VTRNL:
13580 return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
13581 case OP_VTRNR:
13582 return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
13583 }
13584}
13585
13587 SelectionDAG &DAG) {
13588 // Check to see if we can use the TBL instruction.
13589 SDValue V1 = Op.getOperand(0);
13590 SDValue V2 = Op.getOperand(1);
13591 SDLoc DL(Op);
13592
13593 EVT EltVT = Op.getValueType().getVectorElementType();
13594 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
13595
13596 bool Swap = false;
13597 if (V1.isUndef() || isZerosVector(V1.getNode())) {
13598 std::swap(V1, V2);
13599 Swap = true;
13600 }
13601
13602 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
13603 // out of range values with 0s. We do need to make sure that any out-of-range
13604 // values are really out-of-range for a v16i8 vector.
13605 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
13606 MVT IndexVT = MVT::v8i8;
13607 unsigned IndexLen = 8;
13608 if (Op.getValueSizeInBits() == 128) {
13609 IndexVT = MVT::v16i8;
13610 IndexLen = 16;
13611 }
13612
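// Expand each shuffle-mask element into per-byte TBL indices; e.g. with
// 16-bit elements, source lane 5 becomes byte indices {10, 11}.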
13614 for (int Val : ShuffleMask) {
13615 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
13616 unsigned Offset = Byte + Val * BytesPerElt;
13617 if (Swap)
13618 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
13619 if (IsUndefOrZero && Offset >= IndexLen)
13620 Offset = 255;
13621 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
13622 }
13623 }
13624
13625 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
13626 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
13627
13628 SDValue Shuffle;
13629 if (IsUndefOrZero) {
13630 if (IndexLen == 8)
13631 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
13632 Shuffle = DAG.getNode(
13633 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
13634 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
13635 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
13636 } else {
13637 if (IndexLen == 8) {
13638 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
13639 Shuffle = DAG.getNode(
13640 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
13641 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
13642 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
13643 } else {
13644 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
13645 // cannot currently represent the register constraints on the input
13646 // table registers.
13647 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
13648 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
13649 // IndexLen));
13650 Shuffle = DAG.getNode(
13651 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
13652 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
13653 V2Cst,
13654 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
13655 }
13656 }
13657 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
13658}
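// For example: lowering a two-input v4i16 shuffle with mask <0, 4, 1, 5>
// uses BytesPerElt = 2 and IndexLen = 8, so the byte-level TBL mask becomes
// {0,1, 8,9, 2,3, 10,11}. Because the result is 64 bits, both inputs are
// concatenated into a single v16i8 table and one tbl1 suffices.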
13659
13660static unsigned getDUPLANEOp(EVT EltType) {
13661 if (EltType == MVT::i8)
13662 return AArch64ISD::DUPLANE8;
13663 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
13664 return AArch64ISD::DUPLANE16;
13665 if (EltType == MVT::i32 || EltType == MVT::f32)
13666 return AArch64ISD::DUPLANE32;
13667 if (EltType == MVT::i64 || EltType == MVT::f64)
13668 return AArch64ISD::DUPLANE64;
13669
13670 llvm_unreachable("Invalid vector element type?");
13671}
13672
13673static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
13674 unsigned Opcode, SelectionDAG &DAG) {
13675 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
13676 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
13677 // Match: dup (bitcast (extract_subv X, C)), LaneC
13678 if (BitCast.getOpcode() != ISD::BITCAST ||
13679 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
13680 return false;
13681
13682 // The extract index must align in the destination type. That may not
13683 // happen if the bitcast is from narrow to wide type.
13684 SDValue Extract = BitCast.getOperand(0);
13685 unsigned ExtIdx = Extract.getConstantOperandVal(1);
13686 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
13687 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
13688 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
13689 if (ExtIdxInBits % CastedEltBitWidth != 0)
13690 return false;
13691
13692 // Can't handle cases where vector size is not 128-bit
13693 if (!Extract.getOperand(0).getValueType().is128BitVector())
13694 return false;
13695
13696 // Update the lane value by offsetting with the scaled extract index.
13697 LaneC += ExtIdxInBits / CastedEltBitWidth;
13698
13699 // Determine the casted vector type of the wide vector input.
13700 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
13701 // Examples:
13702 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
13703 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
13704 unsigned SrcVecNumElts =
13705 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
13706 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getVectorElementType(),
13707 SrcVecNumElts);
13708 return true;
13709 };
13710 MVT CastVT;
13711 if (getScaledOffsetDup(V, Lane, CastVT)) {
13712 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
13713 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13714 V.getOperand(0).getValueType().is128BitVector()) {
13715 // The lane is incremented by the index of the extract.
13716 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
13717 Lane += V.getConstantOperandVal(1);
13718 V = V.getOperand(0);
13719 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
13720 // The lane is decremented if we are splatting from the 2nd operand.
13721 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
13722 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
13723 Lane -= Idx * VT.getVectorNumElements() / 2;
13724 V = WidenVector(V.getOperand(Idx), DAG);
13725 } else if (VT.getSizeInBits() == 64) {
13726 // Widen the operand to 128-bit register with undef.
13727 V = WidenVector(V, DAG);
13728 }
13729 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
13730}
13731
13732// Try to widen element type to get a new mask value for a better permutation
13733// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
13734// UZP1/2, TRN1/2, REV, INS, etc.
13735// For example:
13736// shufflevector <4 x i32> %a, <4 x i32> %b,
13737// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
13738// is equivalent to:
13739// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
13740// Finally, we can get:
13741// mov v0.d[0], v1.d[1]
13742static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
13743 SDLoc DL(Op);
13744 EVT VT = Op.getValueType();
13745 EVT ScalarVT = VT.getVectorElementType();
13746 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
13747 SDValue V0 = Op.getOperand(0);
13748 SDValue V1 = Op.getOperand(1);
13749 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
13750
13751 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
13752 // We need to make sure the wider element type is legal. Thus, ElementSize
13753 // should not be larger than 32 bits, and the i1 type should also be excluded.
13754 if (ElementSize > 32 || ElementSize == 1)
13755 return SDValue();
13756
13757 SmallVector<int, 8> NewMask;
13758 if (widenShuffleMaskElts(Mask, NewMask)) {
13759 MVT NewEltVT = VT.isFloatingPoint()
13760 ? MVT::getFloatingPointVT(ElementSize * 2)
13761 : MVT::getIntegerVT(ElementSize * 2);
13762 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13763 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13764 V0 = DAG.getBitcast(NewVT, V0);
13765 V1 = DAG.getBitcast(NewVT, V1);
13766 return DAG.getBitcast(VT,
13767 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
13768 }
13769 }
13770
13771 return SDValue();
13772}
13773
13774// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
13775static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
13776 ArrayRef<int> ShuffleMask,
13777 SelectionDAG &DAG) {
13778 SDValue Tbl1 = Op->getOperand(0);
13779 SDValue Tbl2 = Op->getOperand(1);
13780 SDLoc dl(Op);
13781 SDValue Tbl2ID =
13782 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
13783
13784 EVT VT = Op.getValueType();
13785 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
13786 Tbl1->getOperand(0) != Tbl2ID ||
13787 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
13788 Tbl2->getOperand(0) != Tbl2ID)
13789 return SDValue();
13790
13791 if (Tbl1->getValueType(0) != MVT::v16i8 ||
13792 Tbl2->getValueType(0) != MVT::v16i8)
13793 return SDValue();
13794
13795 SDValue Mask1 = Tbl1->getOperand(3);
13796 SDValue Mask2 = Tbl2->getOperand(3);
13797 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
13798 for (unsigned I = 0; I < 16; I++) {
13799 if (ShuffleMask[I] < 16)
13800 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
13801 else {
13802 auto *C =
13803 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
13804 if (!C)
13805 return SDValue();
13806 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
13807 }
13808 }
13809
13810 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
13811 SDValue ID =
13812 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
13813
13814 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
13815 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
13816 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
13817}
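// Note on the "+ 32" above: a tbl4 indexes a 64-byte table formed from four
// source registers. The second tbl2's tables become registers 3 and 4 of the
// tbl4, so indices taken from its mask must be rebased by 32 bytes.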
13818
13819// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
13820// but we don't have an appropriate instruction,
13821// so custom-lower it as ZIP1-with-zeros.
13822SDValue
13823AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
13824 SelectionDAG &DAG) const {
13825 SDLoc dl(Op);
13826 EVT VT = Op.getValueType();
13827 SDValue SrcOp = Op.getOperand(0);
13828 EVT SrcVT = SrcOp.getValueType();
13829 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
13830 "Unexpected extension factor.");
13831 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
13832 // FIXME: support multi-step zipping?
13833 if (Scale != 2)
13834 return SDValue();
13835 SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
13836 return DAG.getBitcast(VT,
13837 DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
13838}
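// For example, assuming little-endian lane order: zero-extending the low two
// i32 lanes of a v4i32 into a v2i64 becomes ZIP1(src, 0) on v4i32, producing
// <s0, 0, s1, 0>, which reinterpreted as v2i64 is <zext(s0), zext(s1)>.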
13839
13840SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
13841 SelectionDAG &DAG) const {
13842 SDLoc dl(Op);
13843 EVT VT = Op.getValueType();
13844
13845 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
13846
13847 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13848 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
13849
13850 // Convert shuffles that are directly supported on NEON to target-specific
13851 // DAG nodes, instead of keeping them as shuffles and matching them again
13852 // during code selection. This is more efficient and avoids the possibility
13853 // of inconsistencies between legalization and selection.
13854 ArrayRef<int> ShuffleMask = SVN->getMask();
13855
13856 SDValue V1 = Op.getOperand(0);
13857 SDValue V2 = Op.getOperand(1);
13858
13859 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
13860 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
13861 "Unexpected VECTOR_SHUFFLE mask size!");
13862
13863 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
13864 return Res;
13865
13866 if (SVN->isSplat()) {
13867 int Lane = SVN->getSplatIndex();
13868 // If this is undef splat, generate it via "just" vdup, if possible.
13869 if (Lane == -1)
13870 Lane = 0;
13871
13872 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
13873 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
13874 V1.getOperand(0));
13875 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
13876 // constant. If so, we can just reference the lane's definition directly.
13877 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
13878 !isa<ConstantSDNode>(V1.getOperand(Lane)))
13879 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
13880
13881 // Otherwise, duplicate from the lane of the input vector.
13882 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
13883 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
13884 }
13885
13886 // Check if the mask matches a DUP for a wider element
13887 for (unsigned LaneSize : {64U, 32U, 16U}) {
13888 unsigned Lane = 0;
13889 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
13890 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
13891 : LaneSize == 32 ? AArch64ISD::DUPLANE32
13892 : AArch64ISD::DUPLANE16;
13893 // Cast V1 to an integer vector with required lane size
13894 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
13895 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
13896 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
13897 V1 = DAG.getBitcast(NewVecTy, V1);
13898 // Construct the DUP instruction
13899 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
13900 // Cast back to the original type
13901 return DAG.getBitcast(VT, V1);
13902 }
13903 }
13904
13905 unsigned NumElts = VT.getVectorNumElements();
13906 unsigned EltSize = VT.getScalarSizeInBits();
13907 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
13908 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1);
13909 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
13910 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1);
13911 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
13912 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1);
13913
13914 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
13915 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
13916 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
13917 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
13918 DAG.getConstant(8, dl, MVT::i32));
13919 }
13920
13921 bool ReverseEXT = false;
13922 unsigned Imm;
13923 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
13924 if (ReverseEXT)
13925 std::swap(V1, V2);
13926 Imm *= getExtFactor(V1);
13927 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
13928 DAG.getConstant(Imm, dl, MVT::i32));
13929 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
13930 Imm *= getExtFactor(V1);
13931 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
13932 DAG.getConstant(Imm, dl, MVT::i32));
13933 }
13934
13935 unsigned WhichResult;
13936 if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
13937 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
13938 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13939 }
13940 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
13941 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
13942 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13943 }
13944 if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
13945 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
13946 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13947 }
13948
13949 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13950 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
13951 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13952 }
13953 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13954 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
13955 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13956 }
13957 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13958 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
13959 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13960 }
13961
13962 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
13963 return Concat;
13964
13965 bool DstIsLeft;
13966 int Anomaly;
13967 int NumInputElements = V1.getValueType().getVectorNumElements();
13968 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
13969 SDValue DstVec = DstIsLeft ? V1 : V2;
13970 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
13971
13972 SDValue SrcVec = V1;
13973 int SrcLane = ShuffleMask[Anomaly];
13974 if (SrcLane >= NumInputElements) {
13975 SrcVec = V2;
13976 SrcLane -= NumElts;
13977 }
13978 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
13979
13980 EVT ScalarVT = VT.getVectorElementType();
13981
13982 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
13983 ScalarVT = MVT::i32;
13984
13985 return DAG.getNode(
13986 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
13987 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
13988 DstLaneV);
13989 }
13990
13991 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
13992 return NewSD;
13993
13994 // If the shuffle is not directly supported and it has 4 elements, use
13995 // the PerfectShuffle-generated table to synthesize it from other shuffles.
13996 if (NumElts == 4) {
13997 unsigned PFIndexes[4];
13998 for (unsigned i = 0; i != 4; ++i) {
13999 if (ShuffleMask[i] < 0)
14000 PFIndexes[i] = 8;
14001 else
14002 PFIndexes[i] = ShuffleMask[i];
14003 }
14004
14005 // Compute the index in the perfect shuffle table.
14006 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
14007 PFIndexes[2] * 9 + PFIndexes[3];
14008 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
14009 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
14010 dl);
14011 }
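// The perfect-shuffle index above is a base-9 encoding of the four mask
// entries: each entry lies in [0, 8], with 8 standing for an undef lane, so
// the table index is PF[0]*729 + PF[1]*81 + PF[2]*9 + PF[3].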
14012
14013 // Check for a "select shuffle", generating a BSL to pick between lanes in
14014 // V1/V2.
14015 if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
14016 assert(VT.getScalarSizeInBits() <= 32 &&
14017 "Expected larger vector element sizes to be handled already");
14018 SmallVector<SDValue> MaskElts;
14019 for (int M : ShuffleMask)
14020 MaskElts.push_back(DAG.getConstant(
14021 M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, dl, MVT::i32));
14022 EVT IVT = VT.changeVectorElementTypeToInteger();
14023 SDValue MaskConst = DAG.getBuildVector(IVT, dl, MaskElts);
14024 return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, dl, IVT, MaskConst,
14025 DAG.getBitcast(IVT, V1),
14026 DAG.getBitcast(IVT, V2)));
14027 }
14028
14029 // Fall back to generating a TBL
14030 return GenerateTBL(Op, ShuffleMask, DAG);
14031}
14032
14033SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
14034 SelectionDAG &DAG) const {
14035 EVT VT = Op.getValueType();
14036
14037 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14038 return LowerToScalableOp(Op, DAG);
14039
14040 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
14041 "Unexpected vector type!");
14042
14043 // We can handle the constant cases during isel.
14044 if (isa<ConstantSDNode>(Op.getOperand(0)))
14045 return Op;
14046
14047 // There isn't a natural way to handle the general i1 case, so we use some
14048 // trickery with whilelo.
14049 SDLoc DL(Op);
14050 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
14051 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
14052 DAG.getValueType(MVT::i1));
14053 SDValue ID =
14054 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
14055 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14056 if (VT == MVT::nxv1i1)
14057 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
14058 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
14059 Zero, SplatVal),
14060 Zero);
14061 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
14062}
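// The whilelo trick above works because the splatted i1 is sign-extended to
// either 0 or all-ones in an i64: whilelo(0, 0) produces an all-false
// predicate, while whilelo(0, 0xFFFFFFFFFFFFFFFF) compares 0 < limit
// (unsigned) for every element and so produces an all-true predicate.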
14063
14064SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
14065 SelectionDAG &DAG) const {
14066 SDLoc DL(Op);
14067
14068 EVT VT = Op.getValueType();
14069 if (!isTypeLegal(VT) || !VT.isScalableVector())
14070 return SDValue();
14071
14072 // Current lowering only supports the SVE-ACLE types.
14073 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
14074 return SDValue();
14075
14076 // The DUPQ operation is independent of element type so normalise to i64s.
14077 SDValue Idx128 = Op.getOperand(2);
14078
14079 // DUPQ can be used when idx is in range.
14080 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
14081 if (CIdx && (CIdx->getZExtValue() <= 3)) {
14082 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
14083 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
14084 }
14085
14086 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
14087
14088 // The ACLE says this must produce the same result as:
14089 // svtbl(data, svadd_x(svptrue_b64(),
14090 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
14091 // index * 2))
14092 SDValue One = DAG.getConstant(1, DL, MVT::i64);
14093 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
14094
14095 // create the vector 0,1,0,1,...
14096 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
14097 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
14098
14099 // create the vector idx64,idx64+1,idx64,idx64+1,...
14100 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
14101 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
14102 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
14103
14104 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
14105 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
14106 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
14107}
14108
14109
14110static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
14111 APInt &UndefBits) {
14112 EVT VT = BVN->getValueType(0);
14113 APInt SplatBits, SplatUndef;
14114 unsigned SplatBitSize;
14115 bool HasAnyUndefs;
14116 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14117 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
14118
14119 for (unsigned i = 0; i < NumSplats; ++i) {
14120 CnstBits <<= SplatBitSize;
14121 UndefBits <<= SplatBitSize;
14122 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
14123 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
14124 }
14125
14126 return true;
14127 }
14128
14129 return false;
14130}
14131
14132// Try 64-bit splatted SIMD immediate.
14133static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14134 const APInt &Bits) {
14135 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14136 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14137 EVT VT = Op.getValueType();
14138 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
14139
14140 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
14141 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
14142
14143 SDLoc dl(Op);
14144 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14145 DAG.getConstant(Value, dl, MVT::i32));
14146 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14147 }
14148 }
14149
14150 return SDValue();
14151}
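// ModImmType10 is the 64-bit "byte mask" immediate: every byte of the
// splatted value must be 0x00 or 0xFF. For example, a splat of
// 0xFF00FFFF00FF0000 can be materialized with a single MOVI, whereas
// 0x1200FFFF00FF0000 cannot.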
14152
14153// Try 32-bit splatted SIMD immediate.
14154static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14155 const APInt &Bits,
14156 const SDValue *LHS = nullptr) {
14157 EVT VT = Op.getValueType();
14158 if (VT.isFixedLengthVector() &&
14159 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14160 return SDValue();
14161
14162 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14163 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14164 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14165 bool isAdvSIMDModImm = false;
14166 uint64_t Shift;
14167
14168 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
14169 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
14170 Shift = 0;
14171 }
14172 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
14173 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
14174 Shift = 8;
14175 }
14176 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
14177 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
14178 Shift = 16;
14179 }
14180 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
14181 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
14182 Shift = 24;
14183 }
14184
14185 if (isAdvSIMDModImm) {
14186 SDLoc dl(Op);
14187 SDValue Mov;
14188
14189 if (LHS)
14190 Mov = DAG.getNode(NewOp, dl, MovTy,
14191 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
14192 DAG.getConstant(Value, dl, MVT::i32),
14193 DAG.getConstant(Shift, dl, MVT::i32));
14194 else
14195 Mov = DAG.getNode(NewOp, dl, MovTy,
14196 DAG.getConstant(Value, dl, MVT::i32),
14197 DAG.getConstant(Shift, dl, MVT::i32));
14198
14199 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14200 }
14201 }
14202
14203 return SDValue();
14204}
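// ModImmTypes 1-4 cover a 32-bit splat whose value is an 8-bit constant
// shifted left by 0, 8, 16 or 24 bits. For example, a per-lane constant of
// 0x00AB0000 matches type 3 and is emitted with Value = 0xAB and Shift = 16.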
14205
14206// Try 16-bit splatted SIMD immediate.
14207static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14208 const APInt &Bits,
14209 const SDValue *LHS = nullptr) {
14210 EVT VT = Op.getValueType();
14211 if (VT.isFixedLengthVector() &&
14212 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14213 return SDValue();
14214
14215 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14216 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14217 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
14218 bool isAdvSIMDModImm = false;
14219 uint64_t Shift;
14220
14221 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
14222 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
14223 Shift = 0;
14224 }
14225 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
14226 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
14227 Shift = 8;
14228 }
14229
14230 if (isAdvSIMDModImm) {
14231 SDLoc dl(Op);
14232 SDValue Mov;
14233
14234 if (LHS)
14235 Mov = DAG.getNode(NewOp, dl, MovTy,
14236 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
14237 DAG.getConstant(Value, dl, MVT::i32),
14238 DAG.getConstant(Shift, dl, MVT::i32));
14239 else
14240 Mov = DAG.getNode(NewOp, dl, MovTy,
14241 DAG.getConstant(Value, dl, MVT::i32),
14242 DAG.getConstant(Shift, dl, MVT::i32));
14243
14244 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14245 }
14246 }
14247
14248 return SDValue();
14249}
14250
14251// Try 32-bit splatted SIMD immediate with shifted ones.
14252static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
14253 SelectionDAG &DAG, const APInt &Bits) {
14254 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14255 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14256 EVT VT = Op.getValueType();
14257 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14258 bool isAdvSIMDModImm = false;
14259 uint64_t Shift;
14260
14261 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
14262 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
14263 Shift = 264;
14264 }
14265 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
14266 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
14267 Shift = 272;
14268 }
14269
14270 if (isAdvSIMDModImm) {
14271 SDLoc dl(Op);
14272 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14273 DAG.getConstant(Value, dl, MVT::i32),
14274 DAG.getConstant(Shift, dl, MVT::i32));
14275 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14276 }
14277 }
14278
14279 return SDValue();
14280}
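// ModImmTypes 7 and 8 are the "MSL" (shifting ones) forms, 0x0000ABFF and
// 0x00ABFFFF per 32-bit lane. The Shift values 264 and 272 are not bit
// counts; they appear to be the operand encodings used for MSL #8 and
// MSL #16 respectively.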
14281
14282// Try 8-bit splatted SIMD immediate.
14283static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14284 const APInt &Bits) {
14285 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14286 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14287 EVT VT = Op.getValueType();
14288 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
14289
14290 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
14291 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
14292
14293 SDLoc dl(Op);
14294 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14295 DAG.getConstant(Value, dl, MVT::i32));
14296 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14297 }
14298 }
14299
14300 return SDValue();
14301}
14302
14303// Try FP splatted SIMD immediate.
14304static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14305 const APInt &Bits) {
14306 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14307 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14308 EVT VT = Op.getValueType();
14309 bool isWide = (VT.getSizeInBits() == 128);
14310 MVT MovTy;
14311 bool isAdvSIMDModImm = false;
14312
14313 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
14314 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
14315 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
14316 }
14317 else if (isWide &&
14318 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
14319 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
14320 MovTy = MVT::v2f64;
14321 }
14322
14323 if (isAdvSIMDModImm) {
14324 SDLoc dl(Op);
14325 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
14326 DAG.getConstant(Value, dl, MVT::i32));
14327 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
14328 }
14329 }
14330
14331 return SDValue();
14332}
14333
14334// Specialized code to quickly find if PotentialBVec is a BuildVector that
14335// consists of only the same constant int value, returned in reference arg
14336// ConstVal
14337static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
14338 uint64_t &ConstVal) {
14339 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
14340 if (!Bvec)
14341 return false;
14342 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
14343 if (!FirstElt)
14344 return false;
14345 EVT VT = Bvec->getValueType(0);
14346 unsigned NumElts = VT.getVectorNumElements();
14347 for (unsigned i = 1; i < NumElts; ++i)
14348 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
14349 return false;
14350 ConstVal = FirstElt->getZExtValue();
14351 return true;
14352}
14353
14354static bool isAllInactivePredicate(SDValue N) {
14355 // Look through cast.
14356 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
14357 N = N.getOperand(0);
14358
14359 return ISD::isConstantSplatVectorAllZeros(N.getNode());
14360}
14361
14362static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
14363 unsigned NumElts = N.getValueType().getVectorMinNumElements();
14364
14365 // Look through cast.
14366 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
14367 N = N.getOperand(0);
14368 // When reinterpreting from a type with fewer elements the "new" elements
14369 // are not active, so bail if they're likely to be used.
14370 if (N.getValueType().getVectorMinNumElements() < NumElts)
14371 return false;
14372 }
14373
14374 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
14375 return true;
14376
14377 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
14378 // or smaller than the implicit element type represented by N.
14379 // NOTE: A larger element count implies a smaller element type.
14380 if (N.getOpcode() == AArch64ISD::PTRUE &&
14381 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
14382 return N.getValueType().getVectorMinNumElements() >= NumElts;
14383
14384 // If we're compiling for a specific vector-length, we can check if the
14385 // pattern's VL equals that of the scalable vector at runtime.
14386 if (N.getOpcode() == AArch64ISD::PTRUE) {
14387 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14388 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
14389 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
14390 if (MaxSVESize && MinSVESize == MaxSVESize) {
14391 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
14392 unsigned PatNumElts =
14393 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
14394 return PatNumElts == (NumElts * VScale);
14395 }
14396 }
14397
14398 return false;
14399}
14400
14401// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
14402// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
14403// BUILD_VECTORs with constant element C1, C2 is a constant, and:
14404// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
14405// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
14406// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
14407static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
14408 EVT VT = N->getValueType(0);
14409
14410 if (!VT.isVector())
14411 return SDValue();
14412
14413 SDLoc DL(N);
14414
14415 SDValue And;
14416 SDValue Shift;
14417
14418 SDValue FirstOp = N->getOperand(0);
14419 unsigned FirstOpc = FirstOp.getOpcode();
14420 SDValue SecondOp = N->getOperand(1);
14421 unsigned SecondOpc = SecondOp.getOpcode();
14422
14423 // Is one of the operands an AND or a BICi? The AND may have been optimised to
14424 // a BICi in order to use an immediate instead of a register.
14425 // Is the other operand a shl or lshr? This will have been turned into:
14426 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
14427 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
14428 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
14429 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
14430 SecondOpc == AArch64ISD::SHL_PRED ||
14431 SecondOpc == AArch64ISD::SRL_PRED)) {
14432 And = FirstOp;
14433 Shift = SecondOp;
14434
14435 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
14436 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
14437 FirstOpc == AArch64ISD::SHL_PRED ||
14438 FirstOpc == AArch64ISD::SRL_PRED)) {
14439 And = SecondOp;
14440 Shift = FirstOp;
14441 } else
14442 return SDValue();
14443
14444 bool IsAnd = And.getOpcode() == ISD::AND;
14445 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
14446 Shift.getOpcode() == AArch64ISD::SRL_PRED;
14447 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
14448 Shift.getOpcode() == AArch64ISD::SRL_PRED;
14449
14450 // Is the shift amount constant and are all lanes active?
14451 uint64_t C2;
14452 if (ShiftHasPredOp) {
14453 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
14454 return SDValue();
14455 APInt C;
14456 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
14457 return SDValue();
14458 C2 = C.getZExtValue();
14459 } else if (ConstantSDNode *C2node =
14460 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
14461 C2 = C2node->getZExtValue();
14462 else
14463 return SDValue();
14464
14465 APInt C1AsAPInt;
14466 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
14467 if (IsAnd) {
14468 // Is the and mask vector all constant?
14469 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
14470 return SDValue();
14471 } else {
14472 // Reconstruct the corresponding AND immediate from the two BICi immediates.
14473 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
14474 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
14475 assert(C1nodeImm && C1nodeShift);
14476 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
14477 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
14478 }
14479
14480 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
14481 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
14482 // how much one can shift elements of a particular size?
14483 if (C2 > ElemSizeInBits)
14484 return SDValue();
14485
14486 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
14487 : APInt::getLowBitsSet(ElemSizeInBits, C2);
14488 if (C1AsAPInt != RequiredC1)
14489 return SDValue();
14490
14491 SDValue X = And.getOperand(0);
14492 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
14493 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
14494 : Shift.getOperand(1);
14495
14496 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
14497 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
14498
14499 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
14500 LLVM_DEBUG(N->dump(&DAG));
14501 LLVM_DEBUG(dbgs() << "into: \n");
14502 LLVM_DEBUG(ResultSLI->dump(&DAG));
14503
14504 ++NumShiftInserts;
14505 return ResultSLI;
14506}
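// Worked example for the SLI case on v4i32: with C2 = 24 the required mask
// is C1 = 0x00FFFFFF, so (or (and X, 0x00FFFFFF), (shl Y, 24)) becomes
// SLI X, Y, #24, which keeps the low 24 bits of each lane of X and inserts
// the shifted bits of Y above them.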
14507
14508SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
14509 SelectionDAG &DAG) const {
14510 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
14511 !Subtarget->isNeonAvailable()))
14512 return LowerToScalableOp(Op, DAG);
14513
14514 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
14515 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
14516 return Res;
14517
14518 EVT VT = Op.getValueType();
14519 if (VT.isScalableVector())
14520 return Op;
14521
14522 SDValue LHS = Op.getOperand(0);
14523 BuildVectorSDNode *BVN =
14524 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
14525 if (!BVN) {
14526 // OR commutes, so try swapping the operands.
14527 LHS = Op.getOperand(1);
14528 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
14529 }
14530 if (!BVN)
14531 return Op;
14532
14533 APInt DefBits(VT.getSizeInBits(), 0);
14534 APInt UndefBits(VT.getSizeInBits(), 0);
14535 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
14536 SDValue NewOp;
14537
14538 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
14539 DefBits, &LHS)) ||
14540 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
14541 DefBits, &LHS)))
14542 return NewOp;
14543
14544 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
14545 UndefBits, &LHS)) ||
14546 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
14547 UndefBits, &LHS)))
14548 return NewOp;
14549 }
14550
14551 // We can always fall back to a non-immediate OR.
14552 return Op;
14553}
14554
14555// Normalize the operands of BUILD_VECTOR. The value of constant operands will
14556// be truncated to fit element width.
14558 SelectionDAG &DAG) {
14559 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
14560 SDLoc dl(Op);
14561 EVT VT = Op.getValueType();
14562 EVT EltTy = VT.getVectorElementType();
14563
14564 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
14565 return Op;
14566
14567 SmallVector<SDValue, 16> Ops;
14568 for (SDValue Lane : Op->ops()) {
14569 // For integer vectors, type legalization would have promoted the
14570 // operands already. Otherwise, if Op is a floating-point splat
14571 // (with operands cast to integers), then the only possibilities
14572 // are constants and UNDEFs.
14573 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
14574 Lane = DAG.getConstant(
14575 CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
14576 dl, MVT::i32);
14577 } else if (Lane.getNode()->isUndef()) {
14578 Lane = DAG.getUNDEF(MVT::i32);
14579 } else {
14580 assert(Lane.getValueType() == MVT::i32 &&
14581 "Unexpected BUILD_VECTOR operand type");
14582 }
14583 Ops.push_back(Lane);
14584 }
14585 return DAG.getBuildVector(VT, dl, Ops);
14586}
14587
14588static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
14589 const AArch64Subtarget *ST) {
14590 EVT VT = Op.getValueType();
14591 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
14592 "Expected a legal NEON vector");
14593
14594 APInt DefBits(VT.getSizeInBits(), 0);
14595 APInt UndefBits(VT.getSizeInBits(), 0);
14596 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
14597 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
14598 auto TryMOVIWithBits = [&](APInt DefBits) {
14599 SDValue NewOp;
14600 if ((NewOp =
14601 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
14602 (NewOp =
14603 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
14604 (NewOp =
14605 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
14606 (NewOp =
14607 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
14608 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
14609 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
14610 return NewOp;
14611
14612 APInt NotDefBits = ~DefBits;
14613 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
14614 NotDefBits)) ||
14615 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
14616 NotDefBits)) ||
14617 (NewOp =
14618 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
14619 return NewOp;
14620 return SDValue();
14621 };
14622 if (SDValue R = TryMOVIWithBits(DefBits))
14623 return R;
14624 if (SDValue R = TryMOVIWithBits(UndefBits))
14625 return R;
14626
14627 // See if a fneg of the constant can be materialized with a MOVI, etc
14628 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
14629 // FNegate each sub-element of the constant
14630 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
14631 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
14632 .zext(VT.getSizeInBits());
14633 APInt NegBits(VT.getSizeInBits(), 0);
14634 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
14635 for (unsigned i = 0; i < NumElts; i++)
14636 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
14637 NegBits = DefBits ^ NegBits;
14638
14639 // Try to create the new constants with MOVI, and if so generate a fneg
14640 // for it.
14641 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
14642 SDLoc DL(Op);
14643 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
14644 return DAG.getNode(
14645 AArch64ISD::NVCAST, DL, VT,
14646 DAG.getNode(ISD::FNEG, DL, VFVT,
14647 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
14648 }
14649 return SDValue();
14650 };
14651 SDValue R;
14652 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
14653 (R = TryWithFNeg(DefBits, MVT::f64)) ||
14654 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
14655 return R;
14656 }
14657
14658 return SDValue();
14659}
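// The FNEG path above XORs the per-lane FP sign bit into the constant and
// retries MOVI on the result; on success the original value is rebuilt as
// FNEG(NVCAST(MOVI)). For example, a v4f32 splat of the bit pattern
// 0x80400000 is not itself a valid MOVI/MVNI/FMOV immediate, but its
// sign-flipped pattern 0x00400000 is (an 8-bit value shifted left by 16).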
14660
14661SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
14662 SDValue Op, SelectionDAG &DAG) const {
14663 EVT VT = Op.getValueType();
14664 SDLoc DL(Op);
14665 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
14666 auto *BVN = cast<BuildVectorSDNode>(Op);
14667
14668 if (auto SeqInfo = BVN->isConstantSequence()) {
14669 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
14670 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
14671 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
14672 return convertFromScalableVector(DAG, VT, Seq);
14673 }
14674
14675 unsigned NumElems = VT.getVectorNumElements();
14676 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
14677 NumElems <= 1 || BVN->isConstant())
14678 return SDValue();
14679
14680 auto IsExtractElt = [](SDValue Op) {
14681 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
14682 };
14683
14684 // For integer types that are not already in vectors, limit to at most four
14685 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
14686 if (VT.getScalarType().isInteger() &&
14687 NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
14688 return SDValue();
14689
14690 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
14691 SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
14692 SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
14693 Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
14694 return Op.isUndef() ? Undef
14695 : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
14696 ContainerVT, Undef, Op, ZeroI64);
14697 });
14698
14699 ElementCount ZipEC = ContainerVT.getVectorElementCount();
14700 while (Intermediates.size() > 1) {
14701 EVT ZipVT = getPackedSVEVectorVT(ZipEC);
14702
14703 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
14704 SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
14705 SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
14706 Intermediates[I / 2] =
14707 Op1.isUndef() ? Op0
14708 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
14709 }
14710
14711 Intermediates.resize(Intermediates.size() / 2);
14712 ZipEC = ZipEC.divideCoefficientBy(2);
14713 }
14714
14715 assert(Intermediates.size() == 1);
14716 SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
14717 return convertFromScalableVector(DAG, VT, Vec);
14718}
14719
14720SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
14721 SelectionDAG &DAG) const {
14722 EVT VT = Op.getValueType();
14723
14724 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14725 cast<BuildVectorSDNode>(Op)->isConstantSequence();
14726 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
14727 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
14728
14729 // Try to build a simple constant vector.
14730 Op = NormalizeBuildVector(Op, DAG);
14731 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
14732 // abort.
14733 if (Op.getOpcode() != ISD::BUILD_VECTOR)
14734 return SDValue();
14735
14736 // Certain vector constants, used to express things like logical NOT and
14737 // arithmetic NEG, are passed through unmodified. This allows special
14738 // patterns for these operations to match, which will lower these constants
14739 // to whatever is proven necessary.
14740 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
14741 if (BVN->isConstant()) {
14742 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
14743 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
14744 APInt Val(BitSize,
14745 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
14746 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
14747 return Op;
14748 }
14749 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
14750 if (Const->isZero() && !Const->isNegative())
14751 return Op;
14752 }
14753
14754 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
14755 return V;
14756
14757 // Scan through the operands to find some interesting properties we can
14758 // exploit:
14759 // 1) If only one value is used, we can use a DUP, or
14760 // 2) if only the low element is not undef, we can just insert that, or
14761 // 3) if only one constant value is used (w/ some non-constant lanes),
14762 // we can splat the constant value into the whole vector then fill
14763 // in the non-constant lanes.
14764 // 4) FIXME: If different constant values are used, but we can intelligently
14765 // select the values we'll be overwriting for the non-constant
14766 // lanes such that we can directly materialize the vector
14767 // some other way (MOVI, e.g.), we can be sneaky.
14768 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
14769 SDLoc dl(Op);
14770 unsigned NumElts = VT.getVectorNumElements();
14771 bool isOnlyLowElement = true;
14772 bool usesOnlyOneValue = true;
14773 bool usesOnlyOneConstantValue = true;
14774 bool isConstant = true;
14775 bool AllLanesExtractElt = true;
14776 unsigned NumConstantLanes = 0;
14777 unsigned NumDifferentLanes = 0;
14778 unsigned NumUndefLanes = 0;
14779 SDValue Value;
14780 SDValue ConstantValue;
14781 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
14782 unsigned ConsecutiveValCount = 0;
14783 SDValue PrevVal;
14784 for (unsigned i = 0; i < NumElts; ++i) {
14785 SDValue V = Op.getOperand(i);
14786 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14787 AllLanesExtractElt = false;
14788 if (V.isUndef()) {
14789 ++NumUndefLanes;
14790 continue;
14791 }
14792 if (i > 0)
14793 isOnlyLowElement = false;
14794 if (!isIntOrFPConstant(V))
14795 isConstant = false;
14796
14797 if (isIntOrFPConstant(V)) {
14798 ++NumConstantLanes;
14799 if (!ConstantValue.getNode())
14800 ConstantValue = V;
14801 else if (ConstantValue != V)
14802 usesOnlyOneConstantValue = false;
14803 }
14804
14805 if (!Value.getNode())
14806 Value = V;
14807 else if (V != Value) {
14808 usesOnlyOneValue = false;
14809 ++NumDifferentLanes;
14810 }
14811
14812 if (PrevVal != V) {
14813 ConsecutiveValCount = 0;
14814 PrevVal = V;
14815 }
14816
14817 // Keep the different values and their last consecutive counts. For example,
14818 //
14819 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
14820 // t24, t24, t24, t24, t24, t24, t24, t24
14821 // t23 = consecutive count 8
14822 // t24 = consecutive count 8
14823 // ------------------------------------------------------------------
14824 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
14825 // t24, t24, t24, t24, t24, t24, t24, t24
14826 // t23 = consecutive count 5
14827 // t24 = consecutive count 9
14828 DifferentValueMap[V] = ++ConsecutiveValCount;
14829 }
14830
14831 if (!Value.getNode()) {
14832 LLVM_DEBUG(
14833 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
14834 return DAG.getUNDEF(VT);
14835 }
14836
14837 // Convert BUILD_VECTOR where all elements but the lowest are undef into
14838 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
14839 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
14840 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
14841 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
14842 "SCALAR_TO_VECTOR node\n");
14843 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
14844 }
14845
14846 if (AllLanesExtractElt) {
14847 SDNode *Vector = nullptr;
14848 bool Even = false;
14849 bool Odd = false;
14850 // Check whether the extract elements match the Even pattern <0,2,4,...> or
14851 // the Odd pattern <1,3,5,...>.
14852 for (unsigned i = 0; i < NumElts; ++i) {
14853 SDValue V = Op.getOperand(i);
14854 const SDNode *N = V.getNode();
14855 if (!isa<ConstantSDNode>(N->getOperand(1))) {
14856 Even = false;
14857 Odd = false;
14858 break;
14859 }
14860 SDValue N0 = N->getOperand(0);
14861
14862 // All elements are extracted from the same vector.
14863 if (!Vector) {
14864 Vector = N0.getNode();
14865 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
14866 // BUILD_VECTOR.
14867 if (VT.getVectorElementType() !=
14868 N0.getValueType().getVectorElementType())
14869 break;
14870 } else if (Vector != N0.getNode()) {
14871 Odd = false;
14872 Even = false;
14873 break;
14874 }
14875
14876 // Extracted values are either at Even indices <0,2,4,...> or at Odd
14877 // indices <1,3,5,...>.
14878 uint64_t Val = N->getConstantOperandVal(1);
14879 if (Val == 2 * i) {
14880 Even = true;
14881 continue;
14882 }
14883 if (Val - 1 == 2 * i) {
14884 Odd = true;
14885 continue;
14886 }
14887
14888 // Something does not match: abort.
14889 Odd = false;
14890 Even = false;
14891 break;
14892 }
14893 if (Even || Odd) {
14894 SDValue LHS =
14895 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
14896 DAG.getConstant(0, dl, MVT::i64));
14897 SDValue RHS =
14898 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
14899 DAG.getConstant(NumElts, dl, MVT::i64));
14900
14901 if (Even && !Odd)
14902 return DAG.getNode(AArch64ISD::UZP1, dl, VT, LHS, RHS);
14903 if (Odd && !Even)
14904 return DAG.getNode(AArch64ISD::UZP2, dl, VT, LHS, RHS);
14905 }
14906 }
14907
14908 // Use DUP for non-constant splats. For f32 constant splats, reduce to
14909 // i32 and try again.
14910 if (usesOnlyOneValue) {
14911 if (!isConstant) {
14912 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14913 Value.getValueType() != VT) {
14914 LLVM_DEBUG(
14915 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
14916 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
14917 }
14918
14919 // This is actually a DUPLANExx operation, which keeps everything vectory.
14920
14921 SDValue Lane = Value.getOperand(1);
14922 Value = Value.getOperand(0);
14923 if (Value.getValueSizeInBits() == 64) {
14924 LLVM_DEBUG(
14925 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
14926 "widening it\n");
14927 Value = WidenVector(Value, DAG);
14928 }
14929
14930 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
14931 return DAG.getNode(Opcode, dl, VT, Value, Lane);
14932 }
14933
14934 if (VT.getVectorElementType().isFloatingPoint()) {
14935 SmallVector<SDValue, 8> Ops;
14936 EVT EltTy = VT.getVectorElementType();
14937 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
14938 EltTy == MVT::f64) && "Unsupported floating-point vector type");
14939 LLVM_DEBUG(
14940 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
14941 "BITCASTS, and try again\n");
14942 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
14943 for (unsigned i = 0; i < NumElts; ++i)
14944 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
14945 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
14946 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
14947 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
14948 Val.dump(););
14949 Val = LowerBUILD_VECTOR(Val, DAG);
14950 if (Val.getNode())
14951 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
14952 }
14953 }
14954
14955 // If we need to insert a small number of different non-constant elements and
14956 // the vector width is sufficiently large, prefer using DUP with the common
14957 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
14958 // skip the constant lane handling below.
14959 bool PreferDUPAndInsert =
14960 !isConstant && NumDifferentLanes >= 1 &&
14961 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
14962 NumDifferentLanes >= NumConstantLanes;
14963
14964 // If there was only one constant value used and for more than one lane,
14965 // start by splatting that value, then replace the non-constant lanes. This
14966 // is better than the default, which will perform a separate initialization
14967 // for each lane.
14968 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
14969 // Firstly, try to materialize the splat constant.
14970 SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue);
14971 unsigned BitSize = VT.getScalarSizeInBits();
14972 APInt ConstantValueAPInt(1, 0);
14973 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
14974 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
14975 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
14976 !ConstantValueAPInt.isAllOnes()) {
14977 Val = ConstantBuildVector(Val, DAG, Subtarget);
14978 if (!Val)
14979 // Otherwise, materialize the constant and splat it.
14980 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
14981 }
14982
14983 // Now insert the non-constant lanes.
14984 for (unsigned i = 0; i < NumElts; ++i) {
14985 SDValue V = Op.getOperand(i);
14986 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
14987 if (!isIntOrFPConstant(V))
14988 // Note that type legalization likely mucked about with the VT of the
14989 // source operand, so we may have to convert it here before inserting.
14990 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
14991 }
14992 return Val;
14993 }
14994
14995 // This will generate a load from the constant pool.
14996 if (isConstant) {
14997 LLVM_DEBUG(
14998 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
14999 "expansion\n");
15000 return SDValue();
15001 }
15002
15003 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
15004 // v4i32s. This is really a truncate, which we can construct out of (legal)
15005 // concats and truncate nodes.
15006 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
15007 return M;
15008
15009 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
15010 if (NumElts >= 4) {
15011 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
15012 return Shuffle;
15013
15014 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
15015 return Shuffle;
15016 }
15017
15018 if (PreferDUPAndInsert) {
15019 // First, build a constant vector with the common element.
15020 SmallVector<SDValue, 8> Ops(NumElts, Value);
15021 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
15022 // Next, insert the elements that do not match the common value.
15023 for (unsigned I = 0; I < NumElts; ++I)
15024 if (Op.getOperand(I) != Value)
15025 NewVector =
15026 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
15027 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
15028
15029 return NewVector;
15030 }
15031
15032 // If vector consists of two different values, try to generate two DUPs and
15033 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
15034 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
15035 SmallVector<SDValue, 2> Vals;
15036 // Check that the consecutive count of each value is half the number of vector
15037 // elements. In this case, we can use CONCAT_VECTORS. For example,
15038 //
15039 // canUseVECTOR_CONCAT = true;
15040 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15041 // t24, t24, t24, t24, t24, t24, t24, t24
15042 //
15043 // canUseVECTOR_CONCAT = false;
15044 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
15045 // t24, t24, t24, t24, t24, t24, t24, t24
15046 bool canUseVECTOR_CONCAT = true;
15047 for (auto Pair : DifferentValueMap) {
15048 // Check different values have same length which is NumElts / 2.
15049 if (Pair.second != NumElts / 2)
15050 canUseVECTOR_CONCAT = false;
15051 Vals.push_back(Pair.first);
15052 }
15053
15054 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
15055 // CONCAT_VECTORs. For example,
15056 //
15057 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
15058 // t24, t24, t24, t24, t24, t24, t24, t24
15059 // ==>
15060 // t26: v8i8 = AArch64ISD::DUP t23
15061 // t28: v8i8 = AArch64ISD::DUP t24
15062 // t29: v16i8 = concat_vectors t26, t28
15063 if (canUseVECTOR_CONCAT) {
15064 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15065 if (isTypeLegal(SubVT) && SubVT.isVector() &&
15066 SubVT.getVectorNumElements() >= 2) {
15067 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
15068 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
15069 SDValue DUP1 =
15070 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
15071 SDValue DUP2 =
15072 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
15073 SDValue CONCAT_VECTORS =
15074 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
15075 return CONCAT_VECTORS;
15076 }
15077 }
15078
15079 // Let's try to generate VECTOR_SHUFFLE. For example,
15080 //
15081 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
15082 // ==>
15083 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
15084 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
15085 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
15086 if (NumElts >= 8) {
15087 SmallVector<int, 16> MaskVec;
15088 // Build mask for VECTOR_SHUFFLE.
15089 SDValue FirstLaneVal = Op.getOperand(0);
15090 for (unsigned i = 0; i < NumElts; ++i) {
15091 SDValue Val = Op.getOperand(i);
15092 if (FirstLaneVal == Val)
15093 MaskVec.push_back(i);
15094 else
15095 MaskVec.push_back(i + NumElts);
15096 }
15097
15098 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
15099 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
15100 SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1);
15101 SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2);
15102 SDValue VECTOR_SHUFFLE =
15103 DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec);
15104 return VECTOR_SHUFFLE;
15105 }
15106 }
15107
15108 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
15109 // know the default expansion would otherwise fall back on something even
15110 // worse. For a vector with one or two non-undef values, that's
15111 // scalar_to_vector for the elements followed by a shuffle (provided the
15112 // shuffle is valid for the target); for everything else, it is
15113 // materialization element by element on the stack followed by a load.
15114 if (!isConstant && !usesOnlyOneValue) {
15115 LLVM_DEBUG(
15116 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
15117 "of INSERT_VECTOR_ELT\n");
15118
15119 SDValue Vec = DAG.getUNDEF(VT);
15120 SDValue Op0 = Op.getOperand(0);
15121 unsigned i = 0;
15122
15123 // Use SCALAR_TO_VECTOR for lane zero to
15124 // a) Avoid a RMW dependency on the full vector register, and
15125 // b) Allow the register coalescer to fold away the copy if the
15126 // value is already in an S or D register, and we're forced to emit an
15127 // INSERT_SUBREG that we can't fold anywhere.
15128 //
15129 // We also allow types like i8 and i16 which are illegal scalar but legal
15130 // vector element types. After type-legalization the inserted value is
15131 // extended (i32) and it is safe to cast them to the vector type by ignoring
15132 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
15133 if (!Op0.isUndef()) {
15134 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
15135 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
15136 ++i;
15137 }
15138 LLVM_DEBUG({
15139 if (i < NumElts)
15140 dbgs() << "Creating nodes for the other vector elements:\n";
15141 });
15142 for (; i < NumElts; ++i) {
15143 SDValue V = Op.getOperand(i);
15144 if (V.isUndef())
15145 continue;
15146 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
15147 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
15148 }
15149 return Vec;
15150 }
15151
15152 LLVM_DEBUG(
15153 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
15154 "better alternative\n");
15155 return SDValue();
15156}
15157
15158SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
15159 SelectionDAG &DAG) const {
15160 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15161 !Subtarget->isNeonAvailable()))
15162 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
15163
15164 assert(Op.getValueType().isScalableVector() &&
15165 isTypeLegal(Op.getValueType()) &&
15166 "Expected legal scalable vector type!");
15167
15168 if (isTypeLegal(Op.getOperand(0).getValueType())) {
15169 unsigned NumOperands = Op->getNumOperands();
15170 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
15171 "Unexpected number of operands in CONCAT_VECTORS");
15172
15173 if (NumOperands == 2)
15174 return Op;
15175
15176 // Concat each pair of subvectors and pack into the lower half of the array.
15177 SmallVector<SDValue> ConcatOps(Op->ops());
15178 while (ConcatOps.size() > 1) {
15179 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
15180 SDValue V1 = ConcatOps[I];
15181 SDValue V2 = ConcatOps[I + 1];
15182 EVT SubVT = V1.getValueType();
15183 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
15184 ConcatOps[I / 2] =
15185 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
15186 }
15187 ConcatOps.resize(ConcatOps.size() / 2);
15188 }
15189 return ConcatOps[0];
15190 }
15191
15192 return SDValue();
15193}
15194
15195SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15196 SelectionDAG &DAG) const {
15197 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
15198
15199 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15200 !Subtarget->isNeonAvailable()))
15201 return LowerFixedLengthInsertVectorElt(Op, DAG);
15202
15203 EVT VT = Op.getOperand(0).getValueType();
15204
15205 if (VT.getScalarType() == MVT::i1) {
15206 EVT VectorVT = getPromotedVTForPredicate(VT);
15207 SDLoc DL(Op);
15208 SDValue ExtendedVector =
15209 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
15210 SDValue ExtendedValue =
15211 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
15212 VectorVT.getScalarType().getSizeInBits() < 32
15213 ? MVT::i32
15214 : VectorVT.getScalarType());
15215 ExtendedVector =
15216 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
15217 ExtendedValue, Op.getOperand(2));
15218 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
15219 }
15220
15221 // Check for non-constant or out of range lane.
15222 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
15223 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15224 return SDValue();
15225
15226 return Op;
15227}
15228
15229SDValue
15230AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
15231 SelectionDAG &DAG) const {
15232 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
15233 EVT VT = Op.getOperand(0).getValueType();
15234
15235 if (VT.getScalarType() == MVT::i1) {
15236 // We can't directly extract from an SVE predicate; extend it first.
15237 // (This isn't the only possible lowering, but it's straightforward.)
15238 EVT VectorVT = getPromotedVTForPredicate(VT);
15239 SDLoc DL(Op);
15240 SDValue Extend =
15241 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
15242 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
15243 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
15244 Extend, Op.getOperand(1));
15245 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
15246 }
15247
15248 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15249 return LowerFixedLengthExtractVectorElt(Op, DAG);
15250
15251 // Check for non-constant or out of range lane.
15252 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
15253 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15254 return SDValue();
15255
15256 // Insertion/extraction are legal for V128 types.
15257 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
15258 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
15259 VT == MVT::v8f16 || VT == MVT::v8bf16)
15260 return Op;
15261
15262 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
15263 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
15264 VT != MVT::v4bf16)
15265 return SDValue();
15266
15267 // For V64 types, we perform extraction by expanding the value
15268  // to a V128 type and performing the extraction on that.
15269 SDLoc DL(Op);
15270 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
15271 EVT WideTy = WideVec.getValueType();
15272
15273 EVT ExtrTy = WideTy.getVectorElementType();
15274 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
15275 ExtrTy = MVT::i32;
15276
15277 // For extractions, we just return the result directly.
15278 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
15279 Op.getOperand(1));
15280}
15281
15282SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
15283 SelectionDAG &DAG) const {
15284 EVT VT = Op.getValueType();
15286 "Only cases that extract a fixed length vector are supported!");
15287 EVT InVT = Op.getOperand(0).getValueType();
15288
15289 // If we don't have legal types yet, do nothing
15290 if (!isTypeLegal(InVT))
15291 return SDValue();
15292
15293 if (InVT.is128BitVector()) {
15294 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
15295 unsigned Idx = Op.getConstantOperandVal(1);
15296
15297 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
15298 if (Idx == 0)
15299 return Op;
15300
15301 // If this is extracting the upper 64-bits of a 128-bit vector, we match
15302 // that directly.
15303 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
15304 return Op;
15305 }
15306
15307 if (InVT.isScalableVector() ||
15308 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
15309 SDLoc DL(Op);
15310 SDValue Vec = Op.getOperand(0);
15311 SDValue Idx = Op.getOperand(1);
15312
15313    EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
15314    if (PackedVT != InVT) {
15315 // Pack input into the bottom part of an SVE register and try again.
15316 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
15317 DAG.getUNDEF(PackedVT), Vec,
15318 DAG.getVectorIdxConstant(0, DL));
15319 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
15320 }
15321
15322 // This will get matched by custom code during ISelDAGToDAG.
15323 if (isNullConstant(Idx))
15324 return Op;
15325
15326 assert(InVT.isScalableVector() && "Unexpected vector type!");
15327 // Move requested subvector to the start of the vector and try again.
15328 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
15329 return convertFromScalableVector(DAG, VT, Splice);
15330 }
15331
15332 return SDValue();
15333}
15334
15335SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
15336 SelectionDAG &DAG) const {
15337 assert(Op.getValueType().isScalableVector() &&
15338 "Only expect to lower inserts into scalable vectors!");
15339
15340 EVT InVT = Op.getOperand(1).getValueType();
15341 unsigned Idx = Op.getConstantOperandVal(2);
15342
15343 SDValue Vec0 = Op.getOperand(0);
15344 SDValue Vec1 = Op.getOperand(1);
15345 SDLoc DL(Op);
15346 EVT VT = Op.getValueType();
15347
15348 if (InVT.isScalableVector()) {
15349 if (!isTypeLegal(VT))
15350 return SDValue();
15351
15352 // Break down insert_subvector into simpler parts.
15353 if (VT.getVectorElementType() == MVT::i1) {
15354 unsigned NumElts = VT.getVectorMinNumElements();
15355 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15356
15357 SDValue Lo, Hi;
15358 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
15359 DAG.getVectorIdxConstant(0, DL));
15360 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
15361 DAG.getVectorIdxConstant(NumElts / 2, DL));
15362 if (Idx < (NumElts / 2))
15363 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
15364                       DAG.getVectorIdxConstant(Idx, DL));
15365      else
15366 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
15367 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
15368
15369 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15370 }
15371
15372 // We can select these directly.
15373 if (isTypeLegal(InVT) && Vec0.isUndef())
15374 return Op;
15375
15376 // Ensure the subvector is half the size of the main vector.
15377 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
15378 return SDValue();
15379
15380    // Here narrow and wide refer to the vector element types. After "casting"
15381 // both vectors must have the same bit length and so because the subvector
15382 // has fewer elements, those elements need to be bigger.
15383    EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
15384    EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
15385
15386 // NOP cast operands to the largest legal vector of the same element count.
15387 if (VT.isFloatingPoint()) {
15388 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
15389 Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
15390 } else {
15391 // Legal integer vectors are already their largest so Vec0 is fine as is.
15392 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
15393 Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
15394 }
15395
15396 // To replace the top/bottom half of vector V with vector SubV we widen the
15397 // preserved half of V, concatenate this to SubV (the order depending on the
15398 // half being replaced) and then narrow the result.
15399 SDValue Narrow;
15400 if (Idx == 0) {
15401 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
15402 HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
15403 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
15404 } else {
15406 "Invalid subvector index!");
15407 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
15408 LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
15409 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
15410 }
15411
15412 return getSVESafeBitCast(VT, Narrow, DAG);
15413 }
15414
15415 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
15416 // This will be matched by custom code during ISelDAGToDAG.
15417 if (Vec0.isUndef())
15418 return Op;
15419
15420 std::optional<unsigned> PredPattern =
15421        getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
15422    auto PredTy = VT.changeVectorElementType(MVT::i1);
15423 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
15424 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
15425 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
15426 }
15427
15428 return SDValue();
15429}
15430
15431static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
15432 if (Op.getOpcode() != AArch64ISD::DUP &&
15433 Op.getOpcode() != ISD::SPLAT_VECTOR &&
15434 Op.getOpcode() != ISD::BUILD_VECTOR)
15435 return false;
15436
15437 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
15438 !isAllConstantBuildVector(Op, SplatVal))
15439 return false;
15440
15441 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
15442 !isa<ConstantSDNode>(Op->getOperand(0)))
15443 return false;
15444
15445 SplatVal = Op->getConstantOperandVal(0);
15446 if (Op.getValueType().getVectorElementType() != MVT::i64)
15447 SplatVal = (int32_t)SplatVal;
15448
15449 Negated = false;
15450 if (isPowerOf2_64(SplatVal))
15451 return true;
15452
15453 Negated = true;
15454 if (isPowerOf2_64(-SplatVal)) {
15455 SplatVal = -SplatVal;
15456 return true;
15457 }
15458
15459 return false;
15460}
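// For example, a splat of -8 leaves SplatVal = 8 with Negated = true; LowerDIV
// below uses this to turn a signed divide by +/-2^k into a predicated
// rounding arithmetic shift (SRAD), followed by a negate when needed.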
15461
15462SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
15463 EVT VT = Op.getValueType();
15464 SDLoc dl(Op);
15465
15466 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
15467 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
15468
15469 assert(VT.isScalableVector() && "Expected a scalable vector.");
15470
15471 bool Signed = Op.getOpcode() == ISD::SDIV;
15472 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
15473
15474 bool Negated;
15475 uint64_t SplatVal;
15476 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
15477 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
15478 SDValue Res =
15479 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
15480 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
15481 if (Negated)
15482 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
15483
15484 return Res;
15485 }
15486
15487 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
15488 return LowerToPredicatedOp(Op, DAG, PredOpcode);
15489
15490 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
15491 // operations, and truncate the result.
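  // For example, an nxv16i8 division is unpacked (lo/hi) into two nxv8i16
  // divisions, which this same path re-lowers to nxv4i32 divisions; the halves
  // are then cast back and re-packed with UZP1.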
15492 EVT WidenedVT;
15493 if (VT == MVT::nxv16i8)
15494 WidenedVT = MVT::nxv8i16;
15495 else if (VT == MVT::nxv8i16)
15496 WidenedVT = MVT::nxv4i32;
15497 else
15498 llvm_unreachable("Unexpected Custom DIV operation");
15499
15500 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
15501 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
15502 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
15503 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
15504 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
15505 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
15506 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
15507 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
15508 SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, dl, VT, ResultLo);
15509 SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, dl, VT, ResultHi);
15510 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLoCast, ResultHiCast);
15511}
15512
15513bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
15514 EVT VT, unsigned DefinedValues) const {
15515 if (!Subtarget->isNeonAvailable())
15516 return false;
15517  return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
15518}
15519
15520bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
15521  // Currently no fixed length shuffles that require SVE are legal.
15522 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15523 return false;
15524
15525 if (VT.getVectorNumElements() == 4 &&
15526 (VT.is128BitVector() || VT.is64BitVector())) {
15527 unsigned Cost = getPerfectShuffleCost(M);
15528 if (Cost <= 1)
15529 return true;
15530 }
15531
15532 bool DummyBool;
15533 int DummyInt;
15534 unsigned DummyUnsigned;
15535
15536 unsigned EltSize = VT.getScalarSizeInBits();
15537 unsigned NumElts = VT.getVectorNumElements();
15538 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
15539 isREVMask(M, EltSize, NumElts, 64) ||
15540 isREVMask(M, EltSize, NumElts, 32) ||
15541 isREVMask(M, EltSize, NumElts, 16) ||
15542 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
15543 isTRNMask(M, NumElts, DummyUnsigned) ||
15544 isUZPMask(M, NumElts, DummyUnsigned) ||
15545 isZIPMask(M, NumElts, DummyUnsigned) ||
15546 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
15547 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
15548 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
15549 isINSMask(M, NumElts, DummyBool, DummyInt) ||
15550 isConcatMask(M, VT, VT.getSizeInBits() == 128));
15551}
15552
15553bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
15554                                                   EVT VT) const {
15555  // Just delegate to the generic legality; clear masks aren't special.
15556 return isShuffleMaskLegal(M, VT);
15557}
15558
15559/// getVShiftImm - Check if this is a valid build_vector for the immediate
15560/// operand of a vector shift operation, where all the elements of the
15561/// build_vector must have the same constant integer value.
15562static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
15563 // Ignore bit_converts.
15564 while (Op.getOpcode() == ISD::BITCAST)
15565 Op = Op.getOperand(0);
15566 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
15567 APInt SplatBits, SplatUndef;
15568 unsigned SplatBitSize;
15569 bool HasAnyUndefs;
15570 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
15571 HasAnyUndefs, ElementBits) ||
15572 SplatBitSize > ElementBits)
15573 return false;
15574 Cnt = SplatBits.getSExtValue();
15575 return true;
15576}
15577
15578/// isVShiftLImm - Check if this is a valid build_vector for the immediate
15579/// operand of a vector shift left operation. That value must be in the range:
15580/// 0 <= Value < ElementBits for a left shift; or
15581/// 0 <= Value <= ElementBits for a long left shift.
15582static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
15583 assert(VT.isVector() && "vector shift count is not a vector type");
15584 int64_t ElementBits = VT.getScalarSizeInBits();
15585 if (!getVShiftImm(Op, ElementBits, Cnt))
15586 return false;
15587 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
15588}
15589
15590/// isVShiftRImm - Check if this is a valid build_vector for the immediate
15591/// operand of a vector shift right operation. The value must be in the range:
15592///   1 <= Value <= ElementBits for a right shift; or
15593///   1 <= Value <= ElementBits/2 for a narrowing right shift.
15593static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
15594 assert(VT.isVector() && "vector shift count is not a vector type");
15595 int64_t ElementBits = VT.getScalarSizeInBits();
15596 if (!getVShiftImm(Op, ElementBits, Cnt))
15597 return false;
15598 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
15599}
15600
15601SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
15602 SelectionDAG &DAG) const {
15603 EVT VT = Op.getValueType();
15604
15605 if (VT.getScalarType() == MVT::i1) {
15606 // Lower i1 truncate to `(x & 1) != 0`.
15607 SDLoc dl(Op);
15608 EVT OpVT = Op.getOperand(0).getValueType();
15609 SDValue Zero = DAG.getConstant(0, dl, OpVT);
15610 SDValue One = DAG.getConstant(1, dl, OpVT);
15611 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
15612 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
15613 }
15614
15615 if (!VT.isVector() || VT.isScalableVector())
15616 return SDValue();
15617
15618 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
15619 !Subtarget->isNeonAvailable()))
15620 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
15621
15622 return SDValue();
15623}
15624
15625// Check if we can lower this SRL to a rounding shift instruction. ResVT is
15626// possibly a truncated type; it tells how many bits of the value are to be
15627// used.
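// The pattern matched below is roughly srl (add X, 1 << (ShiftValue - 1)), ShiftValue,
// i.e. a rounding right shift, which can be selected as URSHR when only ResVT's
// worth of result bits is consumed.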
15628static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
15629                                            SelectionDAG &DAG,
15630 unsigned &ShiftValue,
15631 SDValue &RShOperand) {
15632 if (Shift->getOpcode() != ISD::SRL)
15633 return false;
15634
15635 EVT VT = Shift.getValueType();
15636 assert(VT.isScalableVT());
15637
15638 auto ShiftOp1 =
15639 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
15640 if (!ShiftOp1)
15641 return false;
15642
15643 ShiftValue = ShiftOp1->getZExtValue();
15644 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
15645 return false;
15646
15647 SDValue Add = Shift->getOperand(0);
15648 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
15649 return false;
15650
15652 "ResVT must be truncated or same type as the shift.");
15653 // Check if an overflow can lead to incorrect results.
15654 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
15655 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
15656 return false;
15657
15658 auto AddOp1 =
15659 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
15660 if (!AddOp1)
15661 return false;
15662 uint64_t AddValue = AddOp1->getZExtValue();
15663 if (AddValue != 1ULL << (ShiftValue - 1))
15664 return false;
15665
15666 RShOperand = Add->getOperand(0);
15667 return true;
15668}
15669
15670SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
15671 SelectionDAG &DAG) const {
15672 EVT VT = Op.getValueType();
15673 SDLoc DL(Op);
15674 int64_t Cnt;
15675
15676 if (!Op.getOperand(1).getValueType().isVector())
15677 return Op;
15678 unsigned EltSize = VT.getScalarSizeInBits();
15679
15680 switch (Op.getOpcode()) {
15681 case ISD::SHL:
15682 if (VT.isScalableVector() ||
15683        useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15684      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
15685
15686 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
15687 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
15688 DAG.getConstant(Cnt, DL, MVT::i32));
15689 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
15690 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
15691 MVT::i32),
15692 Op.getOperand(0), Op.getOperand(1));
15693 case ISD::SRA:
15694 case ISD::SRL:
15695 if (VT.isScalableVector() &&
15696 (Subtarget->hasSVE2() ||
15697 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
15698 SDValue RShOperand;
15699 unsigned ShiftValue;
15700 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
15701 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
15702 getPredicateForVector(DAG, DL, VT), RShOperand,
15703 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
15704 }
15705
15706 if (VT.isScalableVector() ||
15707 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
15708 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
15709                                                : AArch64ISD::SRL_PRED;
15710      return LowerToPredicatedOp(Op, DAG, Opc);
15711 }
15712
15713 // Right shift immediate
15714 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
15715 unsigned Opc =
15716 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
15717 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
15718 DAG.getConstant(Cnt, DL, MVT::i32), Op->getFlags());
15719 }
15720
15721    // Right shift register. Note: there is no shift right register
15722 // instruction, but the shift left register instruction takes a signed
15723 // value, where negative numbers specify a right shift.
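    // For example, an SRL by y is emitted as ushl by (0 - y), and an SRA by y
    // as sshl by (0 - y).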
15724 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
15725 : Intrinsic::aarch64_neon_ushl;
15726 // negate the shift amount
15727 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
15728 Op.getOperand(1));
15729 SDValue NegShiftLeft =
15730        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
15731                    DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
15732 NegShift);
15733 return NegShiftLeft;
15734 }
15735
15736 llvm_unreachable("unexpected shift opcode");
15737}
15738
15739static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
15740                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
15741 const SDLoc &dl, SelectionDAG &DAG) {
15742 EVT SrcVT = LHS.getValueType();
15743 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
15744 "function only supposed to emit natural comparisons");
15745
15746 APInt SplatValue;
15747 APInt SplatUndef;
15748 unsigned SplatBitSize = 0;
15749 bool HasAnyUndefs;
15750
15751 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
15752 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
15753 SplatBitSize, HasAnyUndefs);
15754
15755 bool IsZero = IsCnst && SplatValue == 0;
15756 bool IsOne =
15757 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
15758 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
15759
15760 if (SrcVT.getVectorElementType().isFloatingPoint()) {
15761 switch (CC) {
15762 default:
15763 return SDValue();
15764 case AArch64CC::NE: {
15765 SDValue Fcmeq;
15766 if (IsZero)
15767 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
15768 else
15769 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
15770 return DAG.getNOT(dl, Fcmeq, VT);
15771 }
15772 case AArch64CC::EQ:
15773 if (IsZero)
15774 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
15775 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
15776 case AArch64CC::GE:
15777 if (IsZero)
15778 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
15779 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
15780 case AArch64CC::GT:
15781 if (IsZero)
15782 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
15783 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
15784 case AArch64CC::LE:
15785 if (!NoNans)
15786 return SDValue();
15787      // If we ignore NaNs then we can use the LS implementation.
15788 [[fallthrough]];
15789 case AArch64CC::LS:
15790 if (IsZero)
15791 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
15792 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
15793 case AArch64CC::LT:
15794 if (!NoNans)
15795 return SDValue();
15796      // If we ignore NaNs then we can use the MI implementation.
15797 [[fallthrough]];
15798 case AArch64CC::MI:
15799 if (IsZero)
15800 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
15801 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
15802 }
15803 }
15804
15805 switch (CC) {
15806 default:
15807 return SDValue();
15808 case AArch64CC::NE: {
15809 SDValue Cmeq;
15810 if (IsZero)
15811 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
15812 else
15813 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
15814 return DAG.getNOT(dl, Cmeq, VT);
15815 }
15816 case AArch64CC::EQ:
15817 if (IsZero)
15818 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
15819 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
15820 case AArch64CC::GE:
15821 if (IsZero)
15822 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
15823 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
15824 case AArch64CC::GT:
15825 if (IsZero)
15826 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
15827 if (IsMinusOne)
15828 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
15829 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
15830 case AArch64CC::LE:
15831 if (IsZero)
15832 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
15833 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
15834 case AArch64CC::LS:
15835 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
15836 case AArch64CC::LO:
15837 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
15838 case AArch64CC::LT:
15839 if (IsZero)
15840 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
15841 if (IsOne)
15842 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
15843 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
15844 case AArch64CC::HI:
15845 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
15846 case AArch64CC::HS:
15847 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
15848 }
15849}
15850
15851SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
15852 SelectionDAG &DAG) const {
15853 if (Op.getValueType().isScalableVector())
15854 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
15855
15856 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
15857 !Subtarget->isNeonAvailable()))
15858 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
15859
15860 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15861 SDValue LHS = Op.getOperand(0);
15862 SDValue RHS = Op.getOperand(1);
15863 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
15864 SDLoc dl(Op);
15865
15866 if (LHS.getValueType().getVectorElementType().isInteger()) {
15867 assert(LHS.getValueType() == RHS.getValueType());
15868    AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
15869    SDValue Cmp =
15870 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
15871 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
15872 }
15873
15874 // Lower isnan(x) | isnan(never-nan) to x != x.
15875 // Lower !isnan(x) & !isnan(never-nan) to x == x.
15876 if (CC == ISD::SETUO || CC == ISD::SETO) {
15877 bool OneNaN = false;
15878 if (LHS == RHS) {
15879 OneNaN = true;
15880 } else if (DAG.isKnownNeverNaN(RHS)) {
15881 OneNaN = true;
15882 RHS = LHS;
15883 } else if (DAG.isKnownNeverNaN(LHS)) {
15884 OneNaN = true;
15885 LHS = RHS;
15886 }
15887 if (OneNaN) {
15888      CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
15889    }
15890 }
15891
15892 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
15893
15894 // Make v4f16 (only) fcmp operations utilise vector instructions
15895  // v8f16 support will be a little more complicated.
15896 if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
15897 LHS.getValueType().getVectorElementType() == MVT::bf16) {
15898 if (LHS.getValueType().getVectorNumElements() == 4) {
15899 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
15900 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
15901 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
15902 DAG.ReplaceAllUsesWith(Op, NewSetcc);
15903 CmpVT = MVT::v4i32;
15904 } else
15905 return SDValue();
15906 }
15907
15908 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
15909 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
15910 LHS.getValueType().getVectorElementType() != MVT::f128);
15911
15912 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
15913 // clean. Some of them require two branches to implement.
15914 AArch64CC::CondCode CC1, CC2;
15915 bool ShouldInvert;
15916 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
15917
15918 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
15919 SDValue Cmp =
15920 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
15921 if (!Cmp.getNode())
15922 return SDValue();
15923
15924 if (CC2 != AArch64CC::AL) {
15925 SDValue Cmp2 =
15926 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
15927 if (!Cmp2.getNode())
15928 return SDValue();
15929
15930 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
15931 }
15932
15933 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
15934
15935 if (ShouldInvert)
15936 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
15937
15938 return Cmp;
15939}
15940
15941static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
15942 SelectionDAG &DAG) {
15943 SDValue VecOp = ScalarOp.getOperand(0);
15944 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
15945 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
15946 DAG.getConstant(0, DL, MVT::i64));
15947}
15948
15949static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
15950 SDLoc DL, SelectionDAG &DAG) {
15951 unsigned ScalarOpcode;
15952 switch (Opcode) {
15953 case ISD::VECREDUCE_AND:
15954 ScalarOpcode = ISD::AND;
15955 break;
15956 case ISD::VECREDUCE_OR:
15957 ScalarOpcode = ISD::OR;
15958 break;
15959 case ISD::VECREDUCE_XOR:
15960 ScalarOpcode = ISD::XOR;
15961 break;
15962 default:
15963 llvm_unreachable("Expected bitwise vector reduction");
15964 return SDValue();
15965 }
15966
15967 EVT VecVT = Vec.getValueType();
15968 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
15969 "Expected power-of-2 length vector");
15970
15971 EVT ElemVT = VecVT.getVectorElementType();
15972
15973 SDValue Result;
15974 unsigned NumElems = VecVT.getVectorNumElements();
15975
15976 // Special case for boolean reductions
15977 if (ElemVT == MVT::i1) {
15978 // Split large vectors into smaller ones
15979 if (NumElems > 16) {
15980 SDValue Lo, Hi;
15981 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
15982 EVT HalfVT = Lo.getValueType();
15983 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
15984 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
15985 }
15986
15987 // Results of setcc operations get widened to 128 bits if their input
15988 // operands are 128 bits wide, otherwise vectors that are less than 64 bits
15989 // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
15990 // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
15991 // size leads to the best codegen, since e.g. setcc results might need to be
15992 // truncated otherwise.
15993 unsigned ExtendedWidth = 64;
15994 if (Vec.getOpcode() == ISD::SETCC &&
15995 Vec.getOperand(0).getValueSizeInBits() >= 128) {
15996 ExtendedWidth = 128;
15997 }
15998 EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
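    // For example, a v16i1 produced by a 128-bit setcc uses ExtendedWidth = 128,
    // giving 8-bit lanes (128 / 16) and a v16i8 input for the reduction below.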
15999
16000 // any_ext doesn't work with umin/umax, so only use it for uadd.
16001 unsigned ExtendOp =
16002 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
16003 SDValue Extended = DAG.getNode(
16004 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
16005 // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
16006 // in that case we bitcast the sign extended values from v2i64 to v4i32
16007 // before reduction for optimal code generation.
16008 if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
16009 NumElems == 2 && ExtendedWidth == 128) {
16010 Extended = DAG.getBitcast(MVT::v4i32, Extended);
16011 ExtendedVT = MVT::i32;
16012 }
16013 switch (ScalarOpcode) {
16014 case ISD::AND:
16015 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
16016 break;
16017 case ISD::OR:
16018 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
16019 break;
16020 case ISD::XOR:
16021 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
16022 break;
16023 default:
16024 llvm_unreachable("Unexpected Opcode");
16025 }
16026
16027 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
16028 } else {
16029 // Iteratively split the vector in half and combine using the bitwise
16030 // operation until it fits in a 64 bit register.
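    // For example, a v8i16 reduction is first combined down to v4i16 (64 bits),
    // bitcast to i64, folded with shifts of 32 and then 16 bits, and finally
    // truncated back to i16.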
16031 while (VecVT.getSizeInBits() > 64) {
16032 SDValue Lo, Hi;
16033 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16034 VecVT = Lo.getValueType();
16035 NumElems = VecVT.getVectorNumElements();
16036 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
16037 }
16038
16039 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
16040
16041 // Do the remaining work on a scalar since it allows the code generator to
16042 // combine the shift and bitwise operation into one instruction and since
16043 // integer instructions can have higher throughput than vector instructions.
16044 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
16045
16046 // Iteratively combine the lower and upper halves of the scalar using the
16047 // bitwise operation, halving the relevant region of the scalar in each
16048 // iteration, until the relevant region is just one element of the original
16049 // vector.
16050 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
16051 SDValue ShiftAmount =
16052 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
16053 SDValue Shifted =
16054 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
16055 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
16056 }
16057
16058 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
16059 }
16060
16061 return DAG.getAnyExtOrTrunc(Result, DL, VT);
16062}
16063
16064SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
16065 SelectionDAG &DAG) const {
16066 SDValue Src = Op.getOperand(0);
16067
16068 // Try to lower fixed length reductions to SVE.
16069 EVT SrcVT = Src.getValueType();
16070 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
16071 Op.getOpcode() == ISD::VECREDUCE_AND ||
16072 Op.getOpcode() == ISD::VECREDUCE_OR ||
16073 Op.getOpcode() == ISD::VECREDUCE_XOR ||
16074 Op.getOpcode() == ISD::VECREDUCE_FADD ||
16075 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
16076 SrcVT.getVectorElementType() == MVT::i64);
16077 if (SrcVT.isScalableVector() ||
16078      useSVEForFixedLengthVectorVT(
16079          SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
16080
16081 if (SrcVT.getVectorElementType() == MVT::i1)
16082 return LowerPredReductionToSVE(Op, DAG);
16083
16084 switch (Op.getOpcode()) {
16085 case ISD::VECREDUCE_ADD:
16086 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
16087 case ISD::VECREDUCE_AND:
16088 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
16089 case ISD::VECREDUCE_OR:
16090 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
16091    case ISD::VECREDUCE_SMAX:
16092      return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
16093    case ISD::VECREDUCE_SMIN:
16094      return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
16095    case ISD::VECREDUCE_UMAX:
16096      return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
16097    case ISD::VECREDUCE_UMIN:
16098      return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
16099 case ISD::VECREDUCE_XOR:
16100 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
16101    case ISD::VECREDUCE_FADD:
16102      return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
16103    case ISD::VECREDUCE_FMAX:
16104      return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
16105    case ISD::VECREDUCE_FMIN:
16106      return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
16107    case ISD::VECREDUCE_FMAXIMUM:
16108      return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
16109    case ISD::VECREDUCE_FMINIMUM:
16110      return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
16111 default:
16112 llvm_unreachable("Unhandled fixed length reduction");
16113 }
16114 }
16115
16116 // Lower NEON reductions.
16117 SDLoc dl(Op);
16118 switch (Op.getOpcode()) {
16119 case ISD::VECREDUCE_AND:
16120 case ISD::VECREDUCE_OR:
16121 case ISD::VECREDUCE_XOR:
16122 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
16123 Op.getValueType(), dl, DAG);
16124 case ISD::VECREDUCE_ADD:
16125 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
16126  case ISD::VECREDUCE_SMAX:
16127    return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
16128  case ISD::VECREDUCE_SMIN:
16129    return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
16130  case ISD::VECREDUCE_UMAX:
16131    return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
16132  case ISD::VECREDUCE_UMIN:
16133    return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
16134 default:
16135 llvm_unreachable("Unhandled reduction");
16136 }
16137}
16138
16139SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
16140 SelectionDAG &DAG) const {
16141 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
16142 // No point replacing if we don't have the relevant instruction/libcall anyway
16143 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
16144 return SDValue();
16145
16146 // LSE has an atomic load-clear instruction, but not a load-and.
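  // and(x, v) == clr(x, ~v), so invert the operand and emit ATOMIC_LOAD_CLR.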
16147 SDLoc dl(Op);
16148 MVT VT = Op.getSimpleValueType();
16149 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
16150 SDValue RHS = Op.getOperand(2);
16151 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
16152 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getAllOnesConstant(dl, VT), RHS);
16153 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
16154 Op.getOperand(0), Op.getOperand(1), RHS,
16155 AN->getMemOperand());
16156}
16157
16158SDValue
16159AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
16160 SelectionDAG &DAG) const {
16161
16162 SDLoc dl(Op);
16163 // Get the inputs.
16164 SDNode *Node = Op.getNode();
16165 SDValue Chain = Op.getOperand(0);
16166 SDValue Size = Op.getOperand(1);
16167  MaybeAlign Align =
16168      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16169 EVT VT = Node->getValueType(0);
16170
16172 "no-stack-arg-probe")) {
16173 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
16174 Chain = SP.getValue(1);
16175 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
16176 if (Align)
16177 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16178 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
16179 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
16180 SDValue Ops[2] = {SP, Chain};
16181 return DAG.getMergeValues(Ops, dl);
16182 }
16183
16184 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
16185
16186 EVT PtrVT = getPointerTy(DAG.getDataLayout());
16188 PtrVT, 0);
16189
16190 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
16191 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
16192 if (Subtarget->hasCustomCallingConv())
16193 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
16194
16195 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
16196 DAG.getConstant(4, dl, MVT::i64));
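  // The stack probe helper expects the allocation size in X15 in units of 16
  // bytes, hence the shift right by 4 here and the matching shift left by 4
  // after the call.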
16197 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
16198 Chain =
16199 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
16200 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
16201 DAG.getRegisterMask(Mask), Chain.getValue(1));
16202 // To match the actual intent better, we should read the output from X15 here
16203 // again (instead of potentially spilling it to the stack), but rereading Size
16204 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
16205 // here.
16206
16207 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
16208 DAG.getConstant(4, dl, MVT::i64));
16209
16210 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
16211 Chain = SP.getValue(1);
16212 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
16213 if (Align)
16214 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16215 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
16216 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
16217
16218 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
16219
16220 SDValue Ops[2] = {SP, Chain};
16221 return DAG.getMergeValues(Ops, dl);
16222}
16223
16224SDValue
16225AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
16226 SelectionDAG &DAG) const {
16227 // Get the inputs.
16228 SDNode *Node = Op.getNode();
16229 SDValue Chain = Op.getOperand(0);
16230 SDValue Size = Op.getOperand(1);
16231
16232  MaybeAlign Align =
16233      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16234 SDLoc dl(Op);
16235 EVT VT = Node->getValueType(0);
16236
16237 // Construct the new SP value in a GPR.
16238 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
16239 Chain = SP.getValue(1);
16240 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
16241 if (Align)
16242 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16243 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
16244
16245 // Set the real SP to the new value with a probing loop.
16246 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
16247 SDValue Ops[2] = {SP, Chain};
16248 return DAG.getMergeValues(Ops, dl);
16249}
16250
16251SDValue
16252AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16253 SelectionDAG &DAG) const {
16254  MachineFunction &MF = DAG.getMachineFunction();
16255
16256 if (Subtarget->isTargetWindows())
16257 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
16258 else if (hasInlineStackProbe(MF))
16259 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
16260 else
16261 return SDValue();
16262}
16263
16264SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
16265 unsigned NewOp) const {
16266 if (Subtarget->hasSVE2())
16267 return LowerToPredicatedOp(Op, DAG, NewOp);
16268
16269 // Default to expand.
16270 return SDValue();
16271}
16272
16273SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
16274 SelectionDAG &DAG) const {
16275 EVT VT = Op.getValueType();
16276 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
16277
16278 SDLoc DL(Op);
16279 APInt MulImm = Op.getConstantOperandAPInt(0);
16280 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
16281 VT);
16282}
16283
16284/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
16285template <unsigned NumVecs>
16286static bool
16287setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
16288              AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
16289  Info.opc = ISD::INTRINSIC_VOID;
16290  // Retrieve EC from first vector argument.
16291 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
16292  const ElementCount EC = VT.getVectorElementCount();
16293#ifndef NDEBUG
16294 // Check the assumption that all input vectors are the same type.
16295 for (unsigned I = 0; I < NumVecs; ++I)
16296 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
16297 "Invalid type.");
16298#endif
16299 // memVT is `NumVecs * VT`.
16300  Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
16301                                EC * NumVecs);
16302 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
16303 Info.offset = 0;
16304 Info.align.reset();
16305  Info.flags = MachineMemOperand::MOStore;
16306  return true;
16307}
16308
16309/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
16310/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
16311/// specified in the intrinsic calls.
16312bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
16313                                               const CallInst &I,
16314 MachineFunction &MF,
16315 unsigned Intrinsic) const {
16316 auto &DL = I.getDataLayout();
16317 switch (Intrinsic) {
16318 case Intrinsic::aarch64_sve_st2:
16319 return setInfoSVEStN<2>(*this, DL, Info, I);
16320 case Intrinsic::aarch64_sve_st3:
16321 return setInfoSVEStN<3>(*this, DL, Info, I);
16322 case Intrinsic::aarch64_sve_st4:
16323 return setInfoSVEStN<4>(*this, DL, Info, I);
16324 case Intrinsic::aarch64_neon_ld2:
16325 case Intrinsic::aarch64_neon_ld3:
16326 case Intrinsic::aarch64_neon_ld4:
16327 case Intrinsic::aarch64_neon_ld1x2:
16328 case Intrinsic::aarch64_neon_ld1x3:
16329 case Intrinsic::aarch64_neon_ld1x4: {
16330    Info.opc = ISD::INTRINSIC_W_CHAIN;
16331    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
16332 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16333 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16334 Info.offset = 0;
16335 Info.align.reset();
16336 // volatile loads with NEON intrinsics not supported
16337    Info.flags = MachineMemOperand::MOLoad;
16338    return true;
16339 }
16340 case Intrinsic::aarch64_neon_ld2lane:
16341 case Intrinsic::aarch64_neon_ld3lane:
16342 case Intrinsic::aarch64_neon_ld4lane:
16343 case Intrinsic::aarch64_neon_ld2r:
16344 case Intrinsic::aarch64_neon_ld3r:
16345 case Intrinsic::aarch64_neon_ld4r: {
16346    Info.opc = ISD::INTRINSIC_W_CHAIN;
16347    // ldN returns a struct whose members all have the same vector type.
16348 Type *RetTy = I.getType();
16349 auto *StructTy = cast<StructType>(RetTy);
16350 unsigned NumElts = StructTy->getNumElements();
16351 Type *VecTy = StructTy->getElementType(0);
16352 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
16353 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
16354 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16355 Info.offset = 0;
16356 Info.align.reset();
16357 // volatile loads with NEON intrinsics not supported
16358    Info.flags = MachineMemOperand::MOLoad;
16359    return true;
16360 }
16361 case Intrinsic::aarch64_neon_st2:
16362 case Intrinsic::aarch64_neon_st3:
16363 case Intrinsic::aarch64_neon_st4:
16364 case Intrinsic::aarch64_neon_st1x2:
16365 case Intrinsic::aarch64_neon_st1x3:
16366 case Intrinsic::aarch64_neon_st1x4: {
16367    Info.opc = ISD::INTRINSIC_VOID;
16368    unsigned NumElts = 0;
16369 for (const Value *Arg : I.args()) {
16370 Type *ArgTy = Arg->getType();
16371 if (!ArgTy->isVectorTy())
16372 break;
16373 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
16374 }
16375 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16376 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16377 Info.offset = 0;
16378 Info.align.reset();
16379 // volatile stores with NEON intrinsics not supported
16380    Info.flags = MachineMemOperand::MOStore;
16381    return true;
16382 }
16383 case Intrinsic::aarch64_neon_st2lane:
16384 case Intrinsic::aarch64_neon_st3lane:
16385 case Intrinsic::aarch64_neon_st4lane: {
16386    Info.opc = ISD::INTRINSIC_VOID;
16387    unsigned NumElts = 0;
16388    // All the vector arguments have the same type.
16389 Type *VecTy = I.getArgOperand(0)->getType();
16390 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
16391
16392 for (const Value *Arg : I.args()) {
16393 Type *ArgTy = Arg->getType();
16394 if (!ArgTy->isVectorTy())
16395 break;
16396 NumElts += 1;
16397 }
16398
16399 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
16400 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16401 Info.offset = 0;
16402 Info.align.reset();
16403 // volatile stores with NEON intrinsics not supported
16404    Info.flags = MachineMemOperand::MOStore;
16405    return true;
16406 }
16407 case Intrinsic::aarch64_ldaxr:
16408 case Intrinsic::aarch64_ldxr: {
16409 Type *ValTy = I.getParamElementType(0);
16410    Info.opc = ISD::INTRINSIC_W_CHAIN;
16411    Info.memVT = MVT::getVT(ValTy);
16412 Info.ptrVal = I.getArgOperand(0);
16413 Info.offset = 0;
16414 Info.align = DL.getABITypeAlign(ValTy);
16415    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16416    return true;
16417 }
16418 case Intrinsic::aarch64_stlxr:
16419 case Intrinsic::aarch64_stxr: {
16420 Type *ValTy = I.getParamElementType(1);
16421    Info.opc = ISD::INTRINSIC_W_CHAIN;
16422    Info.memVT = MVT::getVT(ValTy);
16423 Info.ptrVal = I.getArgOperand(1);
16424 Info.offset = 0;
16425 Info.align = DL.getABITypeAlign(ValTy);
16426    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16427    return true;
16428 }
16429 case Intrinsic::aarch64_ldaxp:
16430 case Intrinsic::aarch64_ldxp:
16431    Info.opc = ISD::INTRINSIC_W_CHAIN;
16432    Info.memVT = MVT::i128;
16433 Info.ptrVal = I.getArgOperand(0);
16434 Info.offset = 0;
16435 Info.align = Align(16);
16436    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16437    return true;
16438 case Intrinsic::aarch64_stlxp:
16439 case Intrinsic::aarch64_stxp:
16440    Info.opc = ISD::INTRINSIC_W_CHAIN;
16441    Info.memVT = MVT::i128;
16442 Info.ptrVal = I.getArgOperand(2);
16443 Info.offset = 0;
16444 Info.align = Align(16);
16445    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16446    return true;
16447 case Intrinsic::aarch64_sve_ldnt1: {
16448 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
16449    Info.opc = ISD::INTRINSIC_W_CHAIN;
16450    Info.memVT = MVT::getVT(I.getType());
16451 Info.ptrVal = I.getArgOperand(1);
16452 Info.offset = 0;
16453 Info.align = DL.getABITypeAlign(ElTy);
16454    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
16455    return true;
16456 }
16457 case Intrinsic::aarch64_sve_stnt1: {
16458 Type *ElTy =
16459 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
16460    Info.opc = ISD::INTRINSIC_W_CHAIN;
16461    Info.memVT = MVT::getVT(I.getOperand(0)->getType());
16462 Info.ptrVal = I.getArgOperand(2);
16463 Info.offset = 0;
16464 Info.align = DL.getABITypeAlign(ElTy);
16465    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
16466    return true;
16467 }
16468 case Intrinsic::aarch64_mops_memset_tag: {
16469 Value *Dst = I.getArgOperand(0);
16470 Value *Val = I.getArgOperand(1);
16471    Info.opc = ISD::INTRINSIC_W_CHAIN;
16472    Info.memVT = MVT::getVT(Val->getType());
16473 Info.ptrVal = Dst;
16474 Info.offset = 0;
16475 Info.align = I.getParamAlign(0).valueOrOne();
16476    Info.flags = MachineMemOperand::MOStore;
16477    // The size of the memory being operated on is unknown at this point
16479 return true;
16480 }
16481 default:
16482 break;
16483 }
16484
16485 return false;
16486}
16487
16488bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
16489                                                  ISD::LoadExtType ExtTy,
16490 EVT NewVT) const {
16491 // TODO: This may be worth removing. Check regression tests for diffs.
16492 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
16493 return false;
16494
16495 // If we're reducing the load width in order to avoid having to use an extra
16496 // instruction to do extension then it's probably a good idea.
16497 if (ExtTy != ISD::NON_EXTLOAD)
16498 return true;
16499 // Don't reduce load width if it would prevent us from combining a shift into
16500 // the offset.
16501 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
16502 assert(Mem);
16503 const SDValue &Base = Mem->getBasePtr();
16504 if (Base.getOpcode() == ISD::ADD &&
16505 Base.getOperand(1).getOpcode() == ISD::SHL &&
16506 Base.getOperand(1).hasOneUse() &&
16507 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
16508 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
16509 if (Mem->getMemoryVT().isScalableVector())
16510 return false;
16511 // The shift can be combined if it matches the size of the value being
16512 // loaded (and so reducing the width would make it not match).
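    // For example, a 64-bit load from add(base, shl(index, 3)) can fold the
    // shift into a register-offset addressing mode (ldr xN, [base, index, lsl #3]);
    // narrowing the load would leave the shift unfolded.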
16513 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
16514 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
16515 if (ShiftAmount == Log2_32(LoadBytes))
16516 return false;
16517 }
16518 // We have no reason to disallow reducing the load width, so allow it.
16519 return true;
16520}
16521
16522// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
16523bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
16524  EVT VT = Extend.getValueType();
16525 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
16526 SDValue Extract = Extend.getOperand(0);
16527 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
16528 Extract = Extract.getOperand(0);
16529 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
16530 EVT VecVT = Extract.getOperand(0).getValueType();
16531 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
16532 return false;
16533 }
16534 }
16535 return true;
16536}
16537
16538// Truncations from 64-bit GPR to 32-bit GPR are free.
16539bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
16540  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
16541 return false;
16542 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
16543 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
16544 return NumBits1 > NumBits2;
16545}
16546bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
16547  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
16548 return false;
16549 uint64_t NumBits1 = VT1.getFixedSizeInBits();
16550 uint64_t NumBits2 = VT2.getFixedSizeInBits();
16551 return NumBits1 > NumBits2;
16552}
16553
16554/// Check if it is profitable to hoist an instruction in then/else to if.
16555/// Not profitable if I and its user can form an FMA instruction
16556/// because we prefer FMSUB/FMADD.
16557bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
16558  if (I->getOpcode() != Instruction::FMul)
16559 return true;
16560
16561 if (!I->hasOneUse())
16562 return true;
16563
16564 Instruction *User = I->user_back();
16565
16566 if (!(User->getOpcode() == Instruction::FSub ||
16567 User->getOpcode() == Instruction::FAdd))
16568 return true;
16569
16570  const TargetOptions &Options = getTargetMachine().Options;
16571  const Function *F = I->getFunction();
16572 const DataLayout &DL = F->getDataLayout();
16573 Type *Ty = User->getOperand(0)->getType();
16574
16575 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
16576           isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
16577           (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16578 Options.UnsafeFPMath));
16579}
16580
16581// All 32-bit GPR operations implicitly zero the high-half of the corresponding
16582// 64-bit GPR.
16583bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
16584  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
16585 return false;
16586 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
16587 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
16588 return NumBits1 == 32 && NumBits2 == 64;
16589}
16590bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
16591  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
16592 return false;
16593 unsigned NumBits1 = VT1.getSizeInBits();
16594 unsigned NumBits2 = VT2.getSizeInBits();
16595 return NumBits1 == 32 && NumBits2 == 64;
16596}
16597
16598bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
16599  EVT VT1 = Val.getValueType();
16600 if (isZExtFree(VT1, VT2)) {
16601 return true;
16602 }
16603
16604 if (Val.getOpcode() != ISD::LOAD)
16605 return false;
16606
16607 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
16608 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
16609 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
16610 VT1.getSizeInBits() <= 32);
16611}
16612
16613bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
16614 if (isa<FPExtInst>(Ext))
16615 return false;
16616
16617 // Vector types are not free.
16618 if (Ext->getType()->isVectorTy())
16619 return false;
16620
16621 for (const Use &U : Ext->uses()) {
16622 // The extension is free if we can fold it with a left shift in an
16623 // addressing mode or an arithmetic operation: add, sub, and cmp.
16624
16625 // Is there a shift?
16626 const Instruction *Instr = cast<Instruction>(U.getUser());
16627
16628 // Is this a constant shift?
16629 switch (Instr->getOpcode()) {
16630 case Instruction::Shl:
16631 if (!isa<ConstantInt>(Instr->getOperand(1)))
16632 return false;
16633 break;
16634 case Instruction::GetElementPtr: {
16635 gep_type_iterator GTI = gep_type_begin(Instr);
16636 auto &DL = Ext->getDataLayout();
16637 std::advance(GTI, U.getOperandNo()-1);
16638 Type *IdxTy = GTI.getIndexedType();
16639 // This extension will end up with a shift because of the scaling factor.
16640 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
16641 // Get the shift amount based on the scaling factor:
16642 // log2(sizeof(IdxTy)) - log2(8).
16643 if (IdxTy->isScalableTy())
16644 return false;
16645 uint64_t ShiftAmt =
16646 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
16647 3;
16648 // Is the constant foldable in the shift of the addressing mode?
16649 // I.e., shift amount is between 1 and 4 inclusive.
16650 if (ShiftAmt == 0 || ShiftAmt > 4)
16651 return false;
16652 break;
16653 }
16654 case Instruction::Trunc:
16655 // Check if this is a noop.
16656 // trunc(sext ty1 to ty2) to ty1.
16657 if (Instr->getType() == Ext->getOperand(0)->getType())
16658 continue;
16659 [[fallthrough]];
16660 default:
16661 return false;
16662 }
16663
16664 // At this point we can use the bfm family, so this extension is free
16665 // for that use.
16666 }
16667 return true;
16668}
16669
16670static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
16671 unsigned NumElts, bool IsLittleEndian,
16672 SmallVectorImpl<int> &Mask) {
16673 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
16674 return false;
16675
16676 assert(DstWidth % SrcWidth == 0 &&
16677 "TBL lowering is not supported for a conversion instruction with this "
16678 "source and destination element type.");
16679
16680 unsigned Factor = DstWidth / SrcWidth;
16681 unsigned MaskLen = NumElts * Factor;
16682
16683 Mask.clear();
16684 Mask.resize(MaskLen, NumElts);
16685
16686 unsigned SrcIndex = 0;
16687 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
16688 Mask[I] = SrcIndex++;
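  // For example, SrcWidth = 8, DstWidth = 32, NumElts = 8 (little-endian) gives
  // Factor = 4 and a mask of {0,8,8,8, 1,8,8,8, ...}; index NumElts (8) selects
  // the zero element that callers insert as lane 0 of the second shuffle operand.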
16689
16690 return true;
16691}
16692
16693static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
16694                                      FixedVectorType *ZExtTy,
16695 FixedVectorType *DstTy,
16696 bool IsLittleEndian) {
16697 auto *SrcTy = cast<FixedVectorType>(Op->getType());
16698 unsigned NumElts = SrcTy->getNumElements();
16699 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16700 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16701
16702 SmallVector<int> Mask;
16703 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
16704 return nullptr;
16705
16706 auto *FirstEltZero = Builder.CreateInsertElement(
16707 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
16708 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
16709 Result = Builder.CreateBitCast(Result, DstTy);
16710 if (DstTy != ZExtTy)
16711 Result = Builder.CreateZExt(Result, ZExtTy);
16712 return Result;
16713}
16714
16716 FixedVectorType *DstTy,
16717 bool IsLittleEndian) {
16718 auto *SrcTy = cast<FixedVectorType>(Op->getType());
16719 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16720 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16721
16722 SmallVector<int> Mask;
16723 if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
16724 !IsLittleEndian, Mask))
16725 return nullptr;
16726
16727 auto *FirstEltZero = Builder.CreateInsertElement(
16728 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
16729
16730 return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
16731}
16732
16733static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
16734 IRBuilder<> Builder(TI);
16736 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
16737 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
16738 auto *DstTy = cast<FixedVectorType>(TI->getType());
16739 assert(SrcTy->getElementType()->isIntegerTy() &&
16740 "Non-integer type source vector element is not supported");
16741 assert(DstTy->getElementType()->isIntegerTy(8) &&
16742 "Unsupported destination vector element type");
16743 unsigned SrcElemTySz =
16744 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16745 unsigned DstElemTySz =
16746 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16747 assert((SrcElemTySz % DstElemTySz == 0) &&
16748 "Cannot lower truncate to tbl instructions for a source element size "
16749 "that is not divisible by the destination element size");
16750 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
16751 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
16752 "Unsupported source vector element type size");
16753 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
16754
16755 // Create a mask to choose every nth byte from the source vector table of
16756 // bytes to create the truncated destination vector, where 'n' is the truncate
16757 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose bytes
16758 // 0, 8, 16, ..., (Y-1)*8 for the little-endian format.
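// For example (illustrative): truncating <8 x i64> to <8 x i8> on a
// little-endian target gives TruncFactor == 8, so the mask selects bytes
// 0, 8, 16, ..., 56 and fills the remaining eight entries with 255.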
16760 for (int Itr = 0; Itr < 16; Itr++) {
16761 if (Itr < NumElements)
16762 MaskConst.push_back(Builder.getInt8(
16763 IsLittleEndian ? Itr * TruncFactor
16764 : Itr * TruncFactor + (TruncFactor - 1)));
16765 else
16766 MaskConst.push_back(Builder.getInt8(255));
16767 }
16768
16769 int MaxTblSz = 128 * 4;
16770 int MaxSrcSz = SrcElemTySz * NumElements;
16771 int ElemsPerTbl =
16772 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
16773 assert(ElemsPerTbl <= 16 &&
16774 "Maximum elements selected using TBL instruction cannot exceed 16!");
16775
16776 int ShuffleCount = 128 / SrcElemTySz;
16777 SmallVector<int> ShuffleLanes;
16778 for (int i = 0; i < ShuffleCount; ++i)
16779 ShuffleLanes.push_back(i);
16780
16781 // Create TBL's table of bytes in 1, 2, 3 or 4 FP/SIMD registers using shuffles
16782 // over the source vector. If TBL's maximum of 4 FP/SIMD registers is saturated,
16783 // call TBL & save the result in a vector of TBL results for combining later.
16785 while (ShuffleLanes.back() < NumElements) {
16786 Parts.push_back(Builder.CreateBitCast(
16787 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
16788
16789 if (Parts.size() == 4) {
16790 Parts.push_back(ConstantVector::get(MaskConst));
16791 Results.push_back(
16792 Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
16793 Parts.clear();
16794 }
16795
16796 for (int i = 0; i < ShuffleCount; ++i)
16797 ShuffleLanes[i] += ShuffleCount;
16798 }
16799
16800 assert((Parts.empty() || Results.empty()) &&
16801 "Lowering trunc for vectors requiring different TBL instructions is "
16802 "not supported!");
16803 // Call TBL for the residual table bytes present in 1, 2, or 3 FP/SIMD
16804 // registers.
16805 if (!Parts.empty()) {
16806 Intrinsic::ID TblID;
16807 switch (Parts.size()) {
16808 case 1:
16809 TblID = Intrinsic::aarch64_neon_tbl1;
16810 break;
16811 case 2:
16812 TblID = Intrinsic::aarch64_neon_tbl2;
16813 break;
16814 case 3:
16815 TblID = Intrinsic::aarch64_neon_tbl3;
16816 break;
16817 }
16818
16819 Parts.push_back(ConstantVector::get(MaskConst));
16820 Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
16821 }
16822
16823 // Extract the destination vector from TBL result(s) after combining them
16824 // where applicable. Currently, at most two TBLs are supported.
16825 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
16826 "more than 2 tbl instructions!");
16827 Value *FinalResult = Results[0];
16828 if (Results.size() == 1) {
16829 if (ElemsPerTbl < 16) {
16830 SmallVector<int> FinalMask(ElemsPerTbl);
16831 std::iota(FinalMask.begin(), FinalMask.end(), 0);
16832 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
16833 }
16834 } else {
16835 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
16836 if (ElemsPerTbl < 16) {
16837 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
16838 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
16839 } else {
16840 std::iota(FinalMask.begin(), FinalMask.end(), 0);
16841 }
16842 FinalResult =
16843 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
16844 }
16845
16846 TI->replaceAllUsesWith(FinalResult);
16847 TI->eraseFromParent();
16848}
16849
16851 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
16852 // shuffle_vector instructions are serialized when targeting SVE,
16853 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
16854 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
16855 return false;
16856
16857 // Try to optimize conversions using tbl. This requires materializing constant
16858 // index vectors, which can increase code size and add loads. Skip the
16859 // transform unless the conversion is in a loop block guaranteed to execute
16860 // and we are not optimizing for size.
16861 Function *F = I->getParent()->getParent();
16862 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
16863 F->hasOptSize())
16864 return false;
16865
16866 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
16867 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
16868 if (!SrcTy || !DstTy)
16869 return false;
16870
16871 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
16872 // lowered to tbl instructions to insert the original i8 elements
16873 // into i8x lanes. This is enabled for cases where it is beneficial.
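// A rough sketch of the resulting IR (illustrative, assuming little endian):
//   zext <8 x i8> %x to <8 x i32>
// becomes a shufflevector of %x and a vector whose element 0 is zero, with
// mask <0,8,8,8, 1,8,8,8, ..., 7,8,8,8>, followed by a bitcast of the
// resulting <32 x i8> to <8 x i32>.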
16874 auto *ZExt = dyn_cast<ZExtInst>(I);
16875 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
16876 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
16877 if (DstWidth % 8 != 0)
16878 return false;
16879
16880 auto *TruncDstType =
16881 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
16882 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
16883 // the remaining ZExt folded into the user, don't use tbl lowering.
16884 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
16885 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
16888 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
16889 return false;
16890
16891 DstTy = TruncDstType;
16892 }
16893
16894 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
16895 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
16896 // most one extra extend step is needed and using tbl is not profitable.
16897 if (SrcWidth * 4 <= DstWidth && I->hasOneUser()) {
16898 auto *SingleUser = cast<Instruction>(*I->user_begin());
16899 if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
16900 return false;
16901 }
16902
16903 if (DstTy->getScalarSizeInBits() >= 64)
16904 return false;
16905
16906 IRBuilder<> Builder(ZExt);
16908 Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
16909 DstTy, Subtarget->isLittleEndian());
16910 if (!Result)
16911 return false;
16912 ZExt->replaceAllUsesWith(Result);
16913 ZExt->eraseFromParent();
16914 return true;
16915 }
16916
16917 auto *UIToFP = dyn_cast<UIToFPInst>(I);
16918 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
16919 DstTy->getElementType()->isFloatTy()) ||
16920 (SrcTy->getElementType()->isIntegerTy(16) &&
16921 DstTy->getElementType()->isDoubleTy()))) {
16922 IRBuilder<> Builder(I);
16924 Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
16925 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
16926 assert(ZExt && "Cannot fail for the i8 to float conversion");
16927 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
16928 I->replaceAllUsesWith(UI);
16929 I->eraseFromParent();
16930 return true;
16931 }
16932
16933 auto *SIToFP = dyn_cast<SIToFPInst>(I);
16934 if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
16935 DstTy->getElementType()->isFloatTy()) {
16936 IRBuilder<> Builder(I);
16937 auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
16939 Subtarget->isLittleEndian());
16940 assert(Shuffle && "Cannot fail for the i8 to float conversion");
16941 auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
16942 auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
16943 auto *SI = Builder.CreateSIToFP(AShr, DstTy);
16944 I->replaceAllUsesWith(SI);
16945 I->eraseFromParent();
16946 return true;
16947 }
16948
16949 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
16950 // followed by a truncate lowered using tbl.4.
16951 auto *FPToUI = dyn_cast<FPToUIInst>(I);
16952 if (FPToUI &&
16953 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
16954 SrcTy->getElementType()->isFloatTy() &&
16955 DstTy->getElementType()->isIntegerTy(8)) {
16956 IRBuilder<> Builder(I);
16957 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
16958 VectorType::getInteger(SrcTy));
16959 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
16960 I->replaceAllUsesWith(TruncI);
16961 I->eraseFromParent();
16962 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
16963 return true;
16964 }
16965
16966 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
16967 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
16968 // per lane of the input that is represented using 1, 2, 3 or 4 128-bit table
16969 // registers.
16970 auto *TI = dyn_cast<TruncInst>(I);
16971 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
16972 ((SrcTy->getElementType()->isIntegerTy(32) ||
16973 SrcTy->getElementType()->isIntegerTy(64)) &&
16974 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
16975 createTblForTrunc(TI, Subtarget->isLittleEndian());
16976 return true;
16977 }
16978
16979 return false;
16980}
16981
16983 Align &RequiredAlignment) const {
16984 if (!LoadedType.isSimple() ||
16985 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
16986 return false;
16987 // Cyclone supports unaligned accesses.
16988 RequiredAlignment = Align(1);
16989 unsigned NumBits = LoadedType.getSizeInBits();
16990 return NumBits == 32 || NumBits == 64;
16991}
16992
16993/// A helper function for determining the number of interleaved accesses we
16994/// will generate when lowering accesses of the given type.
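/// For example (illustrative): a fixed <16 x i32> access lowered with 128-bit
/// NEON vectors yields (16 * 32 + 127) / 128 = 4 interleaved accesses.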
16996 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
16997 unsigned VecSize = 128;
16998 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
16999 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
17000 if (UseScalable && isa<FixedVectorType>(VecTy))
17001 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17002 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
17003}
17004
17007 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
17008 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
17009 return MOStridedAccess;
17011}
17012
17014 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
17015 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17016 auto EC = VecTy->getElementCount();
17017 unsigned MinElts = EC.getKnownMinValue();
17018
17019 UseScalable = false;
17020
17021 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
17022 (!Subtarget->useSVEForFixedLengthVectors() ||
17024 return false;
17025
17026 if (isa<ScalableVectorType>(VecTy) &&
17027 !Subtarget->isSVEorStreamingSVEAvailable())
17028 return false;
17029
17030 // Ensure the number of vector elements is greater than 1.
17031 if (MinElts < 2)
17032 return false;
17033
17034 // Ensure the element type is legal.
17035 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
17036 return false;
17037
17038 if (EC.isScalable()) {
17039 UseScalable = true;
17040 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
17041 }
17042
17043 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
17044 if (Subtarget->useSVEForFixedLengthVectors()) {
17045 unsigned MinSVEVectorSize =
17046 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17047 if (VecSize % MinSVEVectorSize == 0 ||
17048 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
17049 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
17050 UseScalable = true;
17051 return true;
17052 }
17053 }
17054
17055 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
17056 // 128 will be split into multiple interleaved accesses.
17057 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
17058}
17059
17061 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
17062 return ScalableVectorType::get(VTy->getElementType(), 2);
17063
17064 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
17065 return ScalableVectorType::get(VTy->getElementType(), 4);
17066
17067 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
17068 return ScalableVectorType::get(VTy->getElementType(), 8);
17069
17070 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
17071 return ScalableVectorType::get(VTy->getElementType(), 8);
17072
17073 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
17074 return ScalableVectorType::get(VTy->getElementType(), 2);
17075
17076 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
17077 return ScalableVectorType::get(VTy->getElementType(), 4);
17078
17079 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
17080 return ScalableVectorType::get(VTy->getElementType(), 8);
17081
17082 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
17083 return ScalableVectorType::get(VTy->getElementType(), 16);
17084
17085 llvm_unreachable("Cannot handle input vector type");
17086}
17087
17088static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
17089 bool Scalable, Type *LDVTy,
17090 Type *PtrTy) {
17091 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17092 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
17093 Intrinsic::aarch64_sve_ld3_sret,
17094 Intrinsic::aarch64_sve_ld4_sret};
17095 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
17096 Intrinsic::aarch64_neon_ld3,
17097 Intrinsic::aarch64_neon_ld4};
17098 if (Scalable)
17099 return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy});
17100
17101 return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
17102 {LDVTy, PtrTy});
17103}
17104
17105static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
17106 bool Scalable, Type *STVTy,
17107 Type *PtrTy) {
17108 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17109 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
17110 Intrinsic::aarch64_sve_st3,
17111 Intrinsic::aarch64_sve_st4};
17112 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
17113 Intrinsic::aarch64_neon_st3,
17114 Intrinsic::aarch64_neon_st4};
17115 if (Scalable)
17116 return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy});
17117
17118 return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
17119 {STVTy, PtrTy});
17120}
17121
17122/// Lower an interleaved load into a ldN intrinsic.
17123///
17124/// E.g. Lower an interleaved load (Factor = 2):
17125/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
17126/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
17127/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
17128///
17129/// Into:
17130/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
17131/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
17132/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
17135 ArrayRef<unsigned> Indices, unsigned Factor) const {
17136 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17137 "Invalid interleave factor");
17138 assert(!Shuffles.empty() && "Empty shufflevector input");
17139 assert(Shuffles.size() == Indices.size() &&
17140 "Unmatched number of shufflevectors and indices");
17141
17142 const DataLayout &DL = LI->getDataLayout();
17143
17144 VectorType *VTy = Shuffles[0]->getType();
17145
17146 // Skip if we do not have NEON and skip illegal vector types. We can
17147 // "legalize" wide vector types into multiple interleaved accesses as long as
17148 // the vector types are divisible by 128.
17149 bool UseScalable;
17150 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17151 return false;
17152
17153 // Check if the interleave is a zext(shuffle) that can be better optimized
17154 // into shift/and masks. For the moment we do this just for uitofp (not
17155 // zext) to avoid issues with widening instructions.
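// E.g. (illustrative): a <16 x i8> load deinterleaved into four <4 x i8>
// shuffles that each feed a uitofp to <4 x float> can be handled with a wide
// load plus shifts and masks, so bail out here.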
17156 if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
17157 return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
17158 SI->getType()->getScalarSizeInBits() * 4 ==
17159 SI->user_back()->getType()->getScalarSizeInBits();
17160 }))
17161 return false;
17162
17163 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17164
17165 auto *FVTy = cast<FixedVectorType>(VTy);
17166
17167 // A pointer vector cannot be the return type of the ldN intrinsics. Need to
17168 // load integer vectors first and then convert to pointer vectors.
17169 Type *EltTy = FVTy->getElementType();
17170 if (EltTy->isPointerTy())
17171 FVTy =
17172 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
17173
17174 // If we're going to generate more than one load, reset the sub-vector type
17175 // to something legal.
17176 FVTy = FixedVectorType::get(FVTy->getElementType(),
17177 FVTy->getNumElements() / NumLoads);
17178
17179 auto *LDVTy =
17180 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
17181
17182 IRBuilder<> Builder(LI);
17183
17184 // The base address of the load.
17185 Value *BaseAddr = LI->getPointerOperand();
17186
17187 Type *PtrTy = LI->getPointerOperandType();
17188 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
17189 LDVTy->getElementCount());
17190
17191 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
17192 UseScalable, LDVTy, PtrTy);
17193
17194 // Holds sub-vectors extracted from the load intrinsic return values. The
17195 // sub-vectors are associated with the shufflevector instructions they will
17196 // replace.
17198
17199 Value *PTrue = nullptr;
17200 if (UseScalable) {
17201 std::optional<unsigned> PgPattern =
17202 getSVEPredPatternFromNumElements(FVTy->getNumElements());
17203 if (Subtarget->getMinSVEVectorSizeInBits() ==
17204 Subtarget->getMaxSVEVectorSizeInBits() &&
17205 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
17206 PgPattern = AArch64SVEPredPattern::all;
17207
17208 auto *PTruePat =
17209 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
17210 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17211 {PTruePat});
17212 }
17213
17214 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
17215
17216 // If we're generating more than one load, compute the base address of
17217 // subsequent loads as an offset from the previous.
17218 if (LoadCount > 0)
17219 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
17220 FVTy->getNumElements() * Factor);
17221
17222 CallInst *LdN;
17223 if (UseScalable)
17224 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
17225 else
17226 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17227
17228 // Extract and store the sub-vectors returned by the load intrinsic.
17229 for (unsigned i = 0; i < Shuffles.size(); i++) {
17230 ShuffleVectorInst *SVI = Shuffles[i];
17231 unsigned Index = Indices[i];
17232
17233 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
17234
17235 if (UseScalable)
17236 SubVec = Builder.CreateExtractVector(
17237 FVTy, SubVec,
17238 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
17239
17240 // Convert the integer vector to pointer vector if the element is pointer.
17241 if (EltTy->isPointerTy())
17242 SubVec = Builder.CreateIntToPtr(
17244 FVTy->getNumElements()));
17245
17246 SubVecs[SVI].push_back(SubVec);
17247 }
17248 }
17249
17250 // Replace uses of the shufflevector instructions with the sub-vectors
17251 // returned by the load intrinsic. If a shufflevector instruction is
17252 // associated with more than one sub-vector, those sub-vectors will be
17253 // concatenated into a single wide vector.
17254 for (ShuffleVectorInst *SVI : Shuffles) {
17255 auto &SubVec = SubVecs[SVI];
17256 auto *WideVec =
17257 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
17258 SVI->replaceAllUsesWith(WideVec);
17259 }
17260
17261 return true;
17262}
17263
17264template <typename Iter>
17265bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
17266 int MaxLookupDist = 20;
17267 unsigned IdxWidth = DL.getIndexSizeInBits(0);
17268 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
17269 const Value *PtrA1 =
17270 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
17271
17272 while (++It != End) {
17273 if (It->isDebugOrPseudoInst())
17274 continue;
17275 if (MaxLookupDist-- == 0)
17276 break;
17277 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
17278 const Value *PtrB1 =
17279 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
17280 DL, OffsetB);
17281 if (PtrA1 == PtrB1 &&
17282 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
17283 .abs() == 16)
17284 return true;
17285 }
17286 }
17287
17288 return false;
17289}
17290
17291/// Lower an interleaved store into a stN intrinsic.
17292///
17293/// E.g. Lower an interleaved store (Factor = 3):
17294/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
17295/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
17296/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17297///
17298/// Into:
17299/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
17300/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
17301/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
17302/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17303///
17304/// Note that the new shufflevectors will be removed and we'll only generate one
17305/// st3 instruction in CodeGen.
17306///
17307/// Example for a more general valid mask (Factor 3). Lower:
17308/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
17309/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
17310/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17311///
17312/// Into:
17313/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
17314/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
17315/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
17316/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17318 ShuffleVectorInst *SVI,
17319 unsigned Factor) const {
17320
17321 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17322 "Invalid interleave factor");
17323
17324 auto *VecTy = cast<FixedVectorType>(SVI->getType());
17325 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
17326
17327 unsigned LaneLen = VecTy->getNumElements() / Factor;
17328 Type *EltTy = VecTy->getElementType();
17329 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
17330
17331 const DataLayout &DL = SI->getDataLayout();
17332 bool UseScalable;
17333
17334 // Skip if we do not have NEON and skip illegal vector types. We can
17335 // "legalize" wide vector types into multiple interleaved accesses as long as
17336 // the vector types are divisible by 128.
17337 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
17338 return false;
17339
17340 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
17341
17342 Value *Op0 = SVI->getOperand(0);
17343 Value *Op1 = SVI->getOperand(1);
17344 IRBuilder<> Builder(SI);
17345
17346 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
17347 // vectors to integer vectors.
17348 if (EltTy->isPointerTy()) {
17349 Type *IntTy = DL.getIntPtrType(EltTy);
17350 unsigned NumOpElts =
17351 cast<FixedVectorType>(Op0->getType())->getNumElements();
17352
17353 // Convert to the corresponding integer vector.
17354 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
17355 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
17356 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
17357
17358 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
17359 }
17360
17361 // If we're going to generate more than one store, reset the lane length
17362 // and sub-vector type to something legal.
17363 LaneLen /= NumStores;
17364 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
17365
17366 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
17367 : SubVecTy;
17368
17369 // The base address of the store.
17370 Value *BaseAddr = SI->getPointerOperand();
17371
17372 auto Mask = SVI->getShuffleMask();
17373
17374 // Sanity check: bail out if none of the indices are in range.
17375 // If the mask is `poison`, `Mask` may be a vector of -1s; if all of them
17376 // are `poison`, an out-of-bounds read would happen later.
17377 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
17378 return false;
17379 }
17380 // A 64-bit st2 which does not start at element 0 will involve adding extra
17381 // ext elements, making the st2 unprofitable, and if there is a nearby store
17382 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
17383 // zip;ldp pair which has higher throughput.
17384 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
17385 (Mask[0] != 0 ||
17386 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
17387 DL) ||
17388 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
17389 BaseAddr, DL)))
17390 return false;
17391
17392 Type *PtrTy = SI->getPointerOperandType();
17393 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
17394 STVTy->getElementCount());
17395
17396 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
17397 UseScalable, STVTy, PtrTy);
17398
17399 Value *PTrue = nullptr;
17400 if (UseScalable) {
17401 std::optional<unsigned> PgPattern =
17402 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
17403 if (Subtarget->getMinSVEVectorSizeInBits() ==
17404 Subtarget->getMaxSVEVectorSizeInBits() &&
17405 Subtarget->getMinSVEVectorSizeInBits() ==
17406 DL.getTypeSizeInBits(SubVecTy))
17407 PgPattern = AArch64SVEPredPattern::all;
17408
17409 auto *PTruePat =
17410 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
17411 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17412 {PTruePat});
17413 }
17414
17415 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
17416
17418
17419 // Split the shufflevector operands into sub vectors for the new stN call.
17420 for (unsigned i = 0; i < Factor; i++) {
17421 Value *Shuffle;
17422 unsigned IdxI = StoreCount * LaneLen * Factor + i;
17423 if (Mask[IdxI] >= 0) {
17424 Shuffle = Builder.CreateShuffleVector(
17425 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
17426 } else {
17427 unsigned StartMask = 0;
17428 for (unsigned j = 1; j < LaneLen; j++) {
17429 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
17430 if (Mask[IdxJ] >= 0) {
17431 StartMask = Mask[IdxJ] - j;
17432 break;
17433 }
17434 }
17435 // Note: Filling undef gaps with random elements is ok, since
17436 // those elements were being written anyway (with undefs).
17437 // In the case of all undefs we're defaulting to using elems from 0.
17438 // Note: StartMask cannot be negative; it's checked in
17439 // isReInterleaveMask.
17440 Shuffle = Builder.CreateShuffleVector(
17441 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
17442 }
17443
17444 if (UseScalable)
17445 Shuffle = Builder.CreateInsertVector(
17446 STVTy, UndefValue::get(STVTy), Shuffle,
17447 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
17448
17449 Ops.push_back(Shuffle);
17450 }
17451
17452 if (UseScalable)
17453 Ops.push_back(PTrue);
17454
17455 // If we're generating more than one store, compute the base address of
17456 // subsequent stores as an offset from the previous.
17457 if (StoreCount > 0)
17458 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
17459 BaseAddr, LaneLen * Factor);
17460
17461 Ops.push_back(BaseAddr);
17462 Builder.CreateCall(StNFunc, Ops);
17463 }
17464 return true;
17465}
17466
17468 LoadInst *LI, ArrayRef<Value *> DeinterleavedValues) const {
17469 unsigned Factor = DeinterleavedValues.size();
17470 if (Factor != 2 && Factor != 4) {
17471 LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
17472 return false;
17473 }
17474
17475 VectorType *VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
17476
17477 const DataLayout &DL = LI->getModule()->getDataLayout();
17478 bool UseScalable;
17479 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17480 return false;
17481
17482 // TODO: Add support for using SVE instructions with fixed types later, using
17483 // the code from lowerInterleavedLoad to obtain the correct container type.
17484 if (UseScalable && !VTy->isScalableTy())
17485 return false;
17486
17487 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17488 VectorType *LdTy =
17490 VTy->getElementCount().divideCoefficientBy(NumLoads));
17491
17492 Type *PtrTy = LI->getPointerOperandType();
17493 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
17494 UseScalable, LdTy, PtrTy);
17495
17496 IRBuilder<> Builder(LI);
17497 Value *Pred = nullptr;
17498 if (UseScalable)
17499 Pred =
17500 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
17501
17502 Value *BaseAddr = LI->getPointerOperand();
17503 if (NumLoads > 1) {
17504 // Create multiple legal small ldN.
17505 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
17506 for (unsigned I = 0; I < NumLoads; ++I) {
17507 Value *Offset = Builder.getInt64(I * Factor);
17508
17509 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
17510 Value *LdN = nullptr;
17511 if (UseScalable)
17512 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
17513 else
17514 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
17515 Value *Idx =
17516 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
17517 for (unsigned J = 0; J < Factor; ++J) {
17518 ExtractedLdValues[J] = Builder.CreateInsertVector(
17519 VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
17520 }
17521 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
17522 }
17523 // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
17524 for (unsigned J = 0; J < Factor; ++J)
17525 DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
17526 } else {
17527 Value *Result;
17528 if (UseScalable)
17529 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
17530 else
17531 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17532 // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
17533 for (unsigned I = 0; I < Factor; I++) {
17534 Value *NewExtract = Builder.CreateExtractValue(Result, I);
17535 DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
17536 }
17537 }
17538 return true;
17539}
17540
17542 StoreInst *SI, ArrayRef<Value *> InterleavedValues) const {
17543 unsigned Factor = InterleavedValues.size();
17544 if (Factor != 2 && Factor != 4) {
17545 LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
17546 return false;
17547 }
17548
17549 VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
17550 const DataLayout &DL = SI->getModule()->getDataLayout();
17551
17552 bool UseScalable;
17553 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17554 return false;
17555
17556 // TODO: Add support for using SVE instructions with fixed types later, using
17557 // the code from lowerInterleavedStore to obtain the correct container type.
17558 if (UseScalable && !VTy->isScalableTy())
17559 return false;
17560
17561 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
17562
17563 VectorType *StTy =
17565 VTy->getElementCount().divideCoefficientBy(NumStores));
17566
17567 Type *PtrTy = SI->getPointerOperandType();
17568 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
17569 UseScalable, StTy, PtrTy);
17570
17571 IRBuilder<> Builder(SI);
17572
17573 Value *BaseAddr = SI->getPointerOperand();
17574 Value *Pred = nullptr;
17575
17576 if (UseScalable)
17577 Pred =
17578 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
17579
17580 auto ExtractedValues = InterleavedValues;
17581 SmallVector<Value *, 4> StoreOperands(InterleavedValues.begin(),
17582 InterleavedValues.end());
17583 if (UseScalable)
17584 StoreOperands.push_back(Pred);
17585 StoreOperands.push_back(BaseAddr);
17586 for (unsigned I = 0; I < NumStores; ++I) {
17587 Value *Address = BaseAddr;
17588 if (NumStores > 1) {
17589 Value *Offset = Builder.getInt64(I * Factor);
17590 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
17591 Value *Idx =
17592 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
17593 for (unsigned J = 0; J < Factor; J++) {
17594 StoreOperands[J] =
17595 Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
17596 }
17597 // update the address
17598 StoreOperands[StoreOperands.size() - 1] = Address;
17599 }
17600 Builder.CreateCall(StNFunc, StoreOperands);
17601 }
17602 return true;
17603}
17604
17606 const MemOp &Op, const AttributeList &FuncAttributes) const {
17607 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
17608 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17609 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
17610 // Only use AdvSIMD to implement memset of 32 bytes and above. Below that, it
17611 // would take one instruction to materialize the v2i64 zero and one store (with
17612 // a restrictive addressing mode), so just do i64 stores.
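// For example (illustrative): a 64-byte memset of a 16-byte-aligned buffer can
// use v16i8 stores, while a 16-byte memset (IsSmallMemset) falls through to
// i64 stores.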
17613 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17614 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17615 if (Op.isAligned(AlignCheck))
17616 return true;
17617 unsigned Fast;
17618 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
17620 Fast;
17621 };
17622
17623 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17624 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
17625 return MVT::v16i8;
17626 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17627 return MVT::f128;
17628 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17629 return MVT::i64;
17630 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17631 return MVT::i32;
17632 return MVT::Other;
17633}
17634
17636 const MemOp &Op, const AttributeList &FuncAttributes) const {
17637 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
17638 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17639 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
17640 // Only use AdvSIMD to implement memset of 32 bytes and above. Below that, it
17641 // would take one instruction to materialize the v2i64 zero and one store (with
17642 // a restrictive addressing mode), so just do i64 stores.
17643 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17644 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17645 if (Op.isAligned(AlignCheck))
17646 return true;
17647 unsigned Fast;
17648 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
17650 Fast;
17651 };
17652
17653 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17654 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
17655 return LLT::fixed_vector(2, 64);
17656 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17657 return LLT::scalar(128);
17658 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17659 return LLT::scalar(64);
17660 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17661 return LLT::scalar(32);
17662 return LLT();
17663}
17664
17665// 12-bit optionally shifted immediates are legal for adds.
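// For example (illustrative): 0xfff and 0xabc000 are legal add immediates (the
// latter via the shifted encoding), while 0xabc001 and 0x1001000 are not.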
17667 if (Immed == std::numeric_limits<int64_t>::min()) {
17668 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
17669 << ": avoid UB for INT64_MIN\n");
17670 return false;
17671 }
17672 // Same encoding for add/sub, just flip the sign.
17673 Immed = std::abs(Immed);
17674 bool IsLegal = ((Immed >> 12) == 0 ||
17675 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
17676 LLVM_DEBUG(dbgs() << "Is " << Immed
17677 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
17678 return IsLegal;
17679}
17680
17682 // We will only emit addvl/inc* instructions for SVE2
17683 if (!Subtarget->hasSVE2())
17684 return false;
17685
17686 // addvl's immediates are in terms of the number of bytes in a register.
17687 // Since there are 16 in the base supported size (128 bits), we need to
17688 // divide the immediate by that much to give us a useful immediate to
17689 // multiply by vscale. We can't have a remainder as a result of this.
17690 if (Imm % 16 == 0)
17691 return isInt<6>(Imm / 16);
17692
17693 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
17694 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
17695 // of addvl as a result, so only take h|w|d into account.
17696 // Dec[h|w|d] will cover subtractions.
17697 // Immediates are in the range [1,16], so we can't do a 2's complement check.
17698 // FIXME: Can we make use of other patterns to cover other immediates?
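// For example (illustrative): Imm == 64 is legal via addvl (64 / 16 == 4 fits
// the signed 6-bit field), Imm == 24 is covered by inch (24 / 8 == 3 <= 16),
// and Imm == 7 is rejected because it is odd.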
17699
17700 // inch|dech
17701 if (Imm % 8 == 0)
17702 return std::abs(Imm / 8) <= 16;
17703 // incw|decw
17704 if (Imm % 4 == 0)
17705 return std::abs(Imm / 4) <= 16;
17706 // incd|decd
17707 if (Imm % 2 == 0)
17708 return std::abs(Imm / 2) <= 16;
17709
17710 return false;
17711}
17712
17713// Return false to prevent folding
17714// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
17715// if the folding leads to worse code.
17717 SDValue AddNode, SDValue ConstNode) const {
17718 // Let the DAGCombiner decide for vector types and large types.
17719 const EVT VT = AddNode.getValueType();
17720 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
17721 return true;
17722
17723 // It is worse if c1 is a legal add immediate while c1*c2 is not and has to
17724 // be composed of at least two instructions.
17725 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
17726 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
17727 const int64_t C1 = C1Node->getSExtValue();
17728 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
17730 return true;
17732 // Adapt to the width of a register.
17733 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
17735 if (Insn.size() > 1)
17736 return false;
17737
17738 // Default to true and let the DAGCombiner decide.
17739 return true;
17740}
17741
17742// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
17743// immediates is the same as for an add or a sub.
17745 return isLegalAddImmediate(Immed);
17746}
17747
17748/// isLegalAddressingMode - Return true if the addressing mode represented
17749/// by AM is legal for this target, for a load/store of the specified type.
17751 const AddrMode &AMode, Type *Ty,
17752 unsigned AS, Instruction *I) const {
17753 // AArch64 has five basic addressing modes:
17754 // reg
17755 // reg + 9-bit signed offset
17756 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
17757 // reg1 + reg2
17758 // reg + SIZE_IN_BYTES * reg
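// For example (illustrative), for 64-bit integer loads these allow forms like:
//   ldr x0, [x1]
//   ldur x0, [x1, #-256]
//   ldr x0, [x1, #4088]
//   ldr x0, [x1, x2]
//   ldr x0, [x1, x2, lsl #3]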
17759
17760 // No global is ever allowed as a base.
17761 if (AMode.BaseGV)
17762 return false;
17763
17764 // No reg+reg+imm addressing.
17765 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
17766 return false;
17767
17768 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
17769 // `2*ScaledReg` into `BaseReg + ScaledReg`
17770 AddrMode AM = AMode;
17771 if (AM.Scale && !AM.HasBaseReg) {
17772 if (AM.Scale == 1) {
17773 AM.HasBaseReg = true;
17774 AM.Scale = 0;
17775 } else if (AM.Scale == 2) {
17776 AM.HasBaseReg = true;
17777 AM.Scale = 1;
17778 } else {
17779 return false;
17780 }
17781 }
17782
17783 // A base register is required in all addressing modes.
17784 if (!AM.HasBaseReg)
17785 return false;
17786
17787 if (Ty->isScalableTy()) {
17788 if (isa<ScalableVectorType>(Ty)) {
17789 // See if we have a foldable vscale-based offset, for vector types which
17790 // are either legal or smaller than the minimum; more work will be
17791 // required if we need to consider addressing for types which need
17792 // legalization by splitting.
17793 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
17794 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
17795 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
17796 isPowerOf2_64(VecNumBytes))
17797 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
17798
17799 uint64_t VecElemNumBytes =
17800 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
17801 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
17802 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
17803 }
17804
17805 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
17806 }
17807
17808 // No scalable offsets allowed for non-scalable types.
17809 if (AM.ScalableOffset)
17810 return false;
17811
17812 // check reg + imm case:
17813 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
17814 uint64_t NumBytes = 0;
17815 if (Ty->isSized()) {
17816 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
17817 NumBytes = NumBits / 8;
17818 if (!isPowerOf2_64(NumBits))
17819 NumBytes = 0;
17820 }
17821
17822 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
17823 AM.Scale);
17824}
17825
17826 // Check whether the two offsets belong to the same imm24 range and share the
17827 // same high 12 bits. If so, that high part can be folded into an add offset.
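// For example (illustrative): MinOffset == 0x1234 and MaxOffset == 0x1ff8 share
// the high part 0x1000, which is itself a legal add immediate, so 0x1000 is
// returned and both offsets can be rebased against it.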
17828int64_t
17830 int64_t MaxOffset) const {
17831 int64_t HighPart = MinOffset & ~0xfffULL;
17832 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
17833 // Rebase the value to an integer multiple of imm12.
17834 return HighPart;
17835 }
17836
17837 return 0;
17838}
17839
17841 // Consider splitting large offset of struct or array.
17842 return true;
17843}
17844
17846 const MachineFunction &MF, EVT VT) const {
17847 VT = VT.getScalarType();
17848
17849 if (!VT.isSimple())
17850 return false;
17851
17852 switch (VT.getSimpleVT().SimpleTy) {
17853 case MVT::f16:
17854 return Subtarget->hasFullFP16();
17855 case MVT::f32:
17856 case MVT::f64:
17857 return true;
17858 default:
17859 break;
17860 }
17861
17862 return false;
17863}
17864
17866 Type *Ty) const {
17867 switch (Ty->getScalarType()->getTypeID()) {
17868 case Type::FloatTyID:
17869 case Type::DoubleTyID:
17870 return true;
17871 default:
17872 return false;
17873 }
17874}
17875
17877 EVT VT, CodeGenOptLevel OptLevel) const {
17878 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
17880}
17881
17882const MCPhysReg *
17884 // LR is a callee-save register, but we must treat it as clobbered by any call
17885 // site. Hence we include LR in the scratch registers, which are in turn added
17886 // as implicit-defs for stackmaps and patchpoints.
17887 static const MCPhysReg ScratchRegs[] = {
17888 AArch64::X16, AArch64::X17, AArch64::LR, 0
17889 };
17890 return ScratchRegs;
17891}
17892
17894 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
17895 return RCRegs;
17896}
17897
17898bool
17900 CombineLevel Level) const {
17901 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
17902 N->getOpcode() == ISD::SRL) &&
17903 "Expected shift op");
17904
17905 SDValue ShiftLHS = N->getOperand(0);
17906 EVT VT = N->getValueType(0);
17907
17908 if (!ShiftLHS->hasOneUse())
17909 return false;
17910
17911 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
17912 !ShiftLHS.getOperand(0)->hasOneUse())
17913 return false;
17914
17915 // If ShiftLHS is an unsigned bit extraction ((x >> C) & mask), do not combine
17916 // it with shift 'N' so it can still be lowered to UBFX, except for:
17917 // ((x >> C) & mask) << C.
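// E.g. (illustrative): for ((x >> 3) & 0xff), which lowers to UBFX, only the
// matching-shift case ((x >> 3) & 0xff) << 3 is still combined below.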
17918 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
17919 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
17920 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
17921 if (isMask_64(TruncMask)) {
17922 SDValue AndLHS = ShiftLHS.getOperand(0);
17923 if (AndLHS.getOpcode() == ISD::SRL) {
17924 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
17925 if (N->getOpcode() == ISD::SHL)
17926 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
17927 return SRLC->getZExtValue() == SHLC->getZExtValue();
17928 return false;
17929 }
17930 }
17931 }
17932 }
17933 return true;
17934}
17935
17937 const SDNode *N) const {
17938 assert(N->getOpcode() == ISD::XOR &&
17939 (N->getOperand(0).getOpcode() == ISD::SHL ||
17940 N->getOperand(0).getOpcode() == ISD::SRL) &&
17941 "Expected XOR(SHIFT) pattern");
17942
17943 // Only commute if the entire NOT mask is a hidden shifted mask.
17944 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
17945 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
17946 if (XorC && ShiftC) {
17947 unsigned MaskIdx, MaskLen;
17948 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
17949 unsigned ShiftAmt = ShiftC->getZExtValue();
17950 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17951 if (N->getOperand(0).getOpcode() == ISD::SHL)
17952 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
17953 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
17954 }
17955 }
17956
17957 return false;
17958}
17959
17961 const SDNode *N, CombineLevel Level) const {
17962 assert(((N->getOpcode() == ISD::SHL &&
17963 N->getOperand(0).getOpcode() == ISD::SRL) ||
17964 (N->getOpcode() == ISD::SRL &&
17965 N->getOperand(0).getOpcode() == ISD::SHL)) &&
17966 "Expected shift-shift mask");
17967 // Don't allow multiuse shift folding with the same shift amount.
17968 if (!N->getOperand(0)->hasOneUse())
17969 return false;
17970
17971 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
17972 EVT VT = N->getValueType(0);
17973 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
17974 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
17975 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
17976 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
17977 }
17978
17979 // We do not need to fold when this shift is used in the specific load case:
17980 // (ldr x, (add x, (shl (srl x, c1) 2)))
17981 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
17982 if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
17983 unsigned ShlAmt = C2->getZExtValue();
17984 if (auto ShouldADD = *N->user_begin();
17985 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
17986 if (auto ShouldLOAD = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
17987 unsigned ByteVT = ShouldLOAD->getMemoryVT().getSizeInBits() / 8;
17988 if ((1ULL << ShlAmt) == ByteVT &&
17989 isIndexedLoadLegal(ISD::PRE_INC, ShouldLOAD->getMemoryVT()))
17990 return false;
17991 }
17992 }
17993 }
17994 }
17995
17996 return true;
17997}
17998
18000 unsigned BinOpcode, EVT VT) const {
18001 return VT.isScalableVector() && isTypeLegal(VT);
18002}
18003
18005 Type *Ty) const {
18006 assert(Ty->isIntegerTy());
18007
18008 unsigned BitSize = Ty->getPrimitiveSizeInBits();
18009 if (BitSize == 0)
18010 return false;
18011
18012 int64_t Val = Imm.getSExtValue();
18013 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
18014 return true;
18015
18016 if ((int64_t)Val < 0)
18017 Val = ~Val;
18018 if (BitSize == 32)
18019 Val &= (1LL << 32) - 1;
18020
18021 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
18022 // MOVZ is free so return true for one or fewer MOVK.
18023 return Shift < 3;
18024}
18025
18027 unsigned Index) const {
18029 return false;
18030
18031 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
18032}
18033
18034/// Turn vector tests of the signbit in the form of:
18035/// xor (sra X, elt_size(X)-1), -1
18036/// into:
18037/// cmge X, X, #0
18039 const AArch64Subtarget *Subtarget) {
18040 EVT VT = N->getValueType(0);
18041 if (!Subtarget->hasNEON() || !VT.isVector())
18042 return SDValue();
18043
18044 // There must be a shift right algebraic before the xor, and the xor must be a
18045 // 'not' operation.
18046 SDValue Shift = N->getOperand(0);
18047 SDValue Ones = N->getOperand(1);
18048 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
18050 return SDValue();
18051
18052 // The shift should be smearing the sign bit across each vector element.
18053 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
18054 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
18055 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
18056 return SDValue();
18057
18058 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
18059}
18060
18061// Given a vecreduce_add node, detect the below pattern and convert it to the
18062 // node sequence with UABDL, [S|U]ABD and UADDLP.
18063//
18064// i32 vecreduce_add(
18065// v16i32 abs(
18066// v16i32 sub(
18067// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
18068// =================>
18069// i32 vecreduce_add(
18070// v4i32 UADDLP(
18071// v8i16 add(
18072// v8i16 zext(
18073// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
18074// v8i16 zext(
18075// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
18077 SelectionDAG &DAG) {
18078 // Assumed i32 vecreduce_add
18079 if (N->getValueType(0) != MVT::i32)
18080 return SDValue();
18081
18082 SDValue VecReduceOp0 = N->getOperand(0);
18083 unsigned Opcode = VecReduceOp0.getOpcode();
18084 // Assumed v16i32 abs
18085 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
18086 return SDValue();
18087
18088 SDValue ABS = VecReduceOp0;
18089 // Assumed v16i32 sub
18090 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
18091 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
18092 return SDValue();
18093
18094 SDValue SUB = ABS->getOperand(0);
18095 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
18096 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
18097 // Assumed v16i32 type
18098 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
18099 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
18100 return SDValue();
18101
18102 // Assumed zext or sext
18103 bool IsZExt = false;
18104 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
18105 IsZExt = true;
18106 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
18107 IsZExt = false;
18108 } else
18109 return SDValue();
18110
18111 SDValue EXT0 = SUB->getOperand(0);
18112 SDValue EXT1 = SUB->getOperand(1);
18113 // Assumed zext's operand has v16i8 type
18114 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
18115 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
18116 return SDValue();
18117
18118 // Pattern is detected. Let's convert it to a sequence of nodes.
18119 SDLoc DL(N);
18120
18121 // First, create the node pattern of UABD/SABD.
18122 SDValue UABDHigh8Op0 =
18123 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18124 DAG.getConstant(8, DL, MVT::i64));
18125 SDValue UABDHigh8Op1 =
18126 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18127 DAG.getConstant(8, DL, MVT::i64));
18128 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18129 UABDHigh8Op0, UABDHigh8Op1);
18130 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
18131
18132 // Second, create the node pattern of UABAL.
18133 SDValue UABDLo8Op0 =
18134 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18135 DAG.getConstant(0, DL, MVT::i64));
18136 SDValue UABDLo8Op1 =
18137 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18138 DAG.getConstant(0, DL, MVT::i64));
18139 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18140 UABDLo8Op0, UABDLo8Op1);
18141 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
18142 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
18143
18144 // Third, create the node of UADDLP.
18145 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
18146
18147 // Fourth, create the node of VECREDUCE_ADD.
18148 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
18149}
18150
18151 // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce:
18152 //   vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
18153 //   vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
18154 // If we have vectors larger than v16i8, we extract v16i8 subvectors,
18155 // follow the same steps above to get DOT instructions, concatenate them,
18156 // and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
18158 const AArch64Subtarget *ST) {
18159 if (!ST->isNeonAvailable())
18160 return SDValue();
18161
18162 if (!ST->hasDotProd())
18164
18165 SDValue Op0 = N->getOperand(0);
18166 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
18167 Op0.getValueType().getVectorElementType() != MVT::i32)
18168 return SDValue();
18169
18170 unsigned ExtOpcode = Op0.getOpcode();
18171 SDValue A = Op0;
18172 SDValue B;
18173 unsigned DotOpcode;
18174 if (ExtOpcode == ISD::MUL) {
18175 A = Op0.getOperand(0);
18176 B = Op0.getOperand(1);
18177 if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
18178 return SDValue();
18179 auto OpCodeA = A.getOpcode();
18180 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
18181 return SDValue();
18182
18183 auto OpCodeB = B.getOpcode();
18184 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
18185 return SDValue();
18186
18187 if (OpCodeA == OpCodeB) {
18188 DotOpcode =
18190 } else {
18191 // Check USDOT support.
18192 if (!ST->hasMatMulInt8())
18193 return SDValue();
18194 DotOpcode = AArch64ISD::USDOT;
18195 if (OpCodeA == ISD::SIGN_EXTEND)
18196 std::swap(A, B);
18197 }
18198 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
18199 DotOpcode = AArch64ISD::UDOT;
18200 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
18201 DotOpcode = AArch64ISD::SDOT;
18202 } else {
18203 return SDValue();
18204 }
18205
18206 EVT Op0VT = A.getOperand(0).getValueType();
18207 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
18208 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
18209 if (!IsValidElementCount || !IsValidSize)
18210 return SDValue();
18211
18212 SDLoc DL(Op0);
18213 // For non-mla reductions B can be set to 1. For MLA we take the operand of
18214 // the extend of B.
18215 if (!B)
18216 B = DAG.getConstant(1, DL, Op0VT);
18217 else
18218 B = B.getOperand(0);
18219
18220 bool IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
18221 unsigned NumOfVecReduce;
18222 EVT TargetType;
18223 if (IsMultipleOf16) {
18224 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
18225 TargetType = MVT::v4i32;
18226 } else {
18227 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
18228 TargetType = MVT::v2i32;
18229 }
18230 // Handle the case where we need to generate only one Dot operation.
18231 if (NumOfVecReduce == 1) {
18232 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
18233 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
18234 A.getOperand(0), B);
18235 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
18236 }
18237 // Generate Dot instructions that are multiple of 16.
18238 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
18239 SmallVector<SDValue, 4> SDotVec16;
18240 unsigned I = 0;
18241 for (; I < VecReduce16Num; I += 1) {
18242 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
18243 SDValue Op0 =
18244 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
18245 DAG.getConstant(I * 16, DL, MVT::i64));
18246 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
18247 DAG.getConstant(I * 16, DL, MVT::i64));
18248 SDValue Dot =
18249 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
18250 SDotVec16.push_back(Dot);
18251 }
18252 // Concatenate dot operations.
18253 EVT SDot16EVT =
18254 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
18255 SDValue ConcatSDot16 =
18256 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
18257 SDValue VecReduceAdd16 =
18258 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
18259 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
18260 if (VecReduce8Num == 0)
18261 return VecReduceAdd16;
18262
18263 // Generate the remainder Dot operation that is multiple of 8.
18264 SmallVector<SDValue, 4> SDotVec8;
18265 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
18266 SDValue Vec8Op0 =
18267 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
18268 DAG.getConstant(I * 16, DL, MVT::i64));
18269 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
18270 DAG.getConstant(I * 16, DL, MVT::i64));
18271 SDValue Dot =
18272 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
18273  SDValue VecReduceAdd8 =
18274      DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
18275  return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
18276                     VecReduceAdd8);
18277}
18278
18279// Given an (integer) vecreduce, we know the order of the inputs does not
18280// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
18281// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
18282// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
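// Editor's sketch (assumed example, not from the source): a scalar check of
// why the two reductions agree for, e.g., a v8i16 input x viewed as
// uint16_t x[8].
//   uint32_t LoHi = 0, Pairs = 0;
//   for (int I = 0; I < 4; ++I)
//     LoHi += (uint32_t)x[I] + (uint32_t)x[I + 4];          // zext(lo)+zext(hi)
//   for (int I = 0; I < 4; ++I)
//     Pairs += (uint32_t)x[2 * I] + (uint32_t)x[2 * I + 1]; // UADDLP pairs
//   // LoHi == Pairs, so UADDV of either form yields the same value.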
18283static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
18284  auto DetectAddExtract = [&](SDValue A) {
18285 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
18286 // UADDLP(x) if found.
18287 assert(A.getOpcode() == ISD::ADD);
18288 EVT VT = A.getValueType();
18289 SDValue Op0 = A.getOperand(0);
18290 SDValue Op1 = A.getOperand(1);
18291 if (Op0.getOpcode() != Op1.getOpcode() ||
18292 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
18293 Op0.getOpcode() != ISD::SIGN_EXTEND))
18294 return SDValue();
18295 SDValue Ext0 = Op0.getOperand(0);
18296 SDValue Ext1 = Op1.getOperand(0);
18297    if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18298        Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18299        Ext0.getOperand(0) != Ext1.getOperand(0))
18300 return SDValue();
18301    // Check that the type is twice the add types, and the extracts are from
18302    // upper/lower parts of the same source.
18303    if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
18304        VT.getVectorNumElements() * 2)
18305 return SDValue();
18306    if ((Ext0.getConstantOperandVal(1) != 0 ||
18307         Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
18308        (Ext1.getConstantOperandVal(1) != 0 ||
18309         Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
18310      return SDValue();
18311    unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
18312                                                          : AArch64ISD::SADDLP;
18313    return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
18314 };
18315
18316 if (SDValue R = DetectAddExtract(A))
18317 return R;
18318
18319 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
18320 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
18321 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
18322 A.getOperand(1));
18323 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
18324 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
18325 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
18326 A.getOperand(0));
18327 return SDValue();
18328}
18329
18330// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
18331// UADDLV(concat), where the concat represents the 64-bit zext sources.
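// Editor's sketch (assumed example, not from the source): for two v8i8 inputs
// a and b, UADDV(add(zext(a), zext(b))) sums a[i] + b[i] over every lane,
// which is exactly the widened sum over the 16 bytes of concat(a, b):
//   uint32_t Sum = 0;
//   for (int I = 0; I < 8; ++I)
//     Sum += (uint32_t)a[I] + (uint32_t)b[I];
//   // Sum equals the UADDLV-style reduction over concat(a, b).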
18332static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
18333  // Look for add(zext(64-bit source), zext(64-bit source)), returning
18334 // UADDLV(concat(zext, zext)) if found.
18335 assert(A.getOpcode() == ISD::ADD);
18336 EVT VT = A.getValueType();
18337 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
18338 return SDValue();
18339 SDValue Op0 = A.getOperand(0);
18340 SDValue Op1 = A.getOperand(1);
18341 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
18342 return SDValue();
18343 SDValue Ext0 = Op0.getOperand(0);
18344 SDValue Ext1 = Op1.getOperand(0);
18345 EVT ExtVT0 = Ext0.getValueType();
18346 EVT ExtVT1 = Ext1.getValueType();
18347 // Check zext VTs are the same and 64-bit length.
18348 if (ExtVT0 != ExtVT1 ||
18349 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
18350 return SDValue();
18351 // Get VT for concat of zext sources.
18352 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
18353 SDValue Concat =
18354 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
18355
18356 switch (VT.getSimpleVT().SimpleTy) {
18357 case MVT::v2i64:
18358 case MVT::v4i32:
18359 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
18360 case MVT::v8i16: {
18361 SDValue Uaddlv =
18362 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
18363 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
18364 }
18365 default:
18366 llvm_unreachable("Unhandled vector type");
18367 }
18368}
18369
18370static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
18371  SDValue A = N->getOperand(0);
18372 if (A.getOpcode() == ISD::ADD) {
18373 if (SDValue R = performUADDVAddCombine(A, DAG))
18374 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
18375 else if (SDValue R = performUADDVZextCombine(A, DAG))
18376 return R;
18377 }
18378 return SDValue();
18379}
18380
18381static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
18382                                 TargetLowering::DAGCombinerInfo &DCI,
18383                                 const AArch64Subtarget *Subtarget) {
18384 if (DCI.isBeforeLegalizeOps())
18385 return SDValue();
18386
18387 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
18388}
18389
18390SDValue
18391AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
18392 SelectionDAG &DAG,
18393 SmallVectorImpl<SDNode *> &Created) const {
18394  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
18395  if (isIntDivCheap(N->getValueType(0), Attr))
18396 return SDValue(N, 0); // Lower SDIV as SDIV
18397
18398 EVT VT = N->getValueType(0);
18399
18400  // For scalable and fixed types, mark them as cheap so we can handle them
18401  // much later. This allows us to handle larger-than-legal types.
18402 if (VT.isScalableVector() ||
18403 (VT.isFixedLengthVector() && Subtarget->useSVEForFixedLengthVectors()))
18404 return SDValue(N, 0);
18405
18406 // fold (sdiv X, pow2)
18407 if ((VT != MVT::i32 && VT != MVT::i64) ||
18408 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18409 return SDValue();
18410
18411 // If the divisor is 2 or -2, the default expansion is better. It will add
18412 // (N->getValueType(0) >> (BitWidth - 1)) to it before shifting right.
18413 if (Divisor == 2 ||
18414 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
18415 return SDValue();
18416
18417 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
18418}
18419
18420SDValue
18421AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
18422 SelectionDAG &DAG,
18423 SmallVectorImpl<SDNode *> &Created) const {
18424  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
18425  if (isIntDivCheap(N->getValueType(0), Attr))
18426 return SDValue(N, 0); // Lower SREM as SREM
18427
18428 EVT VT = N->getValueType(0);
18429
18430  // For scalable and fixed types, mark them as cheap so we can handle them
18431  // much later. This allows us to handle larger-than-legal types.
18432 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
18433 return SDValue(N, 0);
18434
18435 // fold (srem X, pow2)
18436 if ((VT != MVT::i32 && VT != MVT::i64) ||
18437 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18438 return SDValue();
18439
18440 unsigned Lg2 = Divisor.countr_zero();
18441 if (Lg2 == 0)
18442 return SDValue();
18443
18444 SDLoc DL(N);
18445 SDValue N0 = N->getOperand(0);
18446 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
18447 SDValue Zero = DAG.getConstant(0, DL, VT);
18448 SDValue CCVal, CSNeg;
18449 if (Lg2 == 1) {
18450 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
18451 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
18452 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
18453
18454 Created.push_back(Cmp.getNode());
18455 Created.push_back(And.getNode());
18456 } else {
18457 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
18458 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18459
18460 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
18461 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
18462 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
18463 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
18464 Negs.getValue(1));
18465
18466 Created.push_back(Negs.getNode());
18467 Created.push_back(AndPos.getNode());
18468 Created.push_back(AndNeg.getNode());
18469 }
18470
18471 return CSNeg;
18472}
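// Editor's sketch of the identity used by BuildSREMPow2 above (plain C, not
// part of the lowering; the helper name is hypothetical). For a power-of-two
// divisor 1 << Lg2 with truncating C remainder semantics:
//   int64_t srem_pow2(int64_t X, unsigned Lg2) {
//     uint64_t Mask = (1ULL << Lg2) - 1;   // Pow2MinusOne
//     int64_t AndPos = X & Mask;           // remainder when X >= 0
//     int64_t AndNeg = (-X) & Mask;        // remainder magnitude when X < 0
//     return X < 0 ? -AndNeg : AndPos;     // the CSNEG select on the sign
//   }
// e.g. srem_pow2(-5, 2) == -1 and srem_pow2(7, 2) == 3, matching -5 % 4 and
// 7 % 4.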
18473
18474static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
18475 switch(getIntrinsicID(S.getNode())) {
18476 default:
18477 break;
18478 case Intrinsic::aarch64_sve_cntb:
18479 return 8;
18480 case Intrinsic::aarch64_sve_cnth:
18481 return 16;
18482 case Intrinsic::aarch64_sve_cntw:
18483 return 32;
18484 case Intrinsic::aarch64_sve_cntd:
18485 return 64;
18486 }
18487 return {};
18488}
18489
18490/// Calculates what the pre-extend type is, based on the extension
18491/// operation node provided by \p Extend.
18492///
18493/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
18494/// pre-extend type is pulled directly from the operand, while other extend
18495/// operations need a bit more inspection to get this information.
18496///
18497/// \param Extend The SDNode from the DAG that represents the extend operation
18498///
18499/// \returns The type representing the \p Extend source type, or \p MVT::Other
18500/// if no valid type can be determined
18502 switch (Extend.getOpcode()) {
18503 case ISD::SIGN_EXTEND:
18504 case ISD::ZERO_EXTEND:
18505 case ISD::ANY_EXTEND:
18506 return Extend.getOperand(0).getValueType();
18507 case ISD::AssertSext:
18508  case ISD::AssertZext:
18509  case ISD::SIGN_EXTEND_INREG: {
18510 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
18511 if (!TypeNode)
18512 return MVT::Other;
18513 return TypeNode->getVT();
18514 }
18515 case ISD::AND: {
18516    ConstantSDNode *Constant =
18517        dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
18518 if (!Constant)
18519 return MVT::Other;
18520
18521 uint32_t Mask = Constant->getZExtValue();
18522
18523 if (Mask == UCHAR_MAX)
18524 return MVT::i8;
18525 else if (Mask == USHRT_MAX)
18526 return MVT::i16;
18527 else if (Mask == UINT_MAX)
18528 return MVT::i32;
18529
18530 return MVT::Other;
18531 }
18532 default:
18533 return MVT::Other;
18534 }
18535}
18536
18537/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
18538/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
18539/// SExt/ZExt rather than the scalar SExt/ZExt
18540static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
18541  EVT VT = BV.getValueType();
18542  if (BV.getOpcode() != ISD::BUILD_VECTOR &&
18543      BV.getOpcode() != ISD::VECTOR_SHUFFLE)
18544    return SDValue();
18545
18546 // Use the first item in the buildvector/shuffle to get the size of the
18547 // extend, and make sure it looks valid.
18548 SDValue Extend = BV->getOperand(0);
18549 unsigned ExtendOpcode = Extend.getOpcode();
18550 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
18551 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
18552 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
18553 ExtendOpcode == ISD::AssertSext;
18554 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
18555 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
18556 return SDValue();
18557 // Shuffle inputs are vector, limit to SIGN_EXTEND/ZERO_EXTEND/ANY_EXTEND to
18558 // ensure calculatePreExtendType will work without issue.
18559 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
18560 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
18561 return SDValue();
18562
18563 // Restrict valid pre-extend data type
18564 EVT PreExtendType = calculatePreExtendType(Extend);
18565 if (PreExtendType == MVT::Other ||
18566 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
18567 return SDValue();
18568
18569 // Make sure all other operands are equally extended.
18570 bool SeenZExtOrSExt = !IsAnyExt;
18571 for (SDValue Op : drop_begin(BV->ops())) {
18572 if (Op.isUndef())
18573 continue;
18574
18575 if (calculatePreExtendType(Op) != PreExtendType)
18576 return SDValue();
18577
18578 unsigned Opc = Op.getOpcode();
18579 if (Opc == ISD::ANY_EXTEND)
18580 continue;
18581
18582 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
18583 Opc == ISD::AssertSext;
18584
18585 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
18586 return SDValue();
18587
18588 IsSExt = OpcIsSExt;
18589 SeenZExtOrSExt = true;
18590 }
18591
18592 SDValue NBV;
18593 SDLoc DL(BV);
18594 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
18595 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
18596 EVT PreExtendLegalType =
18597 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
18598    SmallVector<SDValue, 8> NewOps;
18599    for (SDValue Op : BV->ops())
18600 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
18601 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
18602 PreExtendLegalType));
18603 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
18604 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
18605 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
18606 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
18607 BV.getOperand(1).isUndef()
18608 ? DAG.getUNDEF(PreExtendVT)
18609 : BV.getOperand(1).getOperand(0),
18610 cast<ShuffleVectorSDNode>(BV)->getMask());
18611 }
18612  unsigned ExtOpc = !SeenZExtOrSExt
18613                        ? ISD::ANY_EXTEND
18614                        : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
18615 return DAG.getNode(ExtOpc, DL, VT, NBV);
18616}
18617
18618/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
18619/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
18620static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
18621  // If the value type isn't a vector, none of the operands are going to be dups
18622 EVT VT = Mul->getValueType(0);
18623 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
18624 return SDValue();
18625
18626 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
18627 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
18628
18629 // Neither operands have been changed, don't make any further changes
18630 if (!Op0 && !Op1)
18631 return SDValue();
18632
18633 SDLoc DL(Mul);
18634 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
18635 Op1 ? Op1 : Mul->getOperand(1));
18636}
18637
18638// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
18639// Same for other types with equivalent constants.
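// Editor's sketch (not from the source) of the per-lane identity for the
// v4i32 -> v8i16 case; both helpers below are hypothetical.
//   uint32_t lane_via_mul(uint32_t X) {
//     return ((X >> 15) & 0x10001u) * 0xffffu;   // the matched pattern
//   }
//   uint32_t lane_via_cmltz(uint32_t X) {        // CMLTz on the two i16 halves
//     uint32_t Lo = (int16_t)(X & 0xffff) < 0 ? 0xffffu : 0u;
//     uint32_t Hi = (int16_t)(X >> 16) < 0 ? 0xffffu : 0u;
//     return (Hi << 16) | Lo;
//   }
//   // lane_via_mul(X) == lane_via_cmltz(X) for every 32-bit X.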
18640static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
18641  EVT VT = N->getValueType(0);
18642 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
18643 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
18644 return SDValue();
18645 if (N->getOperand(0).getOpcode() != ISD::AND ||
18646 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
18647 return SDValue();
18648
18649 SDValue And = N->getOperand(0);
18650 SDValue Srl = And.getOperand(0);
18651
18652 APInt V1, V2, V3;
18653 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
18654      !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
18655      !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
18656    return SDValue();
18657
18658 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
18659 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
18660 V3 != (HalfSize - 1))
18661 return SDValue();
18662
18663 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
18664 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
18665 VT.getVectorElementCount() * 2);
18666
18667 SDLoc DL(N);
18668 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
18669 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
18670 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
18671}
18672
18673// Transform vector add(zext i8 to i32, zext i8 to i32)
18674// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
18675// This allows extra uses of saddl/uaddl at the lower vector widths, and less
18676// extends.
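// Editor's note with an assumed example (not from the source): the narrower
// add cannot overflow, so the extra sext is exact. For the zext i8 case the
// lane sum is at most 255 + 255 = 510 < 2^15, and for the sext i8 case it lies
// in [-256, 254]; either way it fits in i16 and sign-extending the i16 result
// to i32 reproduces the original wide sum, e.g.:
//   (int32_t)(int16_t)((int16_t)200 + (int16_t)250) == 200 + 250 == 450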
18677static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
18678  EVT VT = N->getValueType(0);
18679 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
18680 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
18681 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
18682 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
18683 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
18684 N->getOperand(0).getOperand(0).getValueType() !=
18685 N->getOperand(1).getOperand(0).getValueType())
18686 return SDValue();
18687
18688 if (N->getOpcode() == ISD::MUL &&
18689 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
18690 return SDValue();
18691
18692 SDValue N0 = N->getOperand(0).getOperand(0);
18693 SDValue N1 = N->getOperand(1).getOperand(0);
18694 EVT InVT = N0.getValueType();
18695
18696 EVT S1 = InVT.getScalarType();
18697 EVT S2 = VT.getScalarType();
18698 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
18699 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
18700 SDLoc DL(N);
18701    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
18702                                  S2.getHalfSizedIntegerVT(*DAG.getContext()),
18703                                  VT.getVectorElementCount());
18704 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
18705 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
18706 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
18707 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
18708 : (unsigned)ISD::SIGN_EXTEND,
18709 DL, VT, NewOp);
18710 }
18711 return SDValue();
18712}
18713
18714static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
18715                                 TargetLowering::DAGCombinerInfo &DCI,
18716                                 const AArch64Subtarget *Subtarget) {
18717
18718 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
18719 return Ext;
18720  if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
18721    return Ext;
18722 if (SDValue Ext = performVectorExtCombine(N, DAG))
18723 return Ext;
18724
18725 if (DCI.isBeforeLegalizeOps())
18726 return SDValue();
18727
18728 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
18729 // and in MachineCombiner pass, add+mul will be combined into madd.
18730 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
18731 SDLoc DL(N);
18732 EVT VT = N->getValueType(0);
18733 SDValue N0 = N->getOperand(0);
18734 SDValue N1 = N->getOperand(1);
18735 SDValue MulOper;
18736 unsigned AddSubOpc;
18737
18738 auto IsAddSubWith1 = [&](SDValue V) -> bool {
18739 AddSubOpc = V->getOpcode();
18740 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
18741 SDValue Opnd = V->getOperand(1);
18742 MulOper = V->getOperand(0);
18743 if (AddSubOpc == ISD::SUB)
18744 std::swap(Opnd, MulOper);
18745 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
18746 return C->isOne();
18747 }
18748 return false;
18749 };
18750
18751 if (IsAddSubWith1(N0)) {
18752 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
18753 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
18754 }
18755
18756 if (IsAddSubWith1(N1)) {
18757 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
18758 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
18759 }
18760
18761 // The below optimizations require a constant RHS.
18762 if (!isa<ConstantSDNode>(N1))
18763 return SDValue();
18764
18765 ConstantSDNode *C = cast<ConstantSDNode>(N1);
18766 const APInt &ConstValue = C->getAPIntValue();
18767
18768 // Allow the scaling to be folded into the `cnt` instruction by preventing
18769 // the scaling to be obscured here. This makes it easier to pattern match.
18770 if (IsSVECntIntrinsic(N0) ||
18771 (N0->getOpcode() == ISD::TRUNCATE &&
18772 (IsSVECntIntrinsic(N0->getOperand(0)))))
18773 if (ConstValue.sge(1) && ConstValue.sle(16))
18774 return SDValue();
18775
18776 // Multiplication of a power of two plus/minus one can be done more
18777 // cheaply as shift+add/sub. For now, this is true unilaterally. If
18778 // future CPUs have a cheaper MADD instruction, this may need to be
18779 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
18780 // 64-bit is 5 cycles, so this is always a win.
18781 // More aggressively, some multiplications N0 * C can be lowered to
18782 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
18783 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
18784 // TODO: lower more cases.
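  // Editor's sketch (assumed standalone example, not part of this file)
  // checking the shift+add decompositions described above for a few constants:
  //   uint64_t mul45(uint64_t X) {        // 45 = (1 + 4) * (1 + 8)
  //     uint64_t MV = (X << 2) + X;       // 5 * X
  //     return (MV << 3) + MV;            // 40 * X + 5 * X
  //   }
  //   uint64_t mul11(uint64_t X) {        // 11 = (1 + 4) * 2 + 1
  //     uint64_t MV = (X << 2) + X;       // 5 * X
  //     return (MV << 1) + X;             // 10 * X + X
  //   }
  //   uint64_t mul29(uint64_t X) {        // 29 = 1 - (1 - 8) * 4
  //     uint64_t MV = X - (X << 3);       // -7 * X (mod 2^64)
  //     return X - (MV << 2);             // X + 28 * X
  //   }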
18785
18786 // TrailingZeroes is used to test if the mul can be lowered to
18787 // shift+add+shift.
18788 unsigned TrailingZeroes = ConstValue.countr_zero();
18789 if (TrailingZeroes) {
18790 // Conservatively do not lower to shift+add+shift if the mul might be
18791 // folded into smul or umul.
18792 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
18793 isZeroExtended(N0, DAG)))
18794 return SDValue();
18795 // Conservatively do not lower to shift+add+shift if the mul might be
18796 // folded into madd or msub.
18797 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
18798 N->user_begin()->getOpcode() == ISD::SUB))
18799 return SDValue();
18800 }
18801 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
18802 // and shift+add+shift.
18803 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
18804 unsigned ShiftAmt;
18805
18806 auto Shl = [&](SDValue N0, unsigned N1) {
18807 if (!N0.getNode())
18808 return SDValue();
18809 // If shift causes overflow, ignore this combine.
18810 if (N1 >= N0.getValueSizeInBits())
18811 return SDValue();
18812 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
18813 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
18814 };
18815 auto Add = [&](SDValue N0, SDValue N1) {
18816 if (!N0.getNode() || !N1.getNode())
18817 return SDValue();
18818 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
18819 };
18820 auto Sub = [&](SDValue N0, SDValue N1) {
18821 if (!N0.getNode() || !N1.getNode())
18822 return SDValue();
18823 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
18824 };
18825 auto Negate = [&](SDValue N) {
18826 if (!N0.getNode())
18827 return SDValue();
18828 SDValue Zero = DAG.getConstant(0, DL, VT);
18829 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
18830 };
18831
18832  // Can the const C be decomposed into (1 + 2^M) * (1 + 2^N), e.g.:
18833  // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1)
18834  // because the (2^N - 1) factor can't be materialized with a single instruction.
18835 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
18836 unsigned BitWidth = C.getBitWidth();
18837 for (unsigned i = 1; i < BitWidth / 2; i++) {
18838 APInt Rem;
18839 APInt X(BitWidth, (1 << i) + 1);
18840 APInt::sdivrem(C, X, N, Rem);
18841 APInt NVMinus1 = N - 1;
18842 if (Rem == 0 && NVMinus1.isPowerOf2()) {
18843 M = X;
18844 return true;
18845 }
18846 }
18847 return false;
18848 };
18849
18850  // Can the const C be decomposed into ((2^M + 1) * 2^N + 1), e.g.:
18851  // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1
18852  // because the (2^N - 1) factor can't be materialized with a single instruction.
18853 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
18854 APInt CVMinus1 = C - 1;
18855 if (CVMinus1.isNegative())
18856 return false;
18857 unsigned TrailingZeroes = CVMinus1.countr_zero();
18858 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
18859 if (SCVMinus1.isPowerOf2()) {
18860 unsigned BitWidth = SCVMinus1.getBitWidth();
18861 M = APInt(BitWidth, SCVMinus1.logBase2());
18862 N = APInt(BitWidth, TrailingZeroes);
18863 return true;
18864 }
18865 return false;
18866 };
18867
18868 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
18869 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
18870 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
18871 APInt CVMinus1 = C - 1;
18872 if (CVMinus1.isNegative())
18873 return false;
18874 unsigned TrailingZeroes = CVMinus1.countr_zero();
18875 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
18876 if (CVPlus1.isPowerOf2()) {
18877 unsigned BitWidth = CVPlus1.getBitWidth();
18878 M = APInt(BitWidth, CVPlus1.logBase2());
18879 N = APInt(BitWidth, TrailingZeroes);
18880 return true;
18881 }
18882 return false;
18883 };
18884
18885 if (ConstValue.isNonNegative()) {
18886 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
18887 // (mul x, 2^N - 1) => (sub (shl x, N), x)
18888 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
18889 // (mul x, (2^M + 1) * (2^N + 1))
18890 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
18891  // (mul x, ((2^M + 1) * 2^N + 1))
18892  // => MV = (add (shl x, M), x); (add (shl MV, N), x)
18893  // (mul x, (1 - (1 - 2^M) * 2^N))
18894  // => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
18895 APInt SCVMinus1 = ShiftedConstValue - 1;
18896 APInt SCVPlus1 = ShiftedConstValue + 1;
18897 APInt CVPlus1 = ConstValue + 1;
18898 APInt CVM, CVN;
18899 if (SCVMinus1.isPowerOf2()) {
18900 ShiftAmt = SCVMinus1.logBase2();
18901 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
18902 } else if (CVPlus1.isPowerOf2()) {
18903 ShiftAmt = CVPlus1.logBase2();
18904 return Sub(Shl(N0, ShiftAmt), N0);
18905 } else if (SCVPlus1.isPowerOf2()) {
18906 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
18907 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
18908 }
18909 if (Subtarget->hasALULSLFast() &&
18910 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
18911 APInt CVMMinus1 = CVM - 1;
18912 APInt CVNMinus1 = CVN - 1;
18913 unsigned ShiftM1 = CVMMinus1.logBase2();
18914 unsigned ShiftN1 = CVNMinus1.logBase2();
18915      // ALULSLFast implies that shifts by <= 4 places are fast
18916 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
18917 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
18918 return Add(Shl(MVal, ShiftN1), MVal);
18919 }
18920 }
18921 if (Subtarget->hasALULSLFast() &&
18922 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
18923 unsigned ShiftM = CVM.getZExtValue();
18924 unsigned ShiftN = CVN.getZExtValue();
18925      // ALULSLFast implies that shifts by <= 4 places are fast
18926 if (ShiftM <= 4 && ShiftN <= 4) {
18927 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
18928 return Add(Shl(MVal, CVN.getZExtValue()), N0);
18929 }
18930 }
18931
18932 if (Subtarget->hasALULSLFast() &&
18933 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
18934 unsigned ShiftM = CVM.getZExtValue();
18935 unsigned ShiftN = CVN.getZExtValue();
18936      // ALULSLFast implies that shifts by <= 4 places are fast
18937 if (ShiftM <= 4 && ShiftN <= 4) {
18938 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
18939 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
18940 }
18941 }
18942 } else {
18943 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
18944 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
18945 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
18946 APInt SCVPlus1 = -ShiftedConstValue + 1;
18947 APInt CVNegPlus1 = -ConstValue + 1;
18948 APInt CVNegMinus1 = -ConstValue - 1;
18949 if (CVNegPlus1.isPowerOf2()) {
18950 ShiftAmt = CVNegPlus1.logBase2();
18951 return Sub(N0, Shl(N0, ShiftAmt));
18952 } else if (CVNegMinus1.isPowerOf2()) {
18953 ShiftAmt = CVNegMinus1.logBase2();
18954 return Negate(Add(Shl(N0, ShiftAmt), N0));
18955 } else if (SCVPlus1.isPowerOf2()) {
18956 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
18957 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
18958 }
18959 }
18960
18961 return SDValue();
18962}
18963
18964static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
18965                                                         SelectionDAG &DAG) {
18966 // Take advantage of vector comparisons producing 0 or -1 in each lane to
18967 // optimize away operation when it's from a constant.
18968 //
18969 // The general transformation is:
18970 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
18971 // AND(VECTOR_CMP(x,y), constant2)
18972 // constant2 = UNARYOP(constant)
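  // Editor's worked example (assumed, not from the source) with
  // UNARYOP = uitofp on a v4i32 mask and constant <5, 5, 5, 5>: in a lane
  // where the compare is true the mask is 0xffffffff, so the AND yields 5 and
  // uitofp(5) == 5.0; in a false lane the AND yields 0 and uitofp(0) == 0.0,
  // whose bit pattern is all zeroes. Hence AND-ing the compare result with
  // constant2 = UNARYOP(constant) produces the same lanes as applying the
  // unary op after the original AND.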
18973
18974 // Early exit if this isn't a vector operation, the operand of the
18975 // unary operation isn't a bitwise AND, or if the sizes of the operations
18976 // aren't the same.
18977 EVT VT = N->getValueType(0);
18978 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
18979 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
18980 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
18981 return SDValue();
18982
18983 // Now check that the other operand of the AND is a constant. We could
18984 // make the transformation for non-constant splats as well, but it's unclear
18985 // that would be a benefit as it would not eliminate any operations, just
18986 // perform one more step in scalar code before moving to the vector unit.
18987 if (BuildVectorSDNode *BV =
18988 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
18989 // Bail out if the vector isn't a constant.
18990 if (!BV->isConstant())
18991 return SDValue();
18992
18993 // Everything checks out. Build up the new and improved node.
18994 SDLoc DL(N);
18995 EVT IntVT = BV->getValueType(0);
18996 // Create a new constant of the appropriate type for the transformed
18997 // DAG.
18998 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
18999 // The AND node needs bitcasts to/from an integer vector type around it.
19000 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
19001 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
19002 N->getOperand(0)->getOperand(0), MaskConst);
19003 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
19004 return Res;
19005 }
19006
19007 return SDValue();
19008}
19009
19010/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
19011/// functions; this can help to reduce the number of fmovs to/from GPRs.
19012static SDValue
19013tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
19014                                      TargetLowering::DAGCombinerInfo &DCI,
19015                                      const AArch64Subtarget *Subtarget) {
19016 if (N->isStrictFPOpcode())
19017 return SDValue();
19018
19019 if (DCI.isBeforeLegalizeOps())
19020 return SDValue();
19021
19022 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
19023 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
19024 return SDValue();
19025
19026 auto isSupportedType = [](EVT VT) {
19027 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
19028 };
19029
19030 SDValue SrcVal = N->getOperand(0);
19031 EVT SrcTy = SrcVal.getValueType();
19032 EVT DestTy = N->getValueType(0);
19033
19034 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
19035 return SDValue();
19036
19037 EVT SrcVecTy;
19038 EVT DestVecTy;
19039 if (DestTy.bitsGT(SrcTy)) {
19040 DestVecTy = getPackedSVEVectorVT(DestTy);
19041 SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
19042 } else {
19043 SrcVecTy = getPackedSVEVectorVT(SrcTy);
19044 DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
19045 }
19046
19047 // Ensure the resulting src/dest vector type is legal.
19048 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
19049 return SDValue();
19050
19051 SDLoc DL(N);
19052 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19053 SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
19054 DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
19055 SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
19056 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
19057}
19058
19059static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
19060                                     TargetLowering::DAGCombinerInfo &DCI,
19061                                     const AArch64Subtarget *Subtarget) {
19062 // First try to optimize away the conversion when it's conditionally from
19063 // a constant. Vectors only.
19064  if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
19065    return Res;
19066
19067 if (SDValue Res =
19068 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19069 return Res;
19070
19071 EVT VT = N->getValueType(0);
19072 if (VT != MVT::f32 && VT != MVT::f64)
19073 return SDValue();
19074
19075 // Only optimize when the source and destination types have the same width.
19076 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
19077 return SDValue();
19078
19079 // If the result of an integer load is only used by an integer-to-float
19080  // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
19081 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
19082 SDValue N0 = N->getOperand(0);
19083 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
19084 N0.hasOneUse() &&
19085 // Do not change the width of a volatile load.
19086 !cast<LoadSDNode>(N0)->isVolatile()) {
19087 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
19088 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
19089 LN0->getPointerInfo(), LN0->getAlign(),
19090 LN0->getMemOperand()->getFlags());
19091
19092 // Make sure successors of the original load stay after it by updating them
19093 // to use the new Chain.
19094 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
19095
19096    unsigned Opcode =
19097        (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
19098 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
19099 }
19100
19101 return SDValue();
19102}
19103
19104/// Fold a floating-point multiply by power of two into floating-point to
19105/// fixed-point conversion.
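// Editor's worked example (assumed, not from the source): multiplying by 2^C
// before a float->int conversion is the same as a fixed-point convert with C
// fractional bits, e.g. for x = 1.75f and C = 4:
//   (int)(1.75f * 16.0f) == 28
// so the FMUL by a power-of-two splat can be folded into the conversion
// (the fcvtzs/fcvtzu #fbits forms emitted via the vcvtfp2fx intrinsics below).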
19106static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
19107                                     TargetLowering::DAGCombinerInfo &DCI,
19108                                     const AArch64Subtarget *Subtarget) {
19109 if (SDValue Res =
19110 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19111 return Res;
19112
19113 if (!Subtarget->isNeonAvailable())
19114 return SDValue();
19115
19116 if (!N->getValueType(0).isSimple())
19117 return SDValue();
19118
19119 SDValue Op = N->getOperand(0);
19120 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
19121 return SDValue();
19122
19123 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
19124 return SDValue();
19125
19126 SDValue ConstVec = Op->getOperand(1);
19127 if (!isa<BuildVectorSDNode>(ConstVec))
19128 return SDValue();
19129
19130 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
19131 uint32_t FloatBits = FloatTy.getSizeInBits();
19132 if (FloatBits != 32 && FloatBits != 64 &&
19133 (FloatBits != 16 || !Subtarget->hasFullFP16()))
19134 return SDValue();
19135
19136 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
19137 uint32_t IntBits = IntTy.getSizeInBits();
19138 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
19139 return SDValue();
19140
19141 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
19142 if (IntBits > FloatBits)
19143 return SDValue();
19144
19145 BitVector UndefElements;
19146 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
19147 int32_t Bits = IntBits == 64 ? 64 : 32;
19148 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
19149 if (C == -1 || C == 0 || C > Bits)
19150 return SDValue();
19151
19152 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
19153 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
19154 return SDValue();
19155
19156 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
19157 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
19158 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
19159 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
19160 return SDValue();
19161 }
19162
19163 SDLoc DL(N);
19164 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
19165 N->getOpcode() == ISD::FP_TO_SINT_SAT);
19166 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
19167 : Intrinsic::aarch64_neon_vcvtfp2fxu;
19168  SDValue FixConv =
19169      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
19170 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
19171 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
19172 // We can handle smaller integers by generating an extra trunc.
19173 if (IntBits < FloatBits)
19174 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
19175
19176 return FixConv;
19177}
19178
19179static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19180                               const AArch64TargetLowering &TLI) {
19181 EVT VT = N->getValueType(0);
19182 SelectionDAG &DAG = DCI.DAG;
19183 SDLoc DL(N);
19184 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
19185
19186 if (!VT.isVector())
19187 return SDValue();
19188
19189 if (VT.isScalableVector() && !Subtarget.hasSVE2())
19190 return SDValue();
19191
19192 if (VT.isFixedLengthVector() &&
19193 (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT)))
19194 return SDValue();
19195
19196 SDValue N0 = N->getOperand(0);
19197 if (N0.getOpcode() != ISD::AND)
19198 return SDValue();
19199
19200 SDValue N1 = N->getOperand(1);
19201 if (N1.getOpcode() != ISD::AND)
19202 return SDValue();
19203
19204 // InstCombine does (not (neg a)) => (add a -1).
19205 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
19206 // Loop over all combinations of AND operands.
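  // Editor's note (assumed example, not from the source): in two's complement
  // ~(-a) == a - 1, so the (add a, -1) operand really is the bitwise NOT of
  // the (sub 0, a) operand; e.g. for a = 5, -a = 0xFFFFFFFB and
  // a - 1 = 4 = ~0xFFFFFFFB. That is what makes the BSP selection below valid.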
19207 for (int i = 1; i >= 0; --i) {
19208 for (int j = 1; j >= 0; --j) {
19209 SDValue O0 = N0->getOperand(i);
19210 SDValue O1 = N1->getOperand(j);
19211 SDValue Sub, Add, SubSibling, AddSibling;
19212
19213 // Find a SUB and an ADD operand, one from each AND.
19214 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
19215 Sub = O0;
19216 Add = O1;
19217 SubSibling = N0->getOperand(1 - i);
19218 AddSibling = N1->getOperand(1 - j);
19219 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
19220 Add = O0;
19221 Sub = O1;
19222 AddSibling = N0->getOperand(1 - i);
19223 SubSibling = N1->getOperand(1 - j);
19224 } else
19225 continue;
19226
19227      if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
19228        continue;
19229
19230      // The all-ones constant is always the right-hand operand of the Add.
19231 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
19232 continue;
19233
19234 if (Sub.getOperand(1) != Add.getOperand(0))
19235 continue;
19236
19237 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
19238 }
19239 }
19240
19241 // (or (and a b) (and (not a) c)) => (bsl a b c)
19242 // We only have to look for constant vectors here since the general, variable
19243 // case can be handled in TableGen.
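  // Editor's sketch of the underlying identity (hypothetical helper, not part
  // of the lowering):
  //   uint64_t bsl(uint64_t A, uint64_t B, uint64_t C) {
  //     return (A & B) | (~A & C);  // bit i comes from B where A has it set,
  //   }                             // otherwise from C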
19244 unsigned Bits = VT.getScalarSizeInBits();
19245 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
19246 for (int i = 1; i >= 0; --i)
19247 for (int j = 1; j >= 0; --j) {
19248 APInt Val1, Val2;
19249
19250      if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
19251          ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
19252          (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
19253 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
19254 N0->getOperand(1 - i), N1->getOperand(1 - j));
19255 }
19256 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
19257 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
19258 if (!BVN0 || !BVN1)
19259 continue;
19260
19261 bool FoundMatch = true;
19262 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
19263 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
19264 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
19265 if (!CN0 || !CN1 ||
19266 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
19267 FoundMatch = false;
19268 break;
19269 }
19270 }
19271 if (FoundMatch)
19272 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
19273 N0->getOperand(1 - i), N1->getOperand(1 - j));
19274 }
19275
19276 return SDValue();
19277}
19278
19279// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
19280// convert to csel(ccmp(.., cc0)), depending on cc1:
19281
19282// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19283// =>
19284// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
19285//
19286// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19287// =>
19288// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
19289static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
19290  EVT VT = N->getValueType(0);
19291 SDValue CSel0 = N->getOperand(0);
19292 SDValue CSel1 = N->getOperand(1);
19293
19294 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
19295 CSel1.getOpcode() != AArch64ISD::CSEL)
19296 return SDValue();
19297
19298 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
19299 return SDValue();
19300
19301 if (!isNullConstant(CSel0.getOperand(0)) ||
19302 !isOneConstant(CSel0.getOperand(1)) ||
19303 !isNullConstant(CSel1.getOperand(0)) ||
19304 !isOneConstant(CSel1.getOperand(1)))
19305 return SDValue();
19306
19307 SDValue Cmp0 = CSel0.getOperand(3);
19308 SDValue Cmp1 = CSel1.getOperand(3);
19309  AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
19310  AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
19311  if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
19312 return SDValue();
19313 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
19314 Cmp0.getOpcode() == AArch64ISD::SUBS) {
19315 std::swap(Cmp0, Cmp1);
19316 std::swap(CC0, CC1);
19317 }
19318
19319 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
19320 return SDValue();
19321
19322 SDLoc DL(N);
19323 SDValue CCmp, Condition;
19324 unsigned NZCV;
19325
19326  if (N->getOpcode() == ISD::AND) {
19327    AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
19328    Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
19329    NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
19330  } else {
19331    AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
19332    Condition = DAG.getConstant(CC0, DL, MVT_CC);
19333    NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
19334  }
19335
19336 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
19337
19338 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
19339 if (Op1 && Op1->getAPIntValue().isNegative() &&
19340 Op1->getAPIntValue().sgt(-32)) {
19341    // CCMP accepts constants in the range [0, 31], so if Op1 is a constant
19342    // in the range [-31, -1] we can select CCMN instead to avoid the extra
19343    // mov needed to materialize the negative immediate.
19344 SDValue AbsOp1 =
19345 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
19346 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
19347 NZCVOp, Condition, Cmp0);
19348 } else {
19349 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
19350 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
19351 }
19352 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
19353 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
19354 CCmp);
19355}
19356
19357static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19358                                const AArch64Subtarget *Subtarget,
19359 const AArch64TargetLowering &TLI) {
19360 SelectionDAG &DAG = DCI.DAG;
19361 EVT VT = N->getValueType(0);
19362
19363 if (SDValue R = performANDORCSELCombine(N, DAG))
19364 return R;
19365
19366 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19367 return SDValue();
19368
19369 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
19370 return Res;
19371
19372 return SDValue();
19373}
19374
19375static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
19376  if (!MemVT.getVectorElementType().isSimple())
19377 return false;
19378
19379 uint64_t MaskForTy = 0ull;
19380 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
19381 case MVT::i8:
19382 MaskForTy = 0xffull;
19383 break;
19384 case MVT::i16:
19385 MaskForTy = 0xffffull;
19386 break;
19387 case MVT::i32:
19388 MaskForTy = 0xffffffffull;
19389 break;
19390 default:
19391 return false;
19392 break;
19393 }
19394
19395 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
19396 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
19397 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
19398
19399 return false;
19400}
19401
19402static SDValue performReinterpretCastCombine(SDNode *N) {
19403  SDValue LeafOp = SDValue(N, 0);
19404 SDValue Op = N->getOperand(0);
19405 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
19406 LeafOp.getValueType() != Op.getValueType())
19407 Op = Op->getOperand(0);
19408 if (LeafOp.getValueType() == Op.getValueType())
19409 return Op;
19410 return SDValue();
19411}
19412
19413static SDValue performSVEAndCombine(SDNode *N,
19414                                    TargetLowering::DAGCombinerInfo &DCI) {
19415  SelectionDAG &DAG = DCI.DAG;
19416 SDValue Src = N->getOperand(0);
19417 unsigned Opc = Src->getOpcode();
19418
19419 // Zero/any extend of an unsigned unpack
19420 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
19421 SDValue UnpkOp = Src->getOperand(0);
19422 SDValue Dup = N->getOperand(1);
19423
19424 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
19425 return SDValue();
19426
19427 SDLoc DL(N);
19428 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
19429 if (!C)
19430 return SDValue();
19431
19432 uint64_t ExtVal = C->getZExtValue();
19433
19434 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
19435 return ((ExtVal == 0xFF && VT == MVT::i8) ||
19436 (ExtVal == 0xFFFF && VT == MVT::i16) ||
19437 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
19438 };
19439
19440 // If the mask is fully covered by the unpack, we don't need to push
19441 // a new AND onto the operand
19442 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
19443 if (MaskAndTypeMatch(EltTy))
19444 return Src;
19445
19446 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
19447 // to see if the mask is all-ones of size MemTy.
19448 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
19449 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
19450 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
19451 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
19452 if (MaskAndTypeMatch(EltTy))
19453 return Src;
19454 }
19455
19456    // Truncate to prevent a DUP with an overly wide constant
19457 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
19458
19459 // Otherwise, make sure we propagate the AND to the operand
19460 // of the unpack
19461 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
19462 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
19463
19464 SDValue And = DAG.getNode(ISD::AND, DL,
19465 UnpkOp->getValueType(0), UnpkOp, Dup);
19466
19467 return DAG.getNode(Opc, DL, N->getValueType(0), And);
19468 }
19469
19470 if (DCI.isBeforeLegalizeOps())
19471 return SDValue();
19472
19473  // If one side of the AND is an all-active i1 predicate, the AND is a
19474  // no-op and we can simply return the other operand.
19475 if (isAllActivePredicate(DAG, N->getOperand(0)))
19476 return N->getOperand(1);
19477 if (isAllActivePredicate(DAG, N->getOperand(1)))
19478 return N->getOperand(0);
19479
19480  if (!EnableCombineMGatherIntrinsics)
19481    return SDValue();
19482
19483 SDValue Mask = N->getOperand(1);
19484
19485 if (!Src.hasOneUse())
19486 return SDValue();
19487
19488 EVT MemVT;
19489
19490 // SVE load instructions perform an implicit zero-extend, which makes them
19491 // perfect candidates for combining.
19492 switch (Opc) {
19493  case AArch64ISD::LD1_MERGE_ZERO:
19494  case AArch64ISD::LDNF1_MERGE_ZERO:
19495  case AArch64ISD::LDFF1_MERGE_ZERO:
19496    MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
19497 break;
19498  case AArch64ISD::GLD1_MERGE_ZERO:
19499  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
19500  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
19501  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
19502  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
19503  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
19504  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
19505  case AArch64ISD::GLDFF1_MERGE_ZERO:
19506  case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
19507  case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
19508  case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
19509  case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
19510  case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
19511  case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
19512  case AArch64ISD::GLDNT1_MERGE_ZERO:
19513    MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
19514 break;
19515 default:
19516 return SDValue();
19517 }
19518
19519 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
19520 return Src;
19521
19522 return SDValue();
19523}
19524
19525// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
19526static SDValue performANDSETCCCombine(SDNode *N,
19527                                      TargetLowering::DAGCombinerInfo &DCI) {
19528
19529 // This function performs an optimization on a specific pattern involving
19530 // an AND operation and SETCC (Set Condition Code) node.
19531
19532 SDValue SetCC = N->getOperand(0);
19533 EVT VT = N->getValueType(0);
19534 SelectionDAG &DAG = DCI.DAG;
19535
19536  // If the current node (N) is used by any SELECT instruction, return an
19537  // empty SDValue and skip the optimization, which could otherwise produce
19538  // incorrect results.
19539 for (auto U : N->users())
19540 if (U->getOpcode() == ISD::SELECT)
19541 return SDValue();
19542
19543 // Check if the operand is a SETCC node with floating-point comparison
19544 if (SetCC.getOpcode() == ISD::SETCC &&
19545 SetCC.getOperand(0).getValueType() == MVT::f32) {
19546
19547 SDValue Cmp;
19548    AArch64CC::CondCode CC;
19549
19550 // Check if the DAG is after legalization and if we can emit the conjunction
19551 if (!DCI.isBeforeLegalize() &&
19552 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
19553
19554      AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(CC);
19555
19556 SDLoc DL(N);
19557 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
19558 DAG.getConstant(0, DL, VT),
19559 DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
19560 }
19561 }
19562 return SDValue();
19563}
19564
19565static SDValue performANDCombine(SDNode *N,
19566                                 TargetLowering::DAGCombinerInfo &DCI) {
19567  SelectionDAG &DAG = DCI.DAG;
19568 SDValue LHS = N->getOperand(0);
19569 SDValue RHS = N->getOperand(1);
19570 EVT VT = N->getValueType(0);
19571
19572 if (SDValue R = performANDORCSELCombine(N, DAG))
19573 return R;
19574
19575  if (SDValue R = performANDSETCCCombine(N, DCI))
19576 return R;
19577
19578 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19579 return SDValue();
19580
19581 if (VT.isScalableVector())
19582 return performSVEAndCombine(N, DCI);
19583
19584 // The combining code below works only for NEON vectors. In particular, it
19585 // does not work for SVE when dealing with vectors wider than 128 bits.
19586 if (!VT.is64BitVector() && !VT.is128BitVector())
19587 return SDValue();
19588
19589 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
19590 if (!BVN)
19591 return SDValue();
19592
19593 // AND does not accept an immediate, so check if we can use a BIC immediate
19594 // instruction instead. We do this here instead of using a (and x, (mvni imm))
19595 // pattern in isel, because some immediates may be lowered to the preferred
19596 // (and x, (movi imm)) form, even though an mvni representation also exists.
19597 APInt DefBits(VT.getSizeInBits(), 0);
19598 APInt UndefBits(VT.getSizeInBits(), 0);
19599 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
19600 SDValue NewOp;
19601
19602 // Any bits known to already be 0 need not be cleared again, which can help
19603 // reduce the size of the immediate to one supported by the instruction.
19604 KnownBits Known = DAG.computeKnownBits(LHS);
19605 APInt ZeroSplat(VT.getSizeInBits(), 0);
19606 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
19607 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
19608 << (Known.Zero.getBitWidth() * I);
19609
19610 DefBits = ~(DefBits | ZeroSplat);
19611 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
19612 DefBits, &LHS)) ||
19613 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
19614 DefBits, &LHS)))
19615 return NewOp;
19616
19617 UndefBits = ~(UndefBits | ZeroSplat);
19618 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
19619 UndefBits, &LHS)) ||
19620 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
19621 UndefBits, &LHS)))
19622 return NewOp;
19623 }
19624
19625 return SDValue();
19626}
19627
19628static SDValue performFADDCombine(SDNode *N,
19629                                  TargetLowering::DAGCombinerInfo &DCI) {
19630  SelectionDAG &DAG = DCI.DAG;
19631 SDValue LHS = N->getOperand(0);
19632 SDValue RHS = N->getOperand(1);
19633 EVT VT = N->getValueType(0);
19634 SDLoc DL(N);
19635
19636 if (!N->getFlags().hasAllowReassociation())
19637 return SDValue();
19638
19639  // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
19640 auto ReassocComplex = [&](SDValue A, SDValue B) {
19641 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
19642 return SDValue();
19643 unsigned Opc = A.getConstantOperandVal(0);
19644 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
19645 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
19646 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
19647 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
19648 return SDValue();
19649 SDValue VCMLA = DAG.getNode(
19650 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
19651 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
19652 A.getOperand(2), A.getOperand(3));
19653 VCMLA->setFlags(A->getFlags());
19654 return VCMLA;
19655 };
19656 if (SDValue R = ReassocComplex(LHS, RHS))
19657 return R;
19658 if (SDValue R = ReassocComplex(RHS, LHS))
19659 return R;
19660
19661 return SDValue();
19662}
19663
19664static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
19665 switch (Opcode) {
19666 case ISD::STRICT_FADD:
19667 case ISD::FADD:
19668 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
19669 case ISD::ADD:
19670 return VT == MVT::i64;
19671 default:
19672 return false;
19673 }
19674}
19675
19676static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
19677                        AArch64CC::CondCode Cond);
19678
19679static bool isPredicateCCSettingOp(SDValue N) {
19680 if ((N.getOpcode() == ISD::SETCC) ||
19681 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
19682 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
19683 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
19684 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
19685 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
19686 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
19687 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
19688 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
19689 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
19690 // get_active_lane_mask is lowered to a whilelo instruction.
19691 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
19692 return true;
19693
19694 return false;
19695}
19696
19697// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
19698// ... into: "ptrue p, all" + PTEST
19699static SDValue
19700performFirstTrueTestVectorCombine(SDNode *N,
19701                                  TargetLowering::DAGCombinerInfo &DCI,
19702                                  const AArch64Subtarget *Subtarget) {
19703 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19704 // Make sure PTEST can be legalised with illegal types.
19705 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
19706 return SDValue();
19707
19708 SDValue N0 = N->getOperand(0);
19709 EVT VT = N0.getValueType();
19710
19711 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
19712 !isNullConstant(N->getOperand(1)))
19713 return SDValue();
19714
19715  // Restrict the DAG combine to only cases where we're extracting from a
19716 // flag-setting operation.
19717 if (!isPredicateCCSettingOp(N0))
19718 return SDValue();
19719
19720 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
19721 SelectionDAG &DAG = DCI.DAG;
19722 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
19723 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
19724}
19725
19726// Materialize : Idx = (add (mul vscale, NumEls), -1)
19727// i1 = extract_vector_elt t37, Constant:i64<Idx>
19728// ... into: "ptrue p, all" + PTEST
19729static SDValue
19731performLastTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19732                                 const AArch64Subtarget *Subtarget) {
19733 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19734  // Make sure PTEST can be legalised with illegal types.
19735 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
19736 return SDValue();
19737
19738 SDValue N0 = N->getOperand(0);
19739 EVT OpVT = N0.getValueType();
19740
19741 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
19742 return SDValue();
19743
19744 // Idx == (add (mul vscale, NumEls), -1)
19745 SDValue Idx = N->getOperand(1);
19746 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
19747 return SDValue();
19748
19749 SDValue VS = Idx.getOperand(0);
19750 if (VS.getOpcode() != ISD::VSCALE)
19751 return SDValue();
19752
19753 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
19754 if (VS.getConstantOperandVal(0) != NumEls)
19755 return SDValue();
19756
19757 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
19758 SelectionDAG &DAG = DCI.DAG;
19759 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
19760 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
19761}
19762
19763static SDValue
19764performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19765                               const AArch64Subtarget *Subtarget) {
19766 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19767 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
19768 return Res;
19769 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
19770 return Res;
19771
19772 SelectionDAG &DAG = DCI.DAG;
19773 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
19774
19775 EVT VT = N->getValueType(0);
19776 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
19777 bool IsStrict = N0->isStrictFPOpcode();
19778
19779 // extract(dup x) -> x
19780 if (N0.getOpcode() == AArch64ISD::DUP)
19781 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
19782 : N0.getOperand(0);
19783
19784 // Rewrite for pairwise fadd pattern
19785 // (f32 (extract_vector_elt
19786 // (fadd (vXf32 Other)
19787 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
19788 // ->
19789 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
19790 // (extract_vector_elt (vXf32 Other) 1))
19791 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
19792 // we can only do this when it's used only by the extract_vector_elt.
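  // Editor's note (assumed example, not from the source): lane 0 of
  // fadd(x, shuffle(x, <1, ...>)) is x[0] + x[1], so extracting it is the
  // same as adding the two scalar extracts below, which the selector can
  // then match as a pairwise FADDP.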
19793 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
19794 (!IsStrict || N0.hasOneUse())) {
19795 SDLoc DL(N0);
19796 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
19797 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
19798
19799 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
19800 SDValue Other = N00;
19801
19802 // And handle the commutative case.
19803 if (!Shuffle) {
19804 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
19805 Other = N01;
19806 }
19807
19808 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
19809 Other == Shuffle->getOperand(0)) {
19810 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
19811 DAG.getConstant(0, DL, MVT::i64));
19812 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
19813 DAG.getConstant(1, DL, MVT::i64));
19814 if (!IsStrict)
19815 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
19816
19817 // For strict_fadd we need uses of the final extract_vector to be replaced
19818 // with the strict_fadd, but we also need uses of the chain output of the
19819 // original strict_fadd to use the chain output of the new strict_fadd as
19820 // otherwise it may not be deleted.
19821 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
19822 {VT, MVT::Other},
19823 {N0->getOperand(0), Extract1, Extract2});
19824 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
19825 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
19826 return SDValue(N, 0);
19827 }
19828 }
19829
19830 return SDValue();
19831}
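
// For illustration only (hypothetical example, not taken from the sources or
// tests): C code of the following shape typically yields the
// extract(fadd(v, shuffle(v, undef, <1,...>)), 0) pattern handled above, and
// after the rewrite the two scalar extracts can be selected as a single FADDP:
//
//   #include <arm_neon.h>
//   float add_first_two(float32x4_t v) {
//     float32x4_t rot = vextq_f32(v, v, 1);         // lanes <1,2,3,0>
//     return vgetq_lane_f32(vaddq_f32(v, rot), 0);  // lane 0 of (v + rot), i.e. v[0] + v[1]
//   }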
19832
19833static SDValue performConcatVectorsCombine(SDNode *N,
19834 TargetLowering::DAGCombinerInfo &DCI,
19835 SelectionDAG &DAG) {
19836 SDLoc dl(N);
19837 EVT VT = N->getValueType(0);
19838 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
19839 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
19840
19841 if (VT.isScalableVector())
19842 return SDValue();
19843
19844 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
19845 N1Opc == ISD::TRUNCATE) {
19846 SDValue N00 = N0->getOperand(0);
19847 SDValue N10 = N1->getOperand(0);
19848 EVT N00VT = N00.getValueType();
19849 unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
19850
19851 // Optimize concat_vectors of truncated vectors, where the intermediate
19852 // type is illegal, to avoid said illegality, e.g.,
19853 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
19854 // (v2i16 (truncate (v2i64)))))
19855 // ->
19856 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
19857 // (v4i32 (bitcast (v2i64))),
19858 // <0, 2, 4, 6>)))
19859 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
19860 // on both input and result type, so we might generate worse code.
19861 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
19862 if (N00VT == N10.getValueType() &&
19863 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
19864 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
19865 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
19866 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
19867 for (size_t i = 0; i < Mask.size(); ++i)
19868 Mask[i] = i * 2;
19869 return DAG.getNode(ISD::TRUNCATE, dl, VT,
19870 DAG.getVectorShuffle(
19871 MidVT, dl,
19872 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
19873 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
19874 }
19875
19876 // Optimize two large shifts and a concatenation into one concatenation and
19877 // For AArch64 architectures, sequences like the following:
19878 //
19879 // ushr v0.4s, v0.4s, #20
19880 // ushr v1.4s, v1.4s, #20
19881 // uzp1 v0.8h, v0.8h, v1.8h
19882 //
19883 // Can be optimized to:
19884 //
19885 // uzp2 v0.8h, v0.8h, v1.8h
19886 // ushr v0.8h, v0.8h, #4
19887 //
19888 // This optimization reduces instruction count.
19889 if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
19890 N00->getOperand(1) == N10->getOperand(1)) {
19891 SDValue N000 = N00->getOperand(0);
19892 SDValue N100 = N10->getOperand(0);
19893 uint64_t N001ConstVal = N00->getConstantOperandVal(1),
19894 N101ConstVal = N10->getConstantOperandVal(1),
19895 NScalarSize = N->getValueType(0).getScalarSizeInBits();
19896
19897 if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
19898 N000 = DAG.getNode(AArch64ISD::NVCAST, dl, VT, N000);
19899 N100 = DAG.getNode(AArch64ISD::NVCAST, dl, VT, N100);
19900 SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, dl, VT, N000, N100);
19901 SDValue NewShiftConstant =
19902 DAG.getConstant(N001ConstVal - NScalarSize, dl, MVT::i32);
19903
19904 return DAG.getNode(AArch64ISD::VLSHR, dl, VT, Uzp, NewShiftConstant);
19905 }
19906 }
19907 }
19908
19909 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
19910 N->getOperand(0).getValueType() == MVT::v2i16 ||
19911 N->getOperand(0).getValueType() == MVT::v2i8) {
19912 EVT SrcVT = N->getOperand(0).getValueType();
19913 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
19914 // loads to prevent having to go through the v4i8 load legalization that
19915 // needs to extend each element into a larger type.
19916 if (N->getNumOperands() % 2 == 0 &&
19917 all_of(N->op_values(), [SrcVT](SDValue V) {
19918 if (V.getValueType() != SrcVT)
19919 return false;
19920 if (V.isUndef())
19921 return true;
19922 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
19923 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
19924 LD->getExtensionType() == ISD::NON_EXTLOAD;
19925 })) {
19926 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
19927 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
19928 SmallVector<SDValue> Ops;
19929
19930 for (unsigned i = 0; i < N->getNumOperands(); i++) {
19931 SDValue V = N->getOperand(i);
19932 if (V.isUndef())
19933 Ops.push_back(DAG.getUNDEF(FVT));
19934 else {
19935 LoadSDNode *LD = cast<LoadSDNode>(V);
19936 SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
19937 LD->getBasePtr(), LD->getMemOperand());
19938 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
19939 Ops.push_back(NewLoad);
19940 }
19941 }
19942 return DAG.getBitcast(N->getValueType(0),
19943 DAG.getBuildVector(NVT, dl, Ops));
19944 }
19945 }
19946
19947 // Canonicalise concat_vectors to replace concatenations of truncated nots
19948 // with nots of concatenated truncates. This in some cases allows for multiple
19949 // redundant negations to be eliminated.
19950 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
19951 // (v4i16 (truncate (not (v4i32)))))
19952 // ->
19953 // (not (concat_vectors (v4i16 (truncate (v4i32))),
19954 // (v4i16 (truncate (v4i32)))))
19955 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
19956 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
19957 N->isOnlyUserOf(N1.getNode())) {
19958 auto isBitwiseVectorNegate = [](SDValue V) {
19959 return V->getOpcode() == ISD::XOR &&
19960 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
19961 };
19962 SDValue N00 = N0->getOperand(0);
19963 SDValue N10 = N1->getOperand(0);
19964 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
19965 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
19966 return DAG.getNOT(
19967 dl,
19968 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
19969 DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
19970 N00->getOperand(0)),
19971 DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
19972 N10->getOperand(0))),
19973 VT);
19974 }
19975 }
19976
19977 // Wait till after everything is legalized to try this. That way we have
19978 // legal vector types and such.
19979 if (DCI.isBeforeLegalizeOps())
19980 return SDValue();
19981
19982 // Optimise concat_vectors of two identical binops with a 128-bit destination
19983 // size into a binop of two concats of the source vectors, e.g.:
19984 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
19985 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
19986 DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() &&
19987 N1->hasOneUse()) {
19988 SDValue N00 = N0->getOperand(0);
19989 SDValue N01 = N0->getOperand(1);
19990 SDValue N10 = N1->getOperand(0);
19991 SDValue N11 = N1->getOperand(1);
19992
19993 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
19994 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N00, N10);
19995 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N01, N11);
19996 return DAG.getNode(N0Opc, dl, VT, Concat0, Concat1);
19997 }
19998 }
19999
20000 auto IsRSHRN = [](SDValue Shr) {
20001 if (Shr.getOpcode() != AArch64ISD::VLSHR)
20002 return false;
20003 SDValue Op = Shr.getOperand(0);
20004 EVT VT = Op.getValueType();
20005 unsigned ShtAmt = Shr.getConstantOperandVal(1);
20006 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
20007 return false;
20008
20009 APInt Imm;
20010 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
20011 Imm = APInt(VT.getScalarSizeInBits(),
20012 Op.getOperand(1).getConstantOperandVal(0)
20013 << Op.getOperand(1).getConstantOperandVal(1));
20014 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
20015 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
20016 Imm = APInt(VT.getScalarSizeInBits(),
20017 Op.getOperand(1).getConstantOperandVal(0));
20018 else
20019 return false;
20020
20021 if (Imm != 1ULL << (ShtAmt - 1))
20022 return false;
20023 return true;
20024 };
20025
20026 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
20027 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
20028 ((IsRSHRN(N1) &&
20029 N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
20030 N1.isUndef())) {
20031 SDValue X = N0.getOperand(0).getOperand(0);
20032 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
20033 : N1.getOperand(0).getOperand(0);
20034 EVT BVT =
20035 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
20036 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y);
20037 SDValue Add = DAG.getNode(
20038 ISD::ADD, dl, BVT, CC,
20039 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT));
20040 SDValue Shr =
20041 DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1));
20042 return Shr;
20043 }
20044
20045 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
20046 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
20047 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
20048 N0.getOperand(1) == N1.getOperand(1)) {
20049 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0),
20050 DAG.getUNDEF(N0.getValueType()));
20051 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1),
20052 DAG.getUNDEF(N0.getValueType()));
20053 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1);
20054 }
20055
20056 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
20057 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
20058 // canonicalise to that.
20059 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
20060 assert(VT.getScalarSizeInBits() == 64);
20061 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
20062 DAG.getConstant(0, dl, MVT::i64));
20063 }
20064
20065 // Canonicalise concat_vectors so that the right-hand vector has as few
20066 // bit-casts as possible before its real operation. The primary matching
20067 // destination for these operations will be the narrowing "2" instructions,
20068 // which depend on the operation being performed on this right-hand vector.
20069 // For example,
20070 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
20071 // becomes
20072 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
20073
20074 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
20075 return SDValue();
20076 SDValue RHS = N1->getOperand(0);
20077 MVT RHSTy = RHS.getValueType().getSimpleVT();
20078 // If the RHS is not a vector, this is not the pattern we're looking for.
20079 if (!RHSTy.isVector())
20080 return SDValue();
20081
20082 LLVM_DEBUG(
20083 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
20084
20085 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
20086 RHSTy.getVectorNumElements() * 2);
20087 return DAG.getNode(ISD::BITCAST, dl, VT,
20088 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
20089 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
20090 RHS));
20091}
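
// For illustration only (hypothetical example): a concat of two truncated
// right-shifts where the shift amount exceeds the narrow element width, as in
// the sketch below, is the shape the uzp2+shift rewrite above targets:
//
//   #include <arm_neon.h>
//   uint16x8_t narrow_high_bits(uint32x4_t a, uint32x4_t b) {
//     return vcombine_u16(vmovn_u32(vshrq_n_u32(a, 20)),
//                         vmovn_u32(vshrq_n_u32(b, 20)));
//   }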
20092
20093static SDValue
20094performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20095 SelectionDAG &DAG) {
20096 if (DCI.isBeforeLegalizeOps())
20097 return SDValue();
20098
20099 EVT VT = N->getValueType(0);
20100 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
20101 return SDValue();
20102
20103 SDValue V = N->getOperand(0);
20104
20105 // NOTE: This combine exists in DAGCombiner, but that version's legality check
20106 // blocks this combine because the non-const case requires custom lowering.
20107 //
20108 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
20109 if (V.getOpcode() == ISD::SPLAT_VECTOR)
20110 if (isa<ConstantSDNode>(V.getOperand(0)))
20111 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
20112
20113 return SDValue();
20114}
20115
20116static SDValue
20117performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20118 SelectionDAG &DAG) {
20119 SDLoc DL(N);
20120 SDValue Vec = N->getOperand(0);
20121 SDValue SubVec = N->getOperand(1);
20122 uint64_t IdxVal = N->getConstantOperandVal(2);
20123 EVT VecVT = Vec.getValueType();
20124 EVT SubVT = SubVec.getValueType();
20125
20126 // Only do this for legal fixed vector types.
20127 if (!VecVT.isFixedLengthVector() ||
20128 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
20129 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
20130 return SDValue();
20131
20132 // Ignore widening patterns.
20133 if (IdxVal == 0 && Vec.isUndef())
20134 return SDValue();
20135
20136 // Subvector must be half the width and an "aligned" insertion.
20137 unsigned NumSubElts = SubVT.getVectorNumElements();
20138 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
20139 (IdxVal != 0 && IdxVal != NumSubElts))
20140 return SDValue();
20141
20142 // Fold insert_subvector -> concat_vectors
20143 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
20144 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
20145 SDValue Lo, Hi;
20146 if (IdxVal == 0) {
20147 Lo = SubVec;
20148 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20149 DAG.getVectorIdxConstant(NumSubElts, DL));
20150 } else {
20151 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20152 DAG.getVectorIdxConstant(0, DL));
20153 Hi = SubVec;
20154 }
20155 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
20156}
20157
20158static SDValue tryCombineFixedPointConvert(SDNode *N,
20159 TargetLowering::DAGCombinerInfo &DCI,
20160 SelectionDAG &DAG) {
20161 // Wait until after everything is legalized to try this. That way we have
20162 // legal vector types and such.
20163 if (DCI.isBeforeLegalizeOps())
20164 return SDValue();
20165 // Transform a scalar conversion of a value from a lane extract into a
20166 // lane extract of a vector conversion. E.g., from foo1 to foo2:
20167 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
20168 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
20169 //
20170 // The second form interacts better with instruction selection and the
20171 // register allocator to avoid cross-class register copies that aren't
20172 // coalescable due to a lane reference.
20173
20174 // Check the operand and see if it originates from a lane extract.
20175 SDValue Op1 = N->getOperand(1);
20176 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
20177 return SDValue();
20178
20179 // Yep, no additional predication needed. Perform the transform.
20180 SDValue IID = N->getOperand(0);
20181 SDValue Shift = N->getOperand(2);
20182 SDValue Vec = Op1.getOperand(0);
20183 SDValue Lane = Op1.getOperand(1);
20184 EVT ResTy = N->getValueType(0);
20185 EVT VecResTy;
20186 SDLoc DL(N);
20187
20188 // The vector width should be 128 bits by the time we get here, even
20189 // if it started as 64 bits (the extract_vector handling will have
20190 // done so). Bail if it is not.
20191 if (Vec.getValueSizeInBits() != 128)
20192 return SDValue();
20193
20194 if (Vec.getValueType() == MVT::v4i32)
20195 VecResTy = MVT::v4f32;
20196 else if (Vec.getValueType() == MVT::v2i64)
20197 VecResTy = MVT::v2f64;
20198 else
20199 return SDValue();
20200
20201 SDValue Convert =
20202 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
20203 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
20204}
20205
20206// AArch64 high-vector "long" operations are formed by performing the non-high
20207// version on an extract_subvector of each operand which gets the high half:
20208//
20209// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
20210//
20211// However, there are cases which don't have an extract_high explicitly, but
20212// have another operation that can be made compatible with one for free. For
20213// example:
20214//
20215// (dupv64 scalar) --> (extract_high (dup128 scalar))
20216//
20217// This routine does the actual conversion of such DUPs, once outer routines
20218// have determined that everything else is in order.
20219// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
20220// similarly here.
20221static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
20222 MVT VT = N.getSimpleValueType();
20223 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
20224 N.getConstantOperandVal(1) == 0)
20225 N = N.getOperand(0);
20226
20227 switch (N.getOpcode()) {
20228 case AArch64ISD::DUP:
20229 case AArch64ISD::DUPLANE8:
20230 case AArch64ISD::DUPLANE16:
20231 case AArch64ISD::DUPLANE32:
20232 case AArch64ISD::DUPLANE64:
20233 case AArch64ISD::MOVI:
20234 case AArch64ISD::MOVIshift:
20235 case AArch64ISD::MOVIedit:
20236 case AArch64ISD::MOVImsl:
20237 case AArch64ISD::MVNIshift:
20238 case AArch64ISD::MVNImsl:
20239 break;
20240 default:
20241 // FMOV could be supported, but isn't very useful, as it would only occur
20242 // if you passed a bitcast floating-point immediate to an eligible long
20243 // integer op (addl, smull, ...).
20244 return SDValue();
20245 }
20246
20247 if (!VT.is64BitVector())
20248 return SDValue();
20249
20250 SDLoc DL(N);
20251 unsigned NumElems = VT.getVectorNumElements();
20252 if (N.getValueType().is64BitVector()) {
20253 MVT ElementTy = VT.getVectorElementType();
20254 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
20255 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
20256 }
20257
20258 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
20259 DAG.getConstant(NumElems, DL, MVT::i64));
20260}
20261
20262static bool isEssentiallyExtractHighSubvector(SDValue N) {
20263 if (N.getOpcode() == ISD::BITCAST)
20264 N = N.getOperand(0);
20265 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
20266 return false;
20267 if (N.getOperand(0).getValueType().isScalableVector())
20268 return false;
20269 return N.getConstantOperandAPInt(1) ==
20270 N.getOperand(0).getValueType().getVectorNumElements() / 2;
20271}
20272
20273/// Helper structure to keep track of ISD::SET_CC operands.
20274struct GenericSetCCInfo {
20275 const SDValue *Opnd0;
20276 const SDValue *Opnd1;
20277 ISD::CondCode CC;
20278};
20279
20280/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
20281struct AArch64SetCCInfo {
20282 const SDValue *Cmp;
20283 AArch64CC::CondCode CC;
20284};
20285
20286/// Helper structure to keep track of SetCC information.
20287union SetCCInfo {
20288 GenericSetCCInfo Generic;
20289 AArch64SetCCInfo AArch64;
20290};
20291
20292/// Helper structure to be able to read SetCC information. If the IsAArch64
20293/// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
20294/// GenericSetCCInfo.
20295struct SetCCInfoAndKind {
20296 SetCCInfo Info;
20297 bool IsAArch64;
20298};
20299
20300/// Check whether or not \p Op is a SET_CC operation, either a generic one or
20301/// an AArch64 lowered one.
20302///
20303/// \p SetCCInfo is filled accordingly.
20304/// \post SetCCInfo is meaningful only when this function returns true.
20305/// \return True when Op is a kind of SET_CC operation.
20306static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
20307 // If this is a setcc, this is straightforward.
20308 if (Op.getOpcode() == ISD::SETCC) {
20309 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
20310 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
20311 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
20312 SetCCInfo.IsAArch64 = false;
20313 return true;
20314 }
20315 // Otherwise, check if this is a matching csel instruction.
20316 // In other words:
20317 // - csel 1, 0, cc
20318 // - csel 0, 1, !cc
20319 if (Op.getOpcode() != AArch64ISD::CSEL)
20320 return false;
20321 // Set the information about the operands.
20322 // TODO: we want the operands of the Cmp not the csel
20323 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
20324 SetCCInfo.IsAArch64 = true;
20325 SetCCInfo.Info.AArch64.CC =
20326 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
20327
20328 // Check that the operands match the constraints:
20329 // (1) Both operands must be constants.
20330 // (2) One must be 1 and the other must be 0.
20331 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
20332 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20333
20334 // Check (1).
20335 if (!TValue || !FValue)
20336 return false;
20337
20338 // Check (2).
20339 if (!TValue->isOne()) {
20340 // Update the comparison when we are interested in !cc.
20341 std::swap(TValue, FValue);
20342 SetCCInfo.Info.AArch64.CC =
20343 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
20344 }
20345 return TValue->isOne() && FValue->isZero();
20346}
20347
20348// Returns true if Op is setcc or zext of setcc.
20349static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
20350 if (isSetCC(Op, Info))
20351 return true;
20352 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
20353 isSetCC(Op->getOperand(0), Info));
20354}
20355
20356// The folding we want to perform is:
20357// (add x, [zext] (setcc cc ...) )
20358// -->
20359// (csel x, (add x, 1), !cc ...)
20360//
20361// The latter will get matched to a CSINC instruction.
20362static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
20363 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
20364 SDValue LHS = Op->getOperand(0);
20365 SDValue RHS = Op->getOperand(1);
20366 SetCCInfoAndKind InfoAndKind;
20367
20368 // If both operands are a SET_CC, then we don't want to perform this
20369 // folding and create another csel as this results in more instructions
20370 // (and higher register usage).
20371 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
20372 isSetCCOrZExtSetCC(RHS, InfoAndKind))
20373 return SDValue();
20374
20375 // If neither operand is a SET_CC, give up.
20376 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
20377 std::swap(LHS, RHS);
20378 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
20379 return SDValue();
20380 }
20381
20382 // FIXME: This could be generalized to work for FP comparisons.
20383 EVT CmpVT = InfoAndKind.IsAArch64
20384 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
20385 : InfoAndKind.Info.Generic.Opnd0->getValueType();
20386 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
20387 return SDValue();
20388
20389 SDValue CCVal;
20390 SDValue Cmp;
20391 SDLoc dl(Op);
20392 if (InfoAndKind.IsAArch64) {
20393 CCVal = DAG.getConstant(
20394 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
20395 MVT::i32);
20396 Cmp = *InfoAndKind.Info.AArch64.Cmp;
20397 } else
20398 Cmp = getAArch64Cmp(
20399 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
20400 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
20401 dl);
20402
20403 EVT VT = Op->getValueType(0);
20404 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
20405 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
20406}
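
// For illustration only (hypothetical example): adding a comparison result to
// a value is the usual way the (add x, [zext] (setcc ...)) pattern above
// reaches the DAG; after the rewrite it selects as cmp + csinc rather than
// cset + add:
//
//   unsigned add_flag(unsigned x, unsigned a, unsigned b) {
//     return x + (a < b);
//   }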
20407
20408// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
20409static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
20410 EVT VT = N->getValueType(0);
20411 // Only scalar integer and vector types.
20412 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
20413 return SDValue();
20414
20415 SDValue LHS = N->getOperand(0);
20416 SDValue RHS = N->getOperand(1);
20417 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20418 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
20419 return SDValue();
20420
20421 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
20422 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
20423 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
20424 return SDValue();
20425
20426 SDValue Op1 = LHS->getOperand(0);
20427 SDValue Op2 = RHS->getOperand(0);
20428 EVT OpVT1 = Op1.getValueType();
20429 EVT OpVT2 = Op2.getValueType();
20430 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
20431 Op2.getOpcode() != AArch64ISD::UADDV ||
20432 OpVT1.getVectorElementType() != VT)
20433 return SDValue();
20434
20435 SDValue Val1 = Op1.getOperand(0);
20436 SDValue Val2 = Op2.getOperand(0);
20437 EVT ValVT = Val1->getValueType(0);
20438 SDLoc DL(N);
20439 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
20440 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
20441 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
20442 DAG.getConstant(0, DL, MVT::i64));
20443}
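
// For illustration only (hypothetical example): summing two vectors
// horizontally and then adding the scalars produces ADD(UADDV, UADDV), which
// the combine above turns into one vector ADD followed by a single UADDV
// (one addv instead of two):
//
//   #include <arm_neon.h>
//   uint32_t sum_of_both(uint32x4_t a, uint32x4_t b) {
//     return vaddvq_u32(a) + vaddvq_u32(b);
//   }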
20444
20445/// Perform the scalar expression combine in the form of:
20446/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
20447/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
20448static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
20449 EVT VT = N->getValueType(0);
20450 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
20451 return SDValue();
20452
20453 SDValue LHS = N->getOperand(0);
20454 SDValue RHS = N->getOperand(1);
20455
20456 // Handle commutativity.
20457 if (LHS.getOpcode() != AArch64ISD::CSEL &&
20458 LHS.getOpcode() != AArch64ISD::CSNEG) {
20459 std::swap(LHS, RHS);
20460 if (LHS.getOpcode() != AArch64ISD::CSEL &&
20461 LHS.getOpcode() != AArch64ISD::CSNEG) {
20462 return SDValue();
20463 }
20464 }
20465
20466 if (!LHS.hasOneUse())
20467 return SDValue();
20468
20469 AArch64CC::CondCode AArch64CC =
20470 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
20471
20472 // The CSEL should include a constant one operand, and the CSNEG should
20473 // include a one or a negative-one operand.
20474 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
20475 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
20476 if (!CTVal || !CFVal)
20477 return SDValue();
20478
20479 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
20480 (CTVal->isOne() || CFVal->isOne())) &&
20481 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
20482 (CTVal->isOne() || CFVal->isAllOnes())))
20483 return SDValue();
20484
20485 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
20486 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
20487 !CFVal->isOne()) {
20488 std::swap(CTVal, CFVal);
20489 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
20490 }
20491
20492 SDLoc DL(N);
20493 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
20494 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
20495 !CFVal->isAllOnes()) {
20496 APInt C = -1 * CFVal->getAPIntValue();
20497 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
20498 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
20499 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
20500 }
20501
20502 // It might be neutral for larger constants, as the immediate needs to be
20503 // materialized in a register.
20504 APInt ADDC = CTVal->getAPIntValue();
20505 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20506 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
20507 return SDValue();
20508
20509 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
20510 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
20511 "Unexpected constant value");
20512
20513 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
20514 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
20515 SDValue Cmp = LHS.getOperand(3);
20516
20517 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
20518}
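
// For illustration only (hypothetical example): an add of a select with a
// constant-one arm is where the CSEL(c, 1, cc) + b pattern above usually
// comes from; after the rewrite it selects as add + csinc with no separate
// materialization of the select result:
//
//   int add_select(int a, int b) {
//     return (a > 0 ? 5 : 1) + b;
//   }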
20519
20520// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
20521static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
20522 EVT VT = N->getValueType(0);
20523 if (N->getOpcode() != ISD::ADD)
20524 return SDValue();
20525
20526 SDValue Dot = N->getOperand(0);
20527 SDValue A = N->getOperand(1);
20528 // Handle commutativity.
20529 auto isZeroDot = [](SDValue Dot) {
20530 return (Dot.getOpcode() == AArch64ISD::UDOT ||
20531 Dot.getOpcode() == AArch64ISD::SDOT) &&
20532 isZerosVector(Dot.getOperand(0).getNode());
20533 };
20534 if (!isZeroDot(Dot))
20535 std::swap(Dot, A);
20536 if (!isZeroDot(Dot))
20537 return SDValue();
20538
20539 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
20540 Dot.getOperand(2));
20541}
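
// For illustration only (hypothetical example, requires the dotprod
// extension): accumulating a dot product that starts from a zero vector
// matches ADD(UDOT(zero, x, y), A) above, so the outer add folds into the
// UDOT accumulator operand:
//
//   #include <arm_neon.h>
//   uint32x4_t dot_accumulate(uint32x4_t acc, uint8x16_t x, uint8x16_t y) {
//     return vaddq_u32(acc, vdotq_u32(vdupq_n_u32(0), x, y));
//   }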
20542
20543static bool isNegatedInteger(SDValue Op) {
20544 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
20545}
20546
20547static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
20548 SDLoc DL(Op);
20549 EVT VT = Op.getValueType();
20550 SDValue Zero = DAG.getConstant(0, DL, VT);
20551 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
20552}
20553
20554// Try to fold
20555//
20556// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
20557//
20558// The folding helps csel to be matched with csneg without generating
20559// redundant neg instruction, which includes negation of the csel expansion
20560// of abs node lowered by lowerABS.
20561static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
20562 if (!isNegatedInteger(SDValue(N, 0)))
20563 return SDValue();
20564
20565 SDValue CSel = N->getOperand(1);
20566 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
20567 return SDValue();
20568
20569 SDValue N0 = CSel.getOperand(0);
20570 SDValue N1 = CSel.getOperand(1);
20571
20572 // If neither of them is a negation, the folding isn't worthwhile, as it
20573 // would introduce two additional negations while removing only one.
20574 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
20575 return SDValue();
20576
20577 SDValue N0N = getNegatedInteger(N0, DAG);
20578 SDValue N1N = getNegatedInteger(N1, DAG);
20579
20580 SDLoc DL(N);
20581 EVT VT = CSel.getValueType();
20582 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
20583 CSel.getOperand(3));
20584}
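
// For illustration only (hypothetical example): negating an absolute value is
// one place the (neg (csel X, Y)) pattern above shows up, since abs is lowered
// via a csel; after the fold the result selects as cmp + csneg with no extra
// neg instruction:
//
//   int negated_abs(int x) {
//     return -(x < 0 ? -x : x);
//   }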
20585
20586// The basic add/sub long vector instructions have variants with "2" on the end
20587// which act on the high-half of their inputs. They are normally matched by
20588// patterns like:
20589//
20590// (add (zeroext (extract_high LHS)),
20591// (zeroext (extract_high RHS)))
20592// -> uaddl2 vD, vN, vM
20593//
20594// However, if one of the extracts is something like a duplicate, this
20595// instruction can still be used profitably. This function puts the DAG into a
20596// more appropriate form for those patterns to trigger.
20597static SDValue performAddSubLongCombine(SDNode *N,
20598 TargetLowering::DAGCombinerInfo &DCI) {
20599 SelectionDAG &DAG = DCI.DAG;
20600 if (DCI.isBeforeLegalizeOps())
20601 return SDValue();
20602
20603 MVT VT = N->getSimpleValueType(0);
20604 if (!VT.is128BitVector()) {
20605 if (N->getOpcode() == ISD::ADD)
20606 return performSetccAddFolding(N, DAG);
20607 return SDValue();
20608 }
20609
20610 // Make sure both branches are extended in the same way.
20611 SDValue LHS = N->getOperand(0);
20612 SDValue RHS = N->getOperand(1);
20613 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
20614 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
20615 LHS.getOpcode() != RHS.getOpcode())
20616 return SDValue();
20617
20618 unsigned ExtType = LHS.getOpcode();
20619
20620 // It's not worth doing if at least one of the inputs isn't already an
20621 // extract, but we don't know which it'll be so we have to try both.
20622 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
20623 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
20624 if (!RHS.getNode())
20625 return SDValue();
20626
20627 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
20628 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
20629 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
20630 if (!LHS.getNode())
20631 return SDValue();
20632
20633 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
20634 }
20635
20636 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
20637}
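
// For illustration only (hypothetical example): adding the high half of a
// vector to a duplicated scalar is a case where only one operand is an
// explicit extract_high; rewriting the 64-bit DUP as extract_high of a 128-bit
// DUP lets the whole expression select as a single UADDL2:
//
//   #include <arm_neon.h>
//   uint32x4_t addl2_with_dup(uint16x8_t a, uint16_t b) {
//     return vaddl_u16(vget_high_u16(a), vdup_n_u16(b));
//   }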
20638
20639static bool isCMP(SDValue Op) {
20640 return Op.getOpcode() == AArch64ISD::SUBS &&
20641 !Op.getNode()->hasAnyUseOfValue(0);
20642}
20643
20644// (CSEL 1 0 CC Cond) => CC
20645// (CSEL 0 1 CC Cond) => !CC
20646static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
20647 if (Op.getOpcode() != AArch64ISD::CSEL)
20648 return std::nullopt;
20649 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
20650 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
20651 return std::nullopt;
20652 SDValue OpLHS = Op.getOperand(0);
20653 SDValue OpRHS = Op.getOperand(1);
20654 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
20655 return CC;
20656 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
20657 return getInvertedCondCode(CC);
20658
20659 return std::nullopt;
20660}
20661
20662// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
20663// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
20664static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
20665 SDValue CmpOp = Op->getOperand(2);
20666 if (!isCMP(CmpOp))
20667 return SDValue();
20668
20669 if (IsAdd) {
20670 if (!isOneConstant(CmpOp.getOperand(1)))
20671 return SDValue();
20672 } else {
20673 if (!isNullConstant(CmpOp.getOperand(0)))
20674 return SDValue();
20675 }
20676
20677 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
20678 auto CC = getCSETCondCode(CsetOp);
20679 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
20680 return SDValue();
20681
20682 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
20683 Op->getOperand(0), Op->getOperand(1),
20684 CsetOp.getOperand(3));
20685}
20686
20687// (ADC x 0 cond) => (CINC x HS cond)
20688static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
20689 SDValue LHS = N->getOperand(0);
20690 SDValue RHS = N->getOperand(1);
20691 SDValue Cond = N->getOperand(2);
20692
20693 if (!isNullConstant(RHS))
20694 return SDValue();
20695
20696 EVT VT = N->getValueType(0);
20697 SDLoc DL(N);
20698
20699 // (CINC x cc cond) <=> (CSINC x x !cc cond)
20700 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
20701 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
20702}
20703
20704static SDValue performBuildVectorCombine(SDNode *N,
20705 TargetLowering::DAGCombinerInfo &DCI,
20706 SelectionDAG &DAG) {
20707 SDLoc DL(N);
20708 EVT VT = N->getValueType(0);
20709
20711 (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
20712 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
20713 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
20714 if (Elt0->getOpcode() == ISD::FP_ROUND &&
20715 Elt1->getOpcode() == ISD::FP_ROUND &&
20716 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
20717 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
20718 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
20719 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20720 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20721 // Constant index.
20722 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
20723 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
20724 Elt0->getOperand(0)->getOperand(0) ==
20725 Elt1->getOperand(0)->getOperand(0) &&
20726 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
20727 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
20728 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
20729 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
20730 SDValue HighLanes;
20731 if (Elt2->getOpcode() == ISD::UNDEF &&
20732 Elt3->getOpcode() == ISD::UNDEF) {
20733 HighLanes = DAG.getUNDEF(MVT::v2f32);
20734 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
20735 Elt3->getOpcode() == ISD::FP_ROUND &&
20736 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
20737 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
20738 Elt2->getConstantOperandVal(1) ==
20739 Elt3->getConstantOperandVal(1) &&
20740 Elt2->getOperand(0)->getOpcode() ==
20741 ISD::EXTRACT_VECTOR_ELT &&
20742 Elt3->getOperand(0)->getOpcode() ==
20743 ISD::EXTRACT_VECTOR_ELT &&
20744 // Constant index.
20745 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
20746 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
20747 Elt2->getOperand(0)->getOperand(0) ==
20748 Elt3->getOperand(0)->getOperand(0) &&
20749 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
20750 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
20751 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
20752 HighLanes =
20753 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
20754 }
20755 if (HighLanes) {
20756 SDValue DoubleToSingleSticky =
20757 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
20758 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
20759 DoubleToSingleSticky, HighLanes);
20760 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
20761 Elt0->getOperand(1));
20762 }
20763 }
20764 }
20765 }
20766
20767 if (VT == MVT::v2f64) {
20768 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
20769 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
20770 Elt1->getOpcode() == ISD::FP_EXTEND &&
20771 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20772 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20773 Elt0->getOperand(0)->getOperand(0) ==
20774 Elt1->getOperand(0)->getOperand(0) &&
20775 // Constant index.
20776 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
20777 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
20778 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
20779 Elt1->getOperand(0)->getConstantOperandVal(1) &&
20780 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
20781 // ResultType's known minimum vector length.
20782 Elt0->getOperand(0)->getConstantOperandVal(1) %
20783 VT.getVectorMinNumElements() ==
20784 0) {
20785 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
20786 if (SrcVec.getValueType() == MVT::v4f16 ||
20787 SrcVec.getValueType() == MVT::v4bf16) {
20788 SDValue HalfToSingle =
20789 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
20790 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
20791 SDValue Extract = DAG.getNode(
20793 HalfToSingle, SubvectorIdx);
20794 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
20795 }
20796 }
20797 }
20798
20799 // A build vector of two extracted elements is equivalent to an
20800 // extract subvector where the inner vector is any-extended to the
20801 // extract_vector_elt VT.
20802 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
20803 // (extract_elt_iXX_to_i32 vec Idx+1))
20804 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
20805
20806 // For now, only consider the v2i32 case, which arises as a result of
20807 // legalization.
20808 if (VT != MVT::v2i32)
20809 return SDValue();
20810
20811 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
20812 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
20813 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20814 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20815 // Constant index.
20816 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
20817 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
20818 // Both EXTRACT_VECTOR_ELT from same vector...
20819 Elt0->getOperand(0) == Elt1->getOperand(0) &&
20820 // ... and contiguous. First element's index +1 == second element's index.
20821 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
20822 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
20823 // ResultType's known minimum vector length.
20824 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
20825 SDValue VecToExtend = Elt0->getOperand(0);
20826 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
20827 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
20828 return SDValue();
20829
20830 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
20831
20832 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
20833 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
20834 SubvectorIdx);
20835 }
20836
20837 return SDValue();
20838}
20839
20842 SDLoc DL(N);
20843 EVT VT = N->getValueType(0);
20844 SDValue N0 = N->getOperand(0);
20845 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
20846 N0.getOpcode() == AArch64ISD::DUP) {
20847 SDValue Op = N0.getOperand(0);
20848 if (VT.getScalarType() == MVT::i32 &&
20849 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
20850 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
20851 return DAG.getNode(N0.getOpcode(), DL, VT, Op);
20852 }
20853
20854 // Performing the following combine produces a preferable form for ISEL.
20855 // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2))
20857 N0.hasOneUse()) {
20858 SDValue Op = N0.getOperand(0);
20859 SDValue ExtractIndexNode = N0.getOperand(1);
20860 if (!isa<ConstantSDNode>(ExtractIndexNode))
20861 return SDValue();
20862
20863 // For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
20864 // So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
20865 assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
20866 "Unexpected legalisation result!");
20867
20868 EVT SrcVectorType = Op.getValueType();
20869 // We also assume that SrcVectorType cannot be a V64 (see
20870 // LowerEXTRACT_VECTOR_ELT).
20871 assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
20872 "Unexpected legalisation result!");
20873
20874 unsigned ExtractIndex =
20875 cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
20876 MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
20877
20878 Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
20879 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
20880 DAG.getVectorIdxConstant(ExtractIndex * 2, DL));
20881 }
20882
20883 return SDValue();
20884}
20885
20886// Check whether a node is an extend or shift operand.
20887static bool isExtendOrShiftOperand(SDValue N) {
20888 unsigned Opcode = N.getOpcode();
20889 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
20890 EVT SrcVT;
20891 if (Opcode == ISD::SIGN_EXTEND_INREG)
20892 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
20893 else
20894 SrcVT = N.getOperand(0).getValueType();
20895
20896 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
20897 } else if (Opcode == ISD::AND) {
20898 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
20899 if (!CSD)
20900 return false;
20901 uint64_t AndMask = CSD->getZExtValue();
20902 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
20903 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
20904 return isa<ConstantSDNode>(N.getOperand(1));
20905 }
20906
20907 return false;
20908}
20909
20910// (N - Y) + Z --> (Z - Y) + N
20911// when N is an extend or shift operand
20912static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
20913 SelectionDAG &DAG) {
20914 auto IsOneUseExtend = [](SDValue N) {
20915 return N.hasOneUse() && isExtendOrShiftOperand(N);
20916 };
20917
20918 // DAGCombiner will revert the combination when Z is constant, causing an
20919 // infinite loop, so don't enable the combination when Z is constant.
20920 // If Z is a one-use shift by a constant, we also can't do the optimization;
20921 // it would fall into the same infinite loop.
20922 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
20923 return SDValue();
20924
20925 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
20926 return SDValue();
20927
20928 SDValue Shift = SUB.getOperand(0);
20929 if (!IsOneUseExtend(Shift))
20930 return SDValue();
20931
20932 SDLoc DL(N);
20933 EVT VT = N->getValueType(0);
20934
20935 SDValue Y = SUB.getOperand(1);
20936 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
20937 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
20938}
20939
20940static SDValue performAddCombineForShiftedOperands(SDNode *N,
20941 SelectionDAG &DAG) {
20942 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
20943 // commutative.
20944 if (N->getOpcode() != ISD::ADD)
20945 return SDValue();
20946
20947 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
20948 // shifted register is only available for i32 and i64.
20949 EVT VT = N->getValueType(0);
20950 if (VT != MVT::i32 && VT != MVT::i64)
20951 return SDValue();
20952
20953 SDLoc DL(N);
20954 SDValue LHS = N->getOperand(0);
20955 SDValue RHS = N->getOperand(1);
20956
20957 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
20958 return Val;
20959 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
20960 return Val;
20961
20962 uint64_t LHSImm = 0, RHSImm = 0;
20963 // If both operands are shifted by an immediate and the shift amount is not
20964 // greater than 4 for one operand, swap LHS and RHS to put the operand with
20965 // the smaller shift amount on the RHS.
20966 //
20967 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
20968 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
20969 // with LSL (shift > 4). For the rest of the processors, this is a no-op for
20970 // performance or correctness.
20971 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
20972 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
20973 RHSImm > 4 && LHS.hasOneUse())
20974 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
20975
20976 return SDValue();
20977}
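
// For illustration only (hypothetical example): with both operands shifted
// left, the rewrite above keeps the small (<= 4) shift on the RHS so it can
// use the cheaper "ADD (shifted register)" form on cores where larger LSL
// amounts are more expensive:
//
//   long add_two_shifted(long a, long b) {
//     return (a << 2) + (b << 7);
//   }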
20978
20979// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
20980// This reassociates it back to allow the creation of more mls instructions.
20981static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
20982 if (N->getOpcode() != ISD::SUB)
20983 return SDValue();
20984
20985 SDValue Add = N->getOperand(1);
20986 SDValue X = N->getOperand(0);
20987 if (Add.getOpcode() != ISD::ADD)
20988 return SDValue();
20989
20990 if (!Add.hasOneUse())
20991 return SDValue();
20993 return SDValue();
20994
20995 SDValue M1 = Add.getOperand(0);
20996 SDValue M2 = Add.getOperand(1);
20997 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
20998 M1.getOpcode() != AArch64ISD::UMULL)
20999 return SDValue();
21000 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
21001 M2.getOpcode() != AArch64ISD::UMULL)
21002 return SDValue();
21003
21004 EVT VT = N->getValueType(0);
21005 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
21006 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
21007}
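
// For illustration only (hypothetical example): code of the shape below,
// whether written directly or produced by the mid end's reassociation, is
// rewritten back to (x - a*b) - c*d so both multiplies can be selected as
// MLS (multiply-subtract) instructions:
//
//   #include <arm_neon.h>
//   int32x4_t two_mls(int32x4_t x, int32x4_t a, int32x4_t b,
//                     int32x4_t c, int32x4_t d) {
//     return vsubq_s32(x, vaddq_s32(vmulq_s32(a, b), vmulq_s32(c, d)));
//   }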
21008
21009// Combine into mla/mls.
21010// This works on the patterns of:
21011// add v1, (mul v2, v3)
21012// sub v1, (mul v2, v3)
21013// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
21014// It will transform the add/sub to a scalable version, so that we can
21015// make use of SVE's MLA/MLS that will be generated for that pattern
21016static SDValue
21017performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
21018 SelectionDAG &DAG = DCI.DAG;
21019 // Make sure that the types are legal
21020 if (!DCI.isAfterLegalizeDAG())
21021 return SDValue();
21022 // Before using SVE's features, check first if it's available.
21023 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
21024 return SDValue();
21025
21026 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
21027 return SDValue();
21028
21029 if (!N->getValueType(0).isFixedLengthVector())
21030 return SDValue();
21031
21032 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
21033 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
21034 return SDValue();
21035
21036 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
21037 return SDValue();
21038
21039 SDValue MulValue = Op1->getOperand(0);
21040 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
21041 return SDValue();
21042
21043 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
21044 return SDValue();
21045
21046 EVT ScalableVT = MulValue.getValueType();
21047 if (!ScalableVT.isScalableVector())
21048 return SDValue();
21049
21050 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
21051 SDValue NewValue =
21052 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
21053 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
21054 };
21055
21056 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
21057 return res;
21058 else if (N->getOpcode() == ISD::ADD)
21059 return performOpt(N->getOperand(1), N->getOperand(0));
21060
21061 return SDValue();
21062}
21063
21064// Given an i64 add from a v1i64 extract, convert to a NEON v1i64 add. This can
21065// help, for example, to produce ssra from sshr+add.
21066static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
21067 EVT VT = N->getValueType(0);
21068 if (VT != MVT::i64 ||
21069 DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64))
21070 return SDValue();
21071 SDValue Op0 = N->getOperand(0);
21072 SDValue Op1 = N->getOperand(1);
21073
21074 // At least one of the operands should be an extract, and the other should be
21075 // something that is easy to convert to v1i64 type (in this case a load).
21076 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21077 Op0.getOpcode() != ISD::LOAD)
21078 return SDValue();
21079 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21080 Op1.getOpcode() != ISD::LOAD)
21081 return SDValue();
21082
21083 SDLoc DL(N);
21084 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21085 Op0.getOperand(0).getValueType() == MVT::v1i64) {
21086 Op0 = Op0.getOperand(0);
21087 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
21088 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21089 Op1.getOperand(0).getValueType() == MVT::v1i64) {
21090 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
21091 Op1 = Op1.getOperand(0);
21092 } else
21093 return SDValue();
21094
21095 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
21096 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
21097 DAG.getConstant(0, DL, MVT::i64));
21098}
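
// For illustration only (hypothetical example): an i64 add where one operand
// is extracted from a v1i64 value and the other is a plain load can be done
// as a v1i64 vector add, which in turn lets shift+add pairs select as
// (u/s)sra:
//
//   #include <arm_neon.h>
//   int64_t shifted_accumulate(int64x1_t v, const int64_t *p) {
//     return vget_lane_s64(vshr_n_s64(v, 3), 0) + *p;
//   }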
21099
21100static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
21101 SDValue BV = peekThroughOneUseBitcasts(B);
21102 if (!BV->hasOneUse())
21103 return false;
21104 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
21105 if (!Ld || !Ld->isSimple())
21106 return false;
21107 Loads.push_back(Ld);
21108 return true;
21109 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
21111 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
21112 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
21113 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
21114 return false;
21115 Loads.push_back(Ld);
21116 }
21117 return true;
21118 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
21119 // Try to find a tree of shuffles and concats from how IR shuffles of loads
21120 // are lowered. Note that this only comes up because we do not always visit
21121 // operands before uses. After that is fixed this can be removed and in the
21122 // meantime this is fairly specific to the lowering we expect from IR.
21123 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
21124 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
21125 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
21126 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
21127 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
21128 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
21129 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
21130 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
21131 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
21132 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
21133 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
21134 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
21135 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
21136 B.getOperand(1).getNumOperands() != 4)
21137 return false;
21138 auto SV1 = cast<ShuffleVectorSDNode>(B);
21139 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
21140 int NumElts = B.getValueType().getVectorNumElements();
21141 int NumSubElts = NumElts / 4;
21142 for (int I = 0; I < NumSubElts; I++) {
21143 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
21144 if (SV1->getMaskElt(I) != I ||
21145 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
21146 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
21147 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
21148 return false;
21149 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
21150 if (SV2->getMaskElt(I) != I ||
21151 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
21152 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
21153 return false;
21154 }
21155 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
21156 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
21157 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
21158 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
21159 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
21160 !Ld2->isSimple() || !Ld3->isSimple())
21161 return false;
21162 Loads.push_back(Ld0);
21163 Loads.push_back(Ld1);
21164 Loads.push_back(Ld2);
21165 Loads.push_back(Ld3);
21166 return true;
21167 }
21168 return false;
21169}
21170
21171static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
21172 SelectionDAG &DAG,
21173 unsigned &NumSubLoads) {
21174 if (!Op0.hasOneUse() || !Op1.hasOneUse())
21175 return false;
21176
21177 SmallVector<LoadSDNode *> Loads0, Loads1;
21178 if (isLoadOrMultipleLoads(Op0, Loads0) &&
21179 isLoadOrMultipleLoads(Op1, Loads1)) {
21180 if (NumSubLoads && Loads0.size() != NumSubLoads)
21181 return false;
21182 NumSubLoads = Loads0.size();
21183 return Loads0.size() == Loads1.size() &&
21184 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
21185 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
21186 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
21187 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
21188 Size / 8, 1);
21189 });
21190 }
21191
21192 if (Op0.getOpcode() != Op1.getOpcode())
21193 return false;
21194
21195 switch (Op0.getOpcode()) {
21196 case ISD::ADD:
21197 case ISD::SUB:
21198 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
21199 DAG, NumSubLoads) &&
21200 areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
21201 DAG, NumSubLoads);
21202 case ISD::SIGN_EXTEND:
21203 case ISD::ANY_EXTEND:
21204 case ISD::ZERO_EXTEND:
21205 EVT XVT = Op0.getOperand(0).getValueType();
21206 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
21207 XVT.getScalarSizeInBits() != 32)
21208 return false;
21209 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
21210 DAG, NumSubLoads);
21211 }
21212 return false;
21213}
21214
21215// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
21216// into a single load of twice the size, from which we extract the bottom and
21217// top parts so that the shl can use a shll2 instruction. The two loads in that
21218// example can also be larger trees of instructions, which are identical except
21219// for the leaves which are all loads offset from the LHS, including
21220// buildvectors of multiple loads. For example the RHS tree could be
21221// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))
21222// Whilst it can be common for the larger loads to replace LDP instructions
21223// (which doesn't gain anything on its own), the larger loads can help create
21224// more efficient code, and in buildvectors prevent the need for ld1 lane
21225// inserts which can be slower than normal loads.
21226static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
21227 EVT VT = N->getValueType(0);
21228 if (!VT.isFixedLengthVector() ||
21229 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
21230 VT.getScalarSizeInBits() != 64))
21231 return SDValue();
21232
21233 SDValue Other = N->getOperand(0);
21234 SDValue Shift = N->getOperand(1);
21235 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
21236 std::swap(Shift, Other);
21237 APInt ShiftAmt;
21238 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
21239 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
21240 return SDValue();
21241
21242 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
21243 !ISD::isExtOpcode(Other.getOpcode()) ||
21244 Shift.getOperand(0).getOperand(0).getValueType() !=
21245 Other.getOperand(0).getValueType() ||
21246 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
21247 return SDValue();
21248
21249 SDValue Op0 = Other.getOperand(0);
21250 SDValue Op1 = Shift.getOperand(0).getOperand(0);
21251
21252 unsigned NumSubLoads = 0;
21253 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
21254 return SDValue();
21255
21256 // Attempt to rule out some unprofitable cases using heuristics (some working
21257 // around suboptimal code generation), notably if the extend would not be able
21258 // to use ushll2 instructions as the types are not large enough. Otherwise zips
21259 // will need to be created, which can increase the instruction count.
21260 unsigned NumElts = Op0.getValueType().getVectorNumElements();
21261 unsigned NumSubElts = NumElts / NumSubLoads;
21262 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
21263 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
21264 Op0.getValueType().getSizeInBits() < 128 &&
21266 return SDValue();
21267
21268 // Recreate the tree with the new combined loads.
21269 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
21270 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
21271 EVT DVT =
21272 Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());
21273
21274 SmallVector<LoadSDNode *> Loads0, Loads1;
21275 if (isLoadOrMultipleLoads(Op0, Loads0) &&
21276 isLoadOrMultipleLoads(Op1, Loads1)) {
21277 EVT LoadVT = EVT::getVectorVT(
21278 *DAG.getContext(), Op0.getValueType().getScalarType(),
21279 Op0.getValueType().getVectorNumElements() / Loads0.size());
21280 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
21281
21282 SmallVector<SDValue> NewLoads;
21283 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
21284 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
21285 L0->getBasePtr(), L0->getPointerInfo(),
21286 L0->getOriginalAlign());
21287 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
21288 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
21289 NewLoads.push_back(Load);
21290 }
21291 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
21292 }
21293
21294 SmallVector<SDValue> Ops;
21295 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
21296 Ops.push_back(GenCombinedTree(O0, O1, DAG));
21297 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
21298 };
21299 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
21300
21301 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
21302 int Hi = NumSubElts, Lo = 0;
21303 for (unsigned i = 0; i < NumSubLoads; i++) {
21304 for (unsigned j = 0; j < NumSubElts; j++) {
21305 LowMask[i * NumSubElts + j] = Lo++;
21306 HighMask[i * NumSubElts + j] = Hi++;
21307 }
21308 Lo += NumSubElts;
21309 Hi += NumSubElts;
21310 }
21311 SDLoc DL(N);
21312 SDValue Ext0, Ext1;
21313 // Extract the top and bottom lanes, then extend the result. Alternatively,
21314 // extend the result and then extract the lanes if the two operands match, as
21315 // that produces slightly smaller code.
21316 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
21317 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
21318 NewOp, DAG.getConstant(0, DL, MVT::i64));
21319 SDValue SubH =
21320 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
21321 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
21322 SDValue Extr0 =
21323 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
21324 SDValue Extr1 =
21325 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
21326 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
21327 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
21328 } else {
21329 EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
21330 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
21331 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
21332 DAG.getConstant(0, DL, MVT::i64));
21333 SDValue SubH =
21334 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
21335 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
21336 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
21337 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
21338 }
21339 SDValue NShift =
21340 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
21341 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
21342}
21343
21344static SDValue performAddSubCombine(SDNode *N,
21345 TargetLowering::DAGCombinerInfo &DCI) {
21346 // Try to change sum of two reductions.
21347 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
21348 return Val;
21349 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
21350 return Val;
21351 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
21352 return Val;
21353 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
21354 return Val;
21355 if (SDValue Val = performVectorExtCombine(N, DCI.DAG))
21356 return Val;
21357 if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
21358 return Val;
21359 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
21360 return Val;
21361 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
21362 return Val;
21363 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
21364 return Val;
21365
21366 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
21367 return Val;
21368
21369 return performAddSubLongCombine(N, DCI);
21370}
21371
21372// Massage DAGs which we can use the high-half "long" operations on into
21373// something isel will recognize better. E.g.
21374//
21375// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
21376// (aarch64_neon_umull (extract_high (v2i64 vec)))
21377// (extract_high (v2i64 (dup128 scalar)))))
21378//
21379static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
21380 TargetLowering::DAGCombinerInfo &DCI,
21381 SelectionDAG &DAG) {
21382 if (DCI.isBeforeLegalizeOps())
21383 return SDValue();
21384
21385 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
21386 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
21387 assert(LHS.getValueType().is64BitVector() &&
21388 RHS.getValueType().is64BitVector() &&
21389 "unexpected shape for long operation");
21390
21391 // Either node could be a DUP, but it's not worth doing both of them (you'd
21392 // just as well use the non-high version) so look for a corresponding extract
21393 // operation on the other "wing".
21394 if (isEssentiallyExtractHighSubvector(LHS)) {
21395 RHS = tryExtendDUPToExtractHigh(RHS, DAG);
21396 if (!RHS.getNode())
21397 return SDValue();
21398 } else if (isEssentiallyExtractHighSubvector(RHS)) {
21399 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
21400 if (!LHS.getNode())
21401 return SDValue();
21402 } else
21403 return SDValue();
21404
21405 if (IID == Intrinsic::not_intrinsic)
21406 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
21407
21408 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
21409 N->getOperand(0), LHS, RHS);
21410}
21411
21412static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
21413 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
21414 unsigned ElemBits = ElemTy.getSizeInBits();
21415
21416 int64_t ShiftAmount;
21417 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
21418 APInt SplatValue, SplatUndef;
21419 unsigned SplatBitSize;
21420 bool HasAnyUndefs;
21421 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
21422 HasAnyUndefs, ElemBits) ||
21423 SplatBitSize != ElemBits)
21424 return SDValue();
21425
21426 ShiftAmount = SplatValue.getSExtValue();
21427 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
21428 ShiftAmount = CVN->getSExtValue();
21429 } else
21430 return SDValue();
21431
21432 // If the shift amount is zero, remove the shift intrinsic.
21433 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
21434 return N->getOperand(1);
21435
21436 unsigned Opcode;
21437 bool IsRightShift;
21438 switch (IID) {
21439 default:
21440 llvm_unreachable("Unknown shift intrinsic");
21441 case Intrinsic::aarch64_neon_sqshl:
21442 Opcode = AArch64ISD::SQSHL_I;
21443 IsRightShift = false;
21444 break;
21445 case Intrinsic::aarch64_neon_uqshl:
21446 Opcode = AArch64ISD::UQSHL_I;
21447 IsRightShift = false;
21448 break;
21449 case Intrinsic::aarch64_neon_srshl:
21450 Opcode = AArch64ISD::SRSHR_I;
21451 IsRightShift = true;
21452 break;
21453 case Intrinsic::aarch64_neon_urshl:
21454 Opcode = AArch64ISD::URSHR_I;
21455 IsRightShift = true;
21456 break;
21457 case Intrinsic::aarch64_neon_sqshlu:
21458 Opcode = AArch64ISD::SQSHLU_I;
21459 IsRightShift = false;
21460 break;
21461 case Intrinsic::aarch64_neon_sshl:
21462 case Intrinsic::aarch64_neon_ushl:
21463 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
21464 // left shift in that case. For negative shift amounts we can use VASHR or
21465 // VLSHR, as appropriate.
21466 if (ShiftAmount < 0) {
21467 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
21468 : AArch64ISD::VLSHR;
21469 ShiftAmount = -ShiftAmount;
21470 } else
21471 Opcode = AArch64ISD::VSHL;
21472 IsRightShift = false;
21473 break;
21474 }
21475
21476 EVT VT = N->getValueType(0);
21477 SDValue Op = N->getOperand(1);
21478 SDLoc dl(N);
21479 if (VT == MVT::i64) {
21480 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
21481 VT = MVT::v1i64;
21482 }
21483
21484 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
21485 Op = DAG.getNode(Opcode, dl, VT, Op,
21486 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
21487 if (N->getValueType(0) == MVT::i64)
21488 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
21489 DAG.getConstant(0, dl, MVT::i64));
21490 return Op;
21491 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
21492 Op = DAG.getNode(Opcode, dl, VT, Op,
21493 DAG.getConstant(ShiftAmount, dl, MVT::i32));
21494 if (N->getValueType(0) == MVT::i64)
21495 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
21496 DAG.getConstant(0, dl, MVT::i64));
21497 return Op;
21498 }
21499
21500 return SDValue();
21501}
21502
21503// The CRC32[BH] instructions ignore the high bits of their data operand. Since
21504// the intrinsics must be legal and take an i32, this means there's almost
21505// certainly going to be a zext in the DAG which we can eliminate.
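// For example (illustrative only): for the byte-sized variants the combine turns
//   (int_aarch64_crc32b %crc, (and %data, 255))
// into (int_aarch64_crc32b %crc, %data), because CRC32B only reads the low
// 8 bits of %data; the halfword variants do the same with a 0xffff mask.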
21506static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
21507 SDValue AndN = N->getOperand(2);
21508 if (AndN.getOpcode() != ISD::AND)
21509 return SDValue();
21510
21511 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
21512 if (!CMask || CMask->getZExtValue() != Mask)
21513 return SDValue();
21514
21515 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
21516 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
21517}
21518
21519static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
21520 SelectionDAG &DAG) {
21521 SDLoc dl(N);
21522 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
21523 DAG.getNode(Opc, dl,
21524 N->getOperand(1).getSimpleValueType(),
21525 N->getOperand(1)),
21526 DAG.getConstant(0, dl, MVT::i64));
21527}
21528
21529static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
21530 SDLoc DL(N);
21531 SDValue Op1 = N->getOperand(1);
21532 SDValue Op2 = N->getOperand(2);
21533 EVT ScalarTy = Op2.getValueType();
21534 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
21535 ScalarTy = MVT::i32;
21536
21537 // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
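// For example (illustrative): index_vector(2, 3) on nxv4i32 produces the
// sequence <2, 5, 8, 11, ...>, i.e. splat(2) + mul(step_vector(1), splat(3)).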
21538 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
21539 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
21540 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
21541 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
21542 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
21543}
21544
21545static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
21546 SDLoc dl(N);
21547 SDValue Scalar = N->getOperand(3);
21548 EVT ScalarTy = Scalar.getValueType();
21549
21550 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
21551 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
21552
21553 SDValue Passthru = N->getOperand(1);
21554 SDValue Pred = N->getOperand(2);
21555 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
21556 Pred, Scalar, Passthru);
21557}
21558
21559static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
21560 SDLoc dl(N);
21561 LLVMContext &Ctx = *DAG.getContext();
21562 EVT VT = N->getValueType(0);
21563
21564 assert(VT.isScalableVector() && "Expected a scalable vector.");
21565
21566 // Current lowering only supports the SVE-ACLE types.
21567 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
21568 return SDValue();
21569
21570 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
21571 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
21572 EVT ByteVT =
21573 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
21574
21575 // Convert everything to the domain of EXT (i.e. bytes).
21576 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
21577 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
21578 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
21579 DAG.getConstant(ElemSize, dl, MVT::i32));
21580
21581 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
21582 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
21583}
21584
21585static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
21586 TargetLowering::DAGCombinerInfo &DCI,
21587 SelectionDAG &DAG) {
21588 if (DCI.isBeforeLegalize())
21589 return SDValue();
21590
21591 SDValue Comparator = N->getOperand(3);
21592 if (Comparator.getOpcode() == AArch64ISD::DUP ||
21593 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
21594 unsigned IID = getIntrinsicID(N);
21595 EVT VT = N->getValueType(0);
21596 EVT CmpVT = N->getOperand(2).getValueType();
21597 SDValue Pred = N->getOperand(1);
21598 SDValue Imm;
21599 SDLoc DL(N);
21600
21601 switch (IID) {
21602 default:
21603 llvm_unreachable("Called with wrong intrinsic!");
21604 break;
21605
21606 // Signed comparisons
21607 case Intrinsic::aarch64_sve_cmpeq_wide:
21608 case Intrinsic::aarch64_sve_cmpne_wide:
21609 case Intrinsic::aarch64_sve_cmpge_wide:
21610 case Intrinsic::aarch64_sve_cmpgt_wide:
21611 case Intrinsic::aarch64_sve_cmplt_wide:
21612 case Intrinsic::aarch64_sve_cmple_wide: {
21613 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
21614 int64_t ImmVal = CN->getSExtValue();
21615 if (ImmVal >= -16 && ImmVal <= 15)
21616 Imm = DAG.getSignedConstant(ImmVal, DL, MVT::i32);
21617 else
21618 return SDValue();
21619 }
21620 break;
21621 }
21622 // Unsigned comparisons
21623 case Intrinsic::aarch64_sve_cmphs_wide:
21624 case Intrinsic::aarch64_sve_cmphi_wide:
21625 case Intrinsic::aarch64_sve_cmplo_wide:
21626 case Intrinsic::aarch64_sve_cmpls_wide: {
21627 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
21628 uint64_t ImmVal = CN->getZExtValue();
21629 if (ImmVal <= 127)
21630 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
21631 else
21632 return SDValue();
21633 }
21634 break;
21635 }
21636 }
21637
21638 if (!Imm)
21639 return SDValue();
21640
21641 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
21642 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
21643 N->getOperand(2), Splat, DAG.getCondCode(CC));
21644 }
21645
21646 return SDValue();
21647}
21648
21649static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
21650 AArch64CC::CondCode Cond) {
21651 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21652
21653 SDLoc DL(Op);
21654 assert(Op.getValueType().isScalableVector() &&
21655 TLI.isTypeLegal(Op.getValueType()) &&
21656 "Expected legal scalable vector type!");
21657 assert(Op.getValueType() == Pg.getValueType() &&
21658 "Expected same type for PTEST operands");
21659
21660 // Ensure target specific opcodes are using legal type.
21661 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
21662 SDValue TVal = DAG.getConstant(1, DL, OutVT);
21663 SDValue FVal = DAG.getConstant(0, DL, OutVT);
21664
21665 // Ensure operands have type nxv16i1.
21666 if (Op.getValueType() != MVT::nxv16i1) {
21669 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
21670 else
21671 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
21672 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
21673 }
21674
21675 // Set condition code (CC) flags.
21676 SDValue Test = DAG.getNode(
21677 Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
21678 DL, MVT::i32, Pg, Op);
21679
21680 // Convert CC to integer based on requested condition.
21681 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
21682 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
21683 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
21684 return DAG.getZExtOrTrunc(Res, DL, VT);
21685}
21686
21687static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
21688 SelectionDAG &DAG) {
21689 SDLoc DL(N);
21690
21691 SDValue Pred = N->getOperand(1);
21692 SDValue VecToReduce = N->getOperand(2);
21693
21694 // NOTE: The integer reduction's result type is not always linked to the
21695 // operand's element type so we construct it from the intrinsic's result type.
21696 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
21697 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
21698
21699 // SVE reductions set the whole vector register with the first element
21700 // containing the reduction result, which we'll now extract.
21701 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
21702 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
21703 Zero);
21704}
21705
21706static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
21707 SelectionDAG &DAG) {
21708 SDLoc DL(N);
21709
21710 SDValue Pred = N->getOperand(1);
21711 SDValue VecToReduce = N->getOperand(2);
21712
21713 EVT ReduceVT = VecToReduce.getValueType();
21714 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
21715
21716 // SVE reductions set the whole vector register with the first element
21717 // containing the reduction result, which we'll now extract.
21718 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
21719 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
21720 Zero);
21721}
21722
21723static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
21724 SelectionDAG &DAG) {
21725 SDLoc DL(N);
21726
21727 SDValue Pred = N->getOperand(1);
21728 SDValue InitVal = N->getOperand(2);
21729 SDValue VecToReduce = N->getOperand(3);
21730 EVT ReduceVT = VecToReduce.getValueType();
21731
21732 // Ordered reductions use the first lane of the result vector as the
21733 // reduction's initial value.
21734 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
21735 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
21736 DAG.getUNDEF(ReduceVT), InitVal, Zero);
21737
21738 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
21739
21740 // SVE reductions set the whole vector register with the first element
21741 // containing the reduction result, which we'll now extract.
21742 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
21743 Zero);
21744}
21745
21746// If a merged operation has no inactive lanes we can relax it to a predicated
21747// or unpredicated operation, which potentially allows better isel (perhaps
21748// using immediate forms) or relaxing register reuse requirements.
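// For example (illustrative, matching the aarch64_sve_sqadd case below): when
// the governing predicate is known to be all active,
//   (int_aarch64_sve_sqadd pg, x, y)  -->  (ISD::SADDSAT x, y)
// since there are no inactive lanes whose merged value would need preserving.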
21749static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
21750 SelectionDAG &DAG, bool UnpredOp = false,
21751 bool SwapOperands = false) {
21752 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
21753 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
21754 SDValue Pg = N->getOperand(1);
21755 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
21756 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
21757
21758 // ISD way to specify an all active predicate.
21759 if (isAllActivePredicate(DAG, Pg)) {
21760 if (UnpredOp)
21761 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
21762
21763 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
21764 }
21765
21766 // FUTURE: SplatVector(true)
21767 return SDValue();
21768}
21769
21770static SDValue tryCombineWhileLo(SDNode *N,
21771 TargetLowering::DAGCombinerInfo &DCI,
21772 const AArch64Subtarget *Subtarget) {
21773 if (DCI.isBeforeLegalize())
21774 return SDValue();
21775
21776 if (!Subtarget->hasSVE2p1())
21777 return SDValue();
21778
21779 if (!N->hasNUsesOfValue(2, 0))
21780 return SDValue();
21781
21782 const uint64_t HalfSize = N->getValueType(0).getVectorMinNumElements() / 2;
21783 if (HalfSize < 2)
21784 return SDValue();
21785
21786 auto It = N->user_begin();
21787 SDNode *Lo = *It++;
21788 SDNode *Hi = *It;
21789
21790 if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
21791 Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR)
21792 return SDValue();
21793
21794 uint64_t OffLo = Lo->getConstantOperandVal(1);
21795 uint64_t OffHi = Hi->getConstantOperandVal(1);
21796
21797 if (OffLo > OffHi) {
21798 std::swap(Lo, Hi);
21799 std::swap(OffLo, OffHi);
21800 }
21801
21802 if (OffLo != 0 || OffHi != HalfSize)
21803 return SDValue();
21804
21805 EVT HalfVec = Lo->getValueType(0);
21806 if (HalfVec != Hi->getValueType(0) ||
21807 HalfVec.getVectorElementCount() != ElementCount::getScalable(HalfSize))
21808 return SDValue();
21809
21810 SelectionDAG &DAG = DCI.DAG;
21811 SDLoc DL(N);
21812 SDValue ID =
21813 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
21814 SDValue Idx = N->getOperand(1);
21815 SDValue TC = N->getOperand(2);
21816 if (Idx.getValueType() != MVT::i64) {
21817 Idx = DAG.getZExtOrTrunc(Idx, DL, MVT::i64);
21818 TC = DAG.getZExtOrTrunc(TC, DL, MVT::i64);
21819 }
21820 auto R =
21821 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
21822 {Lo->getValueType(0), Hi->getValueType(0)}, {ID, Idx, TC});
21823
21824 DCI.CombineTo(Lo, R.getValue(0));
21825 DCI.CombineTo(Hi, R.getValue(1));
21826
21827 return SDValue(N, 0);
21828}
21829
21830static SDValue tryLowerPartialReductionToDot(SDNode *N,
21831 const AArch64Subtarget *Subtarget,
21832 SelectionDAG &DAG) {
21833
21834 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
21835 getIntrinsicID(N) ==
21836 Intrinsic::experimental_vector_partial_reduce_add &&
21837 "Expected a partial reduction node");
21838
21839 bool Scalable = N->getValueType(0).isScalableVector();
21840 if (Scalable && !Subtarget->isSVEorStreamingSVEAvailable())
21841 return SDValue();
21842 if (!Scalable && (!Subtarget->isNeonAvailable() || !Subtarget->hasDotProd()))
21843 return SDValue();
21844
21845 SDLoc DL(N);
21846
21847 SDValue Op2 = N->getOperand(2);
21848 unsigned Op2Opcode = Op2->getOpcode();
21849 SDValue MulOpLHS, MulOpRHS;
21850 bool MulOpLHSIsSigned, MulOpRHSIsSigned;
21851 if (ISD::isExtOpcode(Op2Opcode)) {
21852 MulOpLHSIsSigned = MulOpRHSIsSigned = (Op2Opcode == ISD::SIGN_EXTEND);
21853 MulOpLHS = Op2->getOperand(0);
21854 MulOpRHS = DAG.getConstant(1, DL, MulOpLHS.getValueType());
21855 } else if (Op2Opcode == ISD::MUL) {
21856 SDValue ExtMulOpLHS = Op2->getOperand(0);
21857 SDValue ExtMulOpRHS = Op2->getOperand(1);
21858
21859 unsigned ExtMulOpLHSOpcode = ExtMulOpLHS->getOpcode();
21860 unsigned ExtMulOpRHSOpcode = ExtMulOpRHS->getOpcode();
21861 if (!ISD::isExtOpcode(ExtMulOpLHSOpcode) ||
21862 !ISD::isExtOpcode(ExtMulOpRHSOpcode))
21863 return SDValue();
21864
21865 MulOpLHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND;
21866 MulOpRHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND;
21867
21868 MulOpLHS = ExtMulOpLHS->getOperand(0);
21869 MulOpRHS = ExtMulOpRHS->getOperand(0);
21870
21871 if (MulOpLHS.getValueType() != MulOpRHS.getValueType())
21872 return SDValue();
21873 } else
21874 return SDValue();
21875
21876 SDValue Acc = N->getOperand(1);
21877 EVT ReducedVT = N->getValueType(0);
21878 EVT MulSrcVT = MulOpLHS.getValueType();
21879
21880 // Dot products operate on chunks of four elements so there must be four times
21881 // as many elements in the wide type
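// A minimal sketch of the intended mapping (illustrative only): with i8 inputs
// and an i32 accumulator,
//   partial.reduce.add(nxv4i32 %acc, mul(zext(nxv16i8 %a), zext(nxv16i8 %b)))
// becomes a single UDOT %acc, %a, %b; SDOT is used for sign-extended inputs and
// USDOT (guarded below by hasMatMulInt8) when the extensions are mixed.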
21882 if (!(ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) &&
21883 !(ReducedVT == MVT::nxv4i32 && MulSrcVT == MVT::nxv16i8) &&
21884 !(ReducedVT == MVT::nxv2i64 && MulSrcVT == MVT::nxv8i16) &&
21885 !(ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8) &&
21886 !(ReducedVT == MVT::v4i32 && MulSrcVT == MVT::v16i8) &&
21887 !(ReducedVT == MVT::v2i32 && MulSrcVT == MVT::v8i8))
21888 return SDValue();
21889
21890 // If the extensions are mixed, we should lower it to a usdot instead
21891 unsigned Opcode = 0;
21892 if (MulOpLHSIsSigned != MulOpRHSIsSigned) {
21893 if (!Subtarget->hasMatMulInt8())
21894 return SDValue();
21895
21896 bool Scalable = N->getValueType(0).isScalableVT();
21897 // There's no nxv2i64 version of usdot
21898 if (Scalable && ReducedVT != MVT::nxv4i32 && ReducedVT != MVT::nxv4i64)
21899 return SDValue();
21900
21901 Opcode = AArch64ISD::USDOT;
21902 // USDOT expects the signed operand to be last
21903 if (!MulOpRHSIsSigned)
21904 std::swap(MulOpLHS, MulOpRHS);
21905 } else
21906 Opcode = MulOpLHSIsSigned ? AArch64ISD::SDOT : AArch64ISD::UDOT;
21907
21908 // Partial reduction lowering for (nx)v16i8 to (nx)v4i64 requires an i32 dot
21909 // product followed by a zero / sign extension
21910 if ((ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) ||
21911 (ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8)) {
21912 EVT ReducedVTI32 =
21913 (ReducedVT.isScalableVector()) ? MVT::nxv4i32 : MVT::v4i32;
21914
21915 SDValue DotI32 =
21916 DAG.getNode(Opcode, DL, ReducedVTI32,
21917 DAG.getConstant(0, DL, ReducedVTI32), MulOpLHS, MulOpRHS);
21918 SDValue Extended = DAG.getSExtOrTrunc(DotI32, DL, ReducedVT);
21919 return DAG.getNode(ISD::ADD, DL, ReducedVT, Acc, Extended);
21920 }
21921
21922 return DAG.getNode(Opcode, DL, ReducedVT, Acc, MulOpLHS, MulOpRHS);
21923}
21924
21925static SDValue tryLowerPartialReductionToWideAdd(SDNode *N,
21926 const AArch64Subtarget *Subtarget,
21927 SelectionDAG &DAG) {
21928
21929 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
21930 getIntrinsicID(N) ==
21931 Intrinsic::experimental_vector_partial_reduce_add &&
21932 "Expected a partial reduction node");
21933
21934 if (!Subtarget->hasSVE2() && !Subtarget->isStreamingSVEAvailable())
21935 return SDValue();
21936
21937 SDLoc DL(N);
21938
21939 if (!ISD::isExtOpcode(N->getOperand(2).getOpcode()))
21940 return SDValue();
21941 SDValue Acc = N->getOperand(1);
21942 SDValue Ext = N->getOperand(2);
21943 EVT AccVT = Acc.getValueType();
21944 EVT ExtVT = Ext.getValueType();
21945 if (ExtVT.getVectorElementType() != AccVT.getVectorElementType())
21946 return SDValue();
21947
21948 SDValue ExtOp = Ext->getOperand(0);
21949 EVT ExtOpVT = ExtOp.getValueType();
21950
21951 if (!(ExtOpVT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) &&
21952 !(ExtOpVT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) &&
21953 !(ExtOpVT == MVT::nxv16i8 && AccVT == MVT::nxv8i16))
21954 return SDValue();
21955
21956 bool ExtOpIsSigned = Ext.getOpcode() == ISD::SIGN_EXTEND;
21957 unsigned BottomOpcode =
21958 ExtOpIsSigned ? AArch64ISD::SADDWB : AArch64ISD::UADDWB;
21959 unsigned TopOpcode = ExtOpIsSigned ? AArch64ISD::SADDWT : AArch64ISD::UADDWT;
21960 SDValue BottomNode = DAG.getNode(BottomOpcode, DL, AccVT, Acc, ExtOp);
21961 return DAG.getNode(TopOpcode, DL, AccVT, BottomNode, ExtOp);
21962}
21963
21964static SDValue performIntrinsicCombine(SDNode *N,
21965 TargetLowering::DAGCombinerInfo &DCI,
21966 const AArch64Subtarget *Subtarget) {
21967 SelectionDAG &DAG = DCI.DAG;
21968 unsigned IID = getIntrinsicID(N);
21969 switch (IID) {
21970 default:
21971 break;
21972 case Intrinsic::experimental_vector_partial_reduce_add: {
21973 if (SDValue Dot = tryLowerPartialReductionToDot(N, Subtarget, DAG))
21974 return Dot;
21975 if (SDValue WideAdd = tryLowerPartialReductionToWideAdd(N, Subtarget, DAG))
21976 return WideAdd;
21977 return DAG.getPartialReduceAdd(SDLoc(N), N->getValueType(0),
21978 N->getOperand(1), N->getOperand(2));
21979 }
21980 case Intrinsic::aarch64_neon_vcvtfxs2fp:
21981 case Intrinsic::aarch64_neon_vcvtfxu2fp:
21982 return tryCombineFixedPointConvert(N, DCI, DAG);
21983 case Intrinsic::aarch64_neon_saddv:
21984 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
21985 case Intrinsic::aarch64_neon_uaddv:
21986 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
21987 case Intrinsic::aarch64_neon_sminv:
21988 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
21989 case Intrinsic::aarch64_neon_uminv:
21990 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
21991 case Intrinsic::aarch64_neon_smaxv:
21992 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
21993 case Intrinsic::aarch64_neon_umaxv:
21994 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
21995 case Intrinsic::aarch64_neon_fmax:
21996 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
21997 N->getOperand(1), N->getOperand(2));
21998 case Intrinsic::aarch64_neon_fmin:
21999 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
22000 N->getOperand(1), N->getOperand(2));
22001 case Intrinsic::aarch64_neon_fmaxnm:
22002 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
22003 N->getOperand(1), N->getOperand(2));
22004 case Intrinsic::aarch64_neon_fminnm:
22005 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
22006 N->getOperand(1), N->getOperand(2));
22007 case Intrinsic::aarch64_neon_smull:
22008 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
22009 N->getOperand(1), N->getOperand(2));
22010 case Intrinsic::aarch64_neon_umull:
22011 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
22012 N->getOperand(1), N->getOperand(2));
22013 case Intrinsic::aarch64_neon_pmull:
22014 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
22015 N->getOperand(1), N->getOperand(2));
22016 case Intrinsic::aarch64_neon_sqdmull:
22017 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
22018 case Intrinsic::aarch64_neon_sqshl:
22019 case Intrinsic::aarch64_neon_uqshl:
22020 case Intrinsic::aarch64_neon_sqshlu:
22021 case Intrinsic::aarch64_neon_srshl:
22022 case Intrinsic::aarch64_neon_urshl:
22023 case Intrinsic::aarch64_neon_sshl:
22024 case Intrinsic::aarch64_neon_ushl:
22025 return tryCombineShiftImm(IID, N, DAG);
22026 case Intrinsic::aarch64_neon_sabd:
22027 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
22028 N->getOperand(1), N->getOperand(2));
22029 case Intrinsic::aarch64_neon_uabd:
22030 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
22031 N->getOperand(1), N->getOperand(2));
22032 case Intrinsic::aarch64_crc32b:
22033 case Intrinsic::aarch64_crc32cb:
22034 return tryCombineCRC32(0xff, N, DAG);
22035 case Intrinsic::aarch64_crc32h:
22036 case Intrinsic::aarch64_crc32ch:
22037 return tryCombineCRC32(0xffff, N, DAG);
22038 case Intrinsic::aarch64_sve_saddv:
22039 // There is no i64 version of SADDV because the sign is irrelevant.
22040 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
22041 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
22042 else
22043 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
22044 case Intrinsic::aarch64_sve_uaddv:
22045 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
22046 case Intrinsic::aarch64_sve_smaxv:
22047 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
22048 case Intrinsic::aarch64_sve_umaxv:
22049 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
22050 case Intrinsic::aarch64_sve_sminv:
22051 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
22052 case Intrinsic::aarch64_sve_uminv:
22053 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
22054 case Intrinsic::aarch64_sve_orv:
22055 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
22056 case Intrinsic::aarch64_sve_eorv:
22057 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
22058 case Intrinsic::aarch64_sve_andv:
22059 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
22060 case Intrinsic::aarch64_sve_index:
22061 return LowerSVEIntrinsicIndex(N, DAG);
22062 case Intrinsic::aarch64_sve_dup:
22063 return LowerSVEIntrinsicDUP(N, DAG);
22064 case Intrinsic::aarch64_sve_dup_x:
22065 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
22066 N->getOperand(1));
22067 case Intrinsic::aarch64_sve_ext:
22068 return LowerSVEIntrinsicEXT(N, DAG);
22069 case Intrinsic::aarch64_sve_mul_u:
22070 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
22071 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22072 case Intrinsic::aarch64_sve_smulh_u:
22073 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
22074 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22075 case Intrinsic::aarch64_sve_umulh_u:
22076 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
22077 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22078 case Intrinsic::aarch64_sve_smin_u:
22079 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
22080 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22081 case Intrinsic::aarch64_sve_umin_u:
22082 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
22083 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22084 case Intrinsic::aarch64_sve_smax_u:
22085 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
22086 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22087 case Intrinsic::aarch64_sve_umax_u:
22088 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
22089 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22090 case Intrinsic::aarch64_sve_lsl_u:
22091 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
22092 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22093 case Intrinsic::aarch64_sve_lsr_u:
22094 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
22095 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22096 case Intrinsic::aarch64_sve_asr_u:
22097 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
22098 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22099 case Intrinsic::aarch64_sve_fadd_u:
22100 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
22101 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22102 case Intrinsic::aarch64_sve_fdiv_u:
22103 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
22104 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22105 case Intrinsic::aarch64_sve_fmax_u:
22106 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
22107 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22108 case Intrinsic::aarch64_sve_fmaxnm_u:
22109 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
22110 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22111 case Intrinsic::aarch64_sve_fmla_u:
22112 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
22113 N->getOperand(1), N->getOperand(3), N->getOperand(4),
22114 N->getOperand(2));
22115 case Intrinsic::aarch64_sve_fmin_u:
22116 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
22117 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22118 case Intrinsic::aarch64_sve_fminnm_u:
22119 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
22120 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22121 case Intrinsic::aarch64_sve_fmul_u:
22122 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
22123 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22124 case Intrinsic::aarch64_sve_fsub_u:
22125 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
22126 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22127 case Intrinsic::aarch64_sve_add_u:
22128 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
22129 N->getOperand(3));
22130 case Intrinsic::aarch64_sve_sub_u:
22131 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
22132 N->getOperand(3));
22133 case Intrinsic::aarch64_sve_subr:
22134 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
22135 case Intrinsic::aarch64_sve_and_u:
22136 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
22137 N->getOperand(3));
22138 case Intrinsic::aarch64_sve_bic_u:
22139 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
22140 N->getOperand(2), N->getOperand(3));
22141 case Intrinsic::aarch64_sve_saddwb:
22142 return DAG.getNode(AArch64ISD::SADDWB, SDLoc(N), N->getValueType(0),
22143 N->getOperand(1), N->getOperand(2));
22144 case Intrinsic::aarch64_sve_saddwt:
22145 return DAG.getNode(AArch64ISD::SADDWT, SDLoc(N), N->getValueType(0),
22146 N->getOperand(1), N->getOperand(2));
22147 case Intrinsic::aarch64_sve_uaddwb:
22148 return DAG.getNode(AArch64ISD::UADDWB, SDLoc(N), N->getValueType(0),
22149 N->getOperand(1), N->getOperand(2));
22150 case Intrinsic::aarch64_sve_uaddwt:
22151 return DAG.getNode(AArch64ISD::UADDWT, SDLoc(N), N->getValueType(0),
22152 N->getOperand(1), N->getOperand(2));
22153 case Intrinsic::aarch64_sve_eor_u:
22154 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
22155 N->getOperand(3));
22156 case Intrinsic::aarch64_sve_orr_u:
22157 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
22158 N->getOperand(3));
22159 case Intrinsic::aarch64_sve_sabd_u:
22160 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
22161 N->getOperand(2), N->getOperand(3));
22162 case Intrinsic::aarch64_sve_uabd_u:
22163 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
22164 N->getOperand(2), N->getOperand(3));
22165 case Intrinsic::aarch64_sve_sdiv_u:
22166 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
22167 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22168 case Intrinsic::aarch64_sve_udiv_u:
22169 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
22170 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22171 case Intrinsic::aarch64_sve_sqadd:
22172 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
22173 case Intrinsic::aarch64_sve_sqsub_u:
22174 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
22175 N->getOperand(2), N->getOperand(3));
22176 case Intrinsic::aarch64_sve_uqadd:
22177 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
22178 case Intrinsic::aarch64_sve_uqsub_u:
22179 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
22180 N->getOperand(2), N->getOperand(3));
22181 case Intrinsic::aarch64_sve_sqadd_x:
22182 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
22183 N->getOperand(1), N->getOperand(2));
22184 case Intrinsic::aarch64_sve_sqsub_x:
22185 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
22186 N->getOperand(1), N->getOperand(2));
22187 case Intrinsic::aarch64_sve_uqadd_x:
22188 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
22189 N->getOperand(1), N->getOperand(2));
22190 case Intrinsic::aarch64_sve_uqsub_x:
22191 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
22192 N->getOperand(1), N->getOperand(2));
22193 case Intrinsic::aarch64_sve_asrd:
22194 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
22195 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22196 case Intrinsic::aarch64_sve_cmphs:
22197 if (!N->getOperand(2).getValueType().isFloatingPoint())
22198 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22199 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22200 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
22201 break;
22202 case Intrinsic::aarch64_sve_cmphi:
22203 if (!N->getOperand(2).getValueType().isFloatingPoint())
22204 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22205 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22206 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
22207 break;
22208 case Intrinsic::aarch64_sve_fcmpge:
22209 case Intrinsic::aarch64_sve_cmpge:
22210 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22211 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22212 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
22213 break;
22214 case Intrinsic::aarch64_sve_fcmpgt:
22215 case Intrinsic::aarch64_sve_cmpgt:
22216 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22217 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22218 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
22219 break;
22220 case Intrinsic::aarch64_sve_fcmpeq:
22221 case Intrinsic::aarch64_sve_cmpeq:
22222 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22223 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22224 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
22225 break;
22226 case Intrinsic::aarch64_sve_fcmpne:
22227 case Intrinsic::aarch64_sve_cmpne:
22228 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22229 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22230 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
22231 break;
22232 case Intrinsic::aarch64_sve_fcmpuo:
22233 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22234 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22235 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
22236 break;
22237 case Intrinsic::aarch64_sve_fadda:
22238 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
22239 case Intrinsic::aarch64_sve_faddv:
22240 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
22241 case Intrinsic::aarch64_sve_fmaxnmv:
22242 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
22243 case Intrinsic::aarch64_sve_fmaxv:
22244 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
22245 case Intrinsic::aarch64_sve_fminnmv:
22246 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
22247 case Intrinsic::aarch64_sve_fminv:
22248 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
22249 case Intrinsic::aarch64_sve_sel:
22250 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
22251 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22252 case Intrinsic::aarch64_sve_cmpeq_wide:
22253 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
22254 case Intrinsic::aarch64_sve_cmpne_wide:
22255 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
22256 case Intrinsic::aarch64_sve_cmpge_wide:
22257 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
22258 case Intrinsic::aarch64_sve_cmpgt_wide:
22259 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
22260 case Intrinsic::aarch64_sve_cmplt_wide:
22261 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
22262 case Intrinsic::aarch64_sve_cmple_wide:
22263 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
22264 case Intrinsic::aarch64_sve_cmphs_wide:
22265 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
22266 case Intrinsic::aarch64_sve_cmphi_wide:
22267 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
22268 case Intrinsic::aarch64_sve_cmplo_wide:
22269 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
22270 case Intrinsic::aarch64_sve_cmpls_wide:
22271 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
22272 case Intrinsic::aarch64_sve_ptest_any:
22273 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22274 AArch64CC::ANY_ACTIVE);
22275 case Intrinsic::aarch64_sve_ptest_first:
22276 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22277 AArch64CC::FIRST_ACTIVE);
22278 case Intrinsic::aarch64_sve_ptest_last:
22279 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22280 AArch64CC::LAST_ACTIVE);
22281 case Intrinsic::aarch64_sve_whilelo:
22282 return tryCombineWhileLo(N, DCI, Subtarget);
22283 }
22284 return SDValue();
22285}
22286
22287static bool isCheapToExtend(const SDValue &N) {
22288 unsigned OC = N->getOpcode();
22289 return OC == ISD::LOAD || OC == ISD::MLOAD ||
22291}
22292
22293static SDValue
22294performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
22295 SelectionDAG &DAG) {
22296 // If we have (sext (setcc A B)) and A and B are cheap to extend,
22297 // we can move the sext into the arguments and have the same result. For
22298 // example, if A and B are both loads, we can make those extending loads and
22299 // avoid an extra instruction. This pattern appears often in VLS code
22300 // generation where the inputs to the setcc have a different size to the
22301 // instruction that wants to use the result of the setcc.
22302 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
22303 N->getOperand(0)->getOpcode() == ISD::SETCC);
22304 const SDValue SetCC = N->getOperand(0);
22305
22306 const SDValue CCOp0 = SetCC.getOperand(0);
22307 const SDValue CCOp1 = SetCC.getOperand(1);
22308 if (!CCOp0->getValueType(0).isInteger() ||
22309 !CCOp1->getValueType(0).isInteger())
22310 return SDValue();
22311
22312 ISD::CondCode Code =
22313 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
22314
22315 ISD::NodeType ExtType =
22316 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22317
22318 if (isCheapToExtend(SetCC.getOperand(0)) &&
22319 isCheapToExtend(SetCC.getOperand(1))) {
22320 const SDValue Ext1 =
22321 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
22322 const SDValue Ext2 =
22323 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
22324
22325 return DAG.getSetCC(
22326 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
22327 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
22328 }
22329
22330 return SDValue();
22331}
22332
22333// Convert zext(extract(shuffle a, b, [0,4,8,12])) -> and(uzp1(a, b), 255)
22334// This comes from interleaved vectorization. It is performed late to capture
22335// uitofp converts too.
22336static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N,
22337 SelectionDAG &DAG) {
22338 EVT VT = N->getValueType(0);
22339 if ((VT != MVT::v4i32 && VT != MVT::v8i16) ||
22340 N->getOpcode() != ISD::ZERO_EXTEND ||
22341 N->getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
22342 return SDValue();
22343
22344 unsigned ExtOffset = N->getOperand(0).getConstantOperandVal(1);
22345 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
22346 return SDValue();
22347
22348 EVT InVT = N->getOperand(0).getOperand(0).getValueType();
22349 auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0).getOperand(0));
22350 if (!Shuffle ||
22351 InVT.getVectorNumElements() != VT.getVectorNumElements() * 2 ||
22352 InVT.getScalarSizeInBits() * 2 != VT.getScalarSizeInBits())
22353 return SDValue();
22354
22355 unsigned Idx;
22356 bool IsDeInterleave = ShuffleVectorInst::isDeInterleaveMaskOfFactor(
22357 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements()), 4, Idx);
22358 // An undef interleave shuffle can come up after other canonicalizations,
22359 // where the shuffle has been converted to
22360 // zext(extract(shuffle b, undef, [u,u,0,4]))
22361 bool IsUndefDeInterleave = false;
22362 if (!IsDeInterleave)
22363 IsUndefDeInterleave =
22364 Shuffle->getOperand(1).isUndef() &&
22365 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
22366 Shuffle->getMask().slice(ExtOffset + VT.getVectorNumElements() / 2,
22367 VT.getVectorNumElements() / 2),
22368 4, Idx);
22369 if ((!IsDeInterleave && !IsUndefDeInterleave) || Idx >= 4)
22370 return SDValue();
22371 SDLoc DL(N);
22372 SDValue BC1 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
22373 Shuffle->getOperand(IsUndefDeInterleave ? 1 : 0));
22374 SDValue BC2 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
22375 Shuffle->getOperand(IsUndefDeInterleave ? 0 : 1));
22376 SDValue UZP = DAG.getNode(Idx < 2 ? AArch64ISD::UZP1 : AArch64ISD::UZP2, DL,
22377 VT, BC1, BC2);
22378 if ((Idx & 1) == 1)
22379 UZP = DAG.getNode(ISD::SRL, DL, VT, UZP,
22380 DAG.getConstant(InVT.getScalarSizeInBits(), DL, VT));
22381 return DAG.getNode(
22382 ISD::AND, DL, VT, UZP,
22383 DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
22384}
22385
22386// This comes up, similar to the above, when lowering deinterleaving shuffles
22387// from zexts. In the general case the operations have been legalized to
22388// zext(extract_subvector(uzp(a, b))), which can be converted to and(a, mask) if
22389// the extract is to the low half and the uzp is uzp1. There would be an extra
22390// shift if the uzp was uzp2 to grab the upper half. Due to the combine above
22391// there could also be an existing and / shift that can be combined in, either
22392// before or after the extract.
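// Illustrative example (v8i16 -> v4i32, assuming the low-half extract): the
// legalized pattern
//   zext(extract_subvector(uzp1(a, b), 0))
// takes the even 16-bit elements of 'a', so it can be rewritten as
//   and(nvcast(a), 0x0000ffff)
// while uzp2 (the odd elements) additionally needs a right shift by 16 first.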
22393static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
22394 EVT VT = N->getValueType(0);
22395 if (N->getOpcode() != ISD::ZERO_EXTEND ||
22396 (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16))
22397 return SDValue();
22398
22399 SDValue Op = N->getOperand(0);
22400 unsigned ExtOffset = (unsigned)-1;
22401 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22402 ExtOffset = Op.getConstantOperandVal(1);
22403 Op = Op.getOperand(0);
22404 }
22405
22406 unsigned Shift = 0;
22407 APInt Mask = APInt::getLowBitsSet(VT.getScalarSizeInBits(),
22408 Op.getValueType().getScalarSizeInBits());
22409
22410 if (Op.getOpcode() == AArch64ISD::VLSHR) {
22411 Shift = Op.getConstantOperandVal(1);
22412 Op = Op.getOperand(0);
22413 Mask = Mask.lshr(Shift);
22414 }
22415 if (Op.getOpcode() == ISD::AND &&
22416 ISD::isConstantSplatVector(Op.getOperand(1).getNode(), Mask)) {
22417 Op = Op.getOperand(0);
22418 Mask = Mask.zext(VT.getScalarSizeInBits());
22419 } else if (Op.getOpcode() == AArch64ISD::BICi) {
22420 Mask = ~APInt(Op.getValueType().getScalarSizeInBits(),
22421 Op.getConstantOperandVal(1) << Op.getConstantOperandVal(2));
22422 Mask = Mask.zext(VT.getScalarSizeInBits());
22423 Op = Op.getOperand(0);
22424 }
22425
22426 if (ExtOffset == (unsigned)-1) {
22427 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22428 ExtOffset = Op.getConstantOperandVal(1);
22429 Op = Op.getOperand(0);
22430 } else
22431 return SDValue();
22432 }
22433 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
22434 return SDValue();
22435
22436 if (Op.getOpcode() != AArch64ISD::UZP1 && Op.getOpcode() != AArch64ISD::UZP2)
22437 return SDValue();
22438 if (Op.getOpcode() == AArch64ISD::UZP2)
22439 Shift += VT.getScalarSizeInBits() / 2;
22440
22441 SDLoc DL(N);
22442 SDValue BC = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
22443 Op.getOperand(ExtOffset == 0 ? 0 : 1));
22444 if (Shift != 0)
22445 BC = DAG.getNode(AArch64ISD::VLSHR, DL, VT, BC,
22446 DAG.getConstant(Shift, DL, MVT::i32));
22447 return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
22448}
22449
22450static SDValue performExtendCombine(SDNode *N,
22451 TargetLowering::DAGCombinerInfo &DCI,
22452 SelectionDAG &DAG) {
22453 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
22454 // we can convert that DUP into another extract_high (of a bigger DUP), which
22455 // helps the backend to decide that an sabdl2 would be useful, saving a real
22456 // extract_high operation.
22457 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
22458 N->getOperand(0).getValueType().is64BitVector() &&
22459 (N->getOperand(0).getOpcode() == ISD::ABDU ||
22460 N->getOperand(0).getOpcode() == ISD::ABDS)) {
22461 SDNode *ABDNode = N->getOperand(0).getNode();
22462 SDValue NewABD =
22463 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
22464 if (!NewABD.getNode())
22465 return SDValue();
22466
22467 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
22468 }
22469 }
22470 if (SDValue R = performZExtDeinterleaveShuffleCombine(N, DAG))
22471 return R;
22472 if (SDValue R = performZExtUZPCombine(N, DAG))
22473 return R;
22474
22475 if (N->getValueType(0).isFixedLengthVector() &&
22476 N->getOpcode() == ISD::SIGN_EXTEND &&
22477 N->getOperand(0)->getOpcode() == ISD::SETCC)
22478 return performSignExtendSetCCCombine(N, DCI, DAG);
22479
22480 // If we see (any_extend (bswap ...)) with bswap returning an i16, we know
22481 // that the top half of the result register must be unused, due to the
22482 // any_extend. This means that we can replace this pattern with (rev16
22483 // (any_extend ...)). This saves a machine instruction compared to (lsr (rev
22484 // ...)), which is what this pattern would otherwise be lowered to.
22485 // Only apply this optimisation if the any_extend in the original pattern
22486 // extends to i32 or i64, because this type will become the input type to
22487 // REV16 in the new pattern, so it must be a legitimate REV16 input type.
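// For instance (illustrative): (i32 (any_extend (i16 (bswap %x)))) becomes
// (i32 (AArch64ISD::REV16 (any_extend %x))), avoiding the lsr that the
// (lsr (rev %x), #16) expansion of a 16-bit bswap would otherwise need.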
22488 SDValue Bswap = N->getOperand(0);
22489 if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
22490 Bswap.getValueType() == MVT::i16 &&
22491 (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) {
22492 SDLoc DL(N);
22493 SDValue NewAnyExtend = DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0),
22494 Bswap->getOperand(0));
22495 return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0),
22496 NewAnyExtend);
22497 }
22498
22499 return SDValue();
22500}
22501
22502static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
22503 SDValue SplatVal, unsigned NumVecElts) {
22504 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
22505 Align OrigAlignment = St.getAlign();
22506 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
22507
22508 // Create scalar stores. This is at least as good as the code sequence for a
22509 // split unaligned store which is a dup.s, ext.b, and two stores.
22510 // Most of the time the three stores should be replaced by store pair
22511 // instructions (stp).
22512 SDLoc DL(&St);
22513 SDValue BasePtr = St.getBasePtr();
22514 uint64_t BaseOffset = 0;
22515
22516 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
22517 SDValue NewST1 =
22518 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
22519 OrigAlignment, St.getMemOperand()->getFlags());
22520
22521 // As this is in ISel, we will not merge this add, which may degrade results.
22522 if (BasePtr->getOpcode() == ISD::ADD &&
22523 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
22524 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
22525 BasePtr = BasePtr->getOperand(0);
22526 }
22527
22528 unsigned Offset = EltOffset;
22529 while (--NumVecElts) {
22530 Align Alignment = commonAlignment(OrigAlignment, Offset);
22531 SDValue OffsetPtr =
22532 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
22533 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
22534 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
22535 PtrInfo.getWithOffset(Offset), Alignment,
22536 St.getMemOperand()->getFlags());
22537 Offset += EltOffset;
22538 }
22539 return NewST1;
22540}
22541
22542// Returns an SVE type that ContentTy can be trivially sign or zero extended
22543// into.
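// For example (illustrative): nxv2i16 and nxv2f32 both map to nxv2i64, since
// each of their elements occupies a 64-bit container lane of the Z register.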
22544static MVT getSVEContainerType(EVT ContentTy) {
22545 assert(ContentTy.isSimple() && "No SVE containers for extended types");
22546
22547 switch (ContentTy.getSimpleVT().SimpleTy) {
22548 default:
22549 llvm_unreachable("No known SVE container for this MVT type");
22550 case MVT::nxv2i8:
22551 case MVT::nxv2i16:
22552 case MVT::nxv2i32:
22553 case MVT::nxv2i64:
22554 case MVT::nxv2f32:
22555 case MVT::nxv2f64:
22556 return MVT::nxv2i64;
22557 case MVT::nxv4i8:
22558 case MVT::nxv4i16:
22559 case MVT::nxv4i32:
22560 case MVT::nxv4f32:
22561 return MVT::nxv4i32;
22562 case MVT::nxv8i8:
22563 case MVT::nxv8i16:
22564 case MVT::nxv8f16:
22565 case MVT::nxv8bf16:
22566 return MVT::nxv8i16;
22567 case MVT::nxv16i8:
22568 return MVT::nxv16i8;
22569 }
22570}
22571
22572static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
22573 SDLoc DL(N);
22574 EVT VT = N->getValueType(0);
22575
22577 return SDValue();
22578
22579 EVT ContainerVT = VT;
22580 if (ContainerVT.isInteger())
22581 ContainerVT = getSVEContainerType(ContainerVT);
22582
22583 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
22584 SDValue Ops[] = { N->getOperand(0), // Chain
22585 N->getOperand(2), // Pg
22586 N->getOperand(3), // Base
22587 DAG.getValueType(VT) };
22588
22589 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
22590 SDValue LoadChain = SDValue(Load.getNode(), 1);
22591
22592 if (ContainerVT.isInteger() && (VT != ContainerVT))
22593 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
22594
22595 return DAG.getMergeValues({ Load, LoadChain }, DL);
22596}
22597
22598static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
22599 SDLoc DL(N);
22600 EVT VT = N->getValueType(0);
22601 EVT PtrTy = N->getOperand(3).getValueType();
22602
22603 EVT LoadVT = VT;
22604 if (VT.isFloatingPoint())
22605 LoadVT = VT.changeTypeToInteger();
22606
22607 auto *MINode = cast<MemIntrinsicSDNode>(N);
22608 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
22609 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
22610 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
22611 MINode->getOperand(2), PassThru,
22612 MINode->getMemoryVT(), MINode->getMemOperand(),
22613 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
22614
22615 if (VT.isFloatingPoint()) {
22616 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
22617 return DAG.getMergeValues(Ops, DL);
22618 }
22619
22620 return L;
22621}
22622
22623template <unsigned Opcode>
22625 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
22626 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
22627 "Unsupported opcode.");
22628 SDLoc DL(N);
22629 EVT VT = N->getValueType(0);
22630
22631 EVT LoadVT = VT;
22632 if (VT.isFloatingPoint())
22633 LoadVT = VT.changeTypeToInteger();
22634
22635 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
22636 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
22637 SDValue LoadChain = SDValue(Load.getNode(), 1);
22638
22639 if (VT.isFloatingPoint())
22640 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
22641
22642 return DAG.getMergeValues({Load, LoadChain}, DL);
22643}
22644
22645static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
22646 SDLoc DL(N);
22647 SDValue Data = N->getOperand(2);
22648 EVT DataVT = Data.getValueType();
22649 EVT HwSrcVt = getSVEContainerType(DataVT);
22650 SDValue InputVT = DAG.getValueType(DataVT);
22651
22652 if (DataVT.isFloatingPoint())
22653 InputVT = DAG.getValueType(HwSrcVt);
22654
22655 SDValue SrcNew;
22656 if (Data.getValueType().isFloatingPoint())
22657 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
22658 else
22659 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
22660
22661 SDValue Ops[] = { N->getOperand(0), // Chain
22662 SrcNew,
22663 N->getOperand(4), // Base
22664 N->getOperand(3), // Pg
22665 InputVT
22666 };
22667
22668 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
22669}
22670
22671static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
22672 SDLoc DL(N);
22673
22674 SDValue Data = N->getOperand(2);
22675 EVT DataVT = Data.getValueType();
22676 EVT PtrTy = N->getOperand(4).getValueType();
22677
22678 if (DataVT.isFloatingPoint())
22679 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
22680
22681 auto *MINode = cast<MemIntrinsicSDNode>(N);
22682 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
22683 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
22684 MINode->getMemoryVT(), MINode->getMemOperand(),
22685 ISD::UNINDEXED, false, false);
22686}
22687
22688/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
22689/// load store optimizer pass will merge them to store pair stores. This should
22690/// be better than a movi to create the vector zero followed by a vector store
22691/// if the zero constant is not re-used, since one instruction and one register
22692/// live range will be removed.
22693///
22694/// For example, the final generated code should be:
22695///
22696/// stp xzr, xzr, [x0]
22697///
22698/// instead of:
22699///
22700/// movi v0.2d, #0
22701/// str q0, [x0]
22702///
22703static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
22704 SDValue StVal = St.getValue();
22705 EVT VT = StVal.getValueType();
22706
22707 // Avoid scalarizing zero splat stores for scalable vectors.
22708 if (VT.isScalableVector())
22709 return SDValue();
22710
22711 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
22712 // 2, 3 or 4 i32 elements.
22713 int NumVecElts = VT.getVectorNumElements();
22714 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
22715 VT.getVectorElementType().getSizeInBits() == 64) ||
22716 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
22717 VT.getVectorElementType().getSizeInBits() == 32)))
22718 return SDValue();
22719
22720 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
22721 return SDValue();
22722
22723 // If the zero constant has more than one use then the vector store could be
22724 // better since the constant mov will be amortized and stp q instructions
22725 // should be able to be formed.
22726 if (!StVal.hasOneUse())
22727 return SDValue();
22728
22729 // If the store is truncating then it's going down to i16 or smaller, which
22730 // means it can be implemented in a single store anyway.
22731 if (St.isTruncatingStore())
22732 return SDValue();
22733
22734 // If the immediate offset of the address operand is too large for the stp
22735 // instruction, then bail out.
22736 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
22737 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
22738 if (Offset < -512 || Offset > 504)
22739 return SDValue();
22740 }
22741
22742 for (int I = 0; I < NumVecElts; ++I) {
22743 SDValue EltVal = StVal.getOperand(I);
22744 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
22745 return SDValue();
22746 }
22747
22748 // Use a CopyFromReg WZR/XZR here to prevent
22749 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
22750 SDLoc DL(&St);
22751 unsigned ZeroReg;
22752 EVT ZeroVT;
22753 if (VT.getVectorElementType().getSizeInBits() == 32) {
22754 ZeroReg = AArch64::WZR;
22755 ZeroVT = MVT::i32;
22756 } else {
22757 ZeroReg = AArch64::XZR;
22758 ZeroVT = MVT::i64;
22759 }
22760 SDValue SplatVal =
22761 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
22762 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
22763}
22764
22765/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
22766/// value. The load store optimizer pass will merge them to store pair stores.
22767/// This has better performance than a splat of the scalar followed by a split
22768/// vector store. Even if the stores are not merged it is four stores vs a dup,
22769/// followed by an ext.b and two stores.
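/// An illustrative target sequence (a sketch, not taken from a specific test):
/// a store of a v4i32 splat of w0 to [x1] can end up as
///   stp w0, w0, [x1]
///   stp w0, w0, [x1, #8]
/// once the load/store optimizer pairs the four scalar stores.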
22770static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
22771 SDValue StVal = St.getValue();
22772 EVT VT = StVal.getValueType();
22773
22774 // Don't replace floating point stores, they possibly won't be transformed to
22775 // stp because of the store pair suppress pass.
22776 if (VT.isFloatingPoint())
22777 return SDValue();
22778
22779 // We can express a splat as store pair(s) for 2 or 4 elements.
22780 unsigned NumVecElts = VT.getVectorNumElements();
22781 if (NumVecElts != 4 && NumVecElts != 2)
22782 return SDValue();
22783
22784 // If the store is truncating then it's going down to i16 or smaller, which
22785 // means it can be implemented in a single store anyway.
22786 if (St.isTruncatingStore())
22787 return SDValue();
22788
22789 // Check that this is a splat.
22790 // Make sure that each of the relevant vector element locations are inserted
22791 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
22792 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
22793 SDValue SplatVal;
22794 for (unsigned I = 0; I < NumVecElts; ++I) {
22795 // Check for insert vector elements.
22796 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
22797 return SDValue();
22798
22799 // Check that same value is inserted at each vector element.
22800 if (I == 0)
22801 SplatVal = StVal.getOperand(1);
22802 else if (StVal.getOperand(1) != SplatVal)
22803 return SDValue();
22804
22805 // Check insert element index.
22806 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
22807 if (!CIndex)
22808 return SDValue();
22809 uint64_t IndexVal = CIndex->getZExtValue();
22810 if (IndexVal >= NumVecElts)
22811 return SDValue();
22812 IndexNotInserted.reset(IndexVal);
22813
22814 StVal = StVal.getOperand(0);
22815 }
22816 // Check that all vector element locations were inserted to.
22817 if (IndexNotInserted.any())
22818 return SDValue();
22819
22820 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
22821}
22822
22823static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
22824 SelectionDAG &DAG,
22825 const AArch64Subtarget *Subtarget) {
22826
22827 StoreSDNode *S = cast<StoreSDNode>(N);
22828 if (S->isVolatile() || S->isIndexed())
22829 return SDValue();
22830
22831 SDValue StVal = S->getValue();
22832 EVT VT = StVal.getValueType();
22833
22834 if (!VT.isFixedLengthVector())
22835 return SDValue();
22836
22837 // If we get a splat of zeros, convert this vector store to a store of
22838 // scalars. They will be merged into store pairs of xzr thereby removing one
22839 // instruction and one register.
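 // For example (illustrative), "str q0, [x0]" of a zeroed v2i64 can become a
 // single "stp xzr, xzr, [x0]", with no need to materialize a zero vector.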
22840 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
22841 return ReplacedZeroSplat;
22842
22843 // FIXME: The logic for deciding if an unaligned store should be split should
22844 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
22845 // a call to that function here.
22846
22847 if (!Subtarget->isMisaligned128StoreSlow())
22848 return SDValue();
22849
22850 // Don't split at -Oz.
22851 if (DAG.getMachineFunction().getFunction().hasMinSize())
22852 return SDValue();
22853
22854 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
22855 // those up regresses performance on micro-benchmarks and olden/bh.
22856 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
22857 return SDValue();
22858
22859 // Split unaligned 16B stores. They are terrible for performance.
22860 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
22861 // extensions can use this to mark that it does not want splitting to happen
22862 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
22863 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
22864 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
22865 S->getAlign() <= Align(2))
22866 return SDValue();
22867
22868 // If we get a splat of a scalar convert this vector store to a store of
22869 // scalars. They will be merged into store pairs thereby removing two
22870 // instructions.
22871 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
22872 return ReplacedSplat;
22873
22874 SDLoc DL(S);
22875
22876 // Split VT into two.
22877 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
22878 unsigned NumElts = HalfVT.getVectorNumElements();
22879 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
22880 DAG.getConstant(0, DL, MVT::i64));
22881 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
22882 DAG.getConstant(NumElts, DL, MVT::i64));
22883 SDValue BasePtr = S->getBasePtr();
22884 SDValue NewST1 =
22885 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
22886 S->getAlign(), S->getMemOperand()->getFlags());
22887 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
22888 DAG.getConstant(8, DL, MVT::i64));
22889 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
22890 S->getPointerInfo(), S->getAlign(),
22891 S->getMemOperand()->getFlags());
22892}
22893
22894static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
22895 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
22896
22897 // splice(pg, op1, undef) -> op1
22898 if (N->getOperand(2).isUndef())
22899 return N->getOperand(1);
22900
22901 return SDValue();
22902}
22903
22904static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
22905 const AArch64Subtarget *Subtarget) {
22906 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
22907 N->getOpcode() == AArch64ISD::UUNPKLO) &&
22908 "Unexpected Opcode!");
22909
22910 // uunpklo/hi undef -> undef
22911 if (N->getOperand(0).isUndef())
22912 return DAG.getUNDEF(N->getValueType(0));
22913
22914 // If this is a masked load followed by an UUNPKLO, fold this into a masked
22915 // extending load. We can do this even if this is already a masked
22916 // {z,}extload.
22917 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
22918 N->getOpcode() == AArch64ISD::UUNPKLO) {
22919 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
22920 SDValue Mask = MLD->getMask();
22921 SDLoc DL(N);
22922
22923 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
22924 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22925 (MLD->getPassThru()->isUndef() ||
22926 isZerosVector(MLD->getPassThru().getNode()))) {
22927 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22928 unsigned PgPattern = Mask->getConstantOperandVal(0);
22929 EVT VT = N->getValueType(0);
22930
22931 // Ensure we can double the size of the predicate pattern
22932 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
22933 if (NumElts &&
22934 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
22935 Mask =
22936 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
22937 SDValue PassThru = DAG.getConstant(0, DL, VT);
22938 SDValue NewLoad = DAG.getMaskedLoad(
22939 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
22940 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
22941 MLD->getAddressingMode(), ISD::ZEXTLOAD);
22942
22943 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
22944
22945 return NewLoad;
22946 }
22947 }
22948 }
22949
22950 return SDValue();
22951}
22952
22953static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N) {
22954 if (N->getOpcode() != AArch64ISD::UZP1)
22955 return false;
22956 SDValue Op0 = N->getOperand(0);
22957 EVT SrcVT = Op0->getValueType(0);
22958 EVT DstVT = N->getValueType(0);
22959 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
22960 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
22961 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
22962}
22963
22964// Try to combine rounding shifts where the operands come from an extend, and
22965// the result is truncated and combined into one vector.
22966// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
22967static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
22968 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
22969 SDValue Op0 = N->getOperand(0);
22970 SDValue Op1 = N->getOperand(1);
22971 EVT ResVT = N->getValueType(0);
22972
22973 unsigned RshOpc = Op0.getOpcode();
22974 if (RshOpc != AArch64ISD::RSHRNB_I)
22975 return SDValue();
22976
22977 // Same op code and imm value?
22978 SDValue ShiftValue = Op0.getOperand(1);
22979 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
22980 return SDValue();
22981
22982 // Same unextended operand value?
22983 SDValue Lo = Op0.getOperand(0);
22984 SDValue Hi = Op1.getOperand(0);
22985 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
22986 Hi.getOpcode() != AArch64ISD::UUNPKHI)
22987 return SDValue();
22988 SDValue OrigArg = Lo.getOperand(0);
22989 if (OrigArg != Hi.getOperand(0))
22990 return SDValue();
22991
22992 SDLoc DL(N);
22993 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
22994 getPredicateForVector(DAG, DL, ResVT), OrigArg,
22995 ShiftValue);
22996}
22997
22998// Try to simplify:
22999// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
23000// t2 = nxv8i16 srl(t1, ShiftValue)
23001// to
23002// t1 = nxv8i16 rshrnb(X, shiftvalue).
23003// rshrnb will zero the top half bits of each element. Therefore, this combine
23004// should only be performed when a following instruction with the rshrnb
23005// as an operand does not care about the top half of each element. For example,
23006// a uzp1 or a truncating store.
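// For example (illustrative), with X : nxv8i16 and ShiftValue == 4:
//   t1 = add X, splat(8)   ; 8 == 1 << (4 - 1)
//   t2 = srl t1, splat(4)
// can be selected as a single SVE2 "rshrnb z0.b, z0.h, #4", provided only the
// low half of each 16-bit element is subsequently used.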
23007static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
23008 const AArch64Subtarget *Subtarget) {
23009 EVT VT = Srl->getValueType(0);
23010 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
23011 return SDValue();
23012
23013 EVT ResVT;
23014 if (VT == MVT::nxv8i16)
23015 ResVT = MVT::nxv16i8;
23016 else if (VT == MVT::nxv4i32)
23017 ResVT = MVT::nxv8i16;
23018 else if (VT == MVT::nxv2i64)
23019 ResVT = MVT::nxv4i32;
23020 else
23021 return SDValue();
23022
23023 SDLoc DL(Srl);
23024 unsigned ShiftValue;
23025 SDValue RShOperand;
23026 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
23027 return SDValue();
23028 SDValue Rshrnb = DAG.getNode(
23029 AArch64ISD::RSHRNB_I, DL, ResVT,
23030 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
23031 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Rshrnb);
23032}
23033
23034static SDValue isNVCastToHalfWidthElements(SDValue V) {
23035 if (V.getOpcode() != AArch64ISD::NVCAST)
23036 return SDValue();
23037
23038 SDValue Op = V.getOperand(0);
23039 if (V.getValueType().getVectorElementCount() !=
23040 Op.getValueType().getVectorElementCount() * 2)
23041 return SDValue();
23042
23043 return Op;
23044}
23045
23046static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
23047 const AArch64Subtarget *Subtarget) {
23048 SDLoc DL(N);
23049 SDValue Op0 = N->getOperand(0);
23050 SDValue Op1 = N->getOperand(1);
23051 EVT ResVT = N->getValueType(0);
23052
23053 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
23054 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23055 Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23056 Op0.getOperand(0) == Op1.getOperand(0)) {
23057
23058 SDValue SourceVec = Op0.getOperand(0);
23059 uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
23060 uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
23061 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
23062 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
23063 EVT OpVT = Op0.getOperand(1).getValueType();
23064 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
23065 SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
23066 DAG.getUNDEF(WidenedResVT));
23067 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
23068 DAG.getConstant(0, DL, OpVT));
23069 }
23070 }
23071
23072 // Following optimizations only work with uzp1.
23073 if (N->getOpcode() == AArch64ISD::UZP2)
23074 return SDValue();
23075
23076 // uzp1(x, undef) -> concat(truncate(x), undef)
23077 if (Op1.getOpcode() == ISD::UNDEF) {
23078 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
23079 switch (ResVT.getSimpleVT().SimpleTy) {
23080 default:
23081 break;
23082 case MVT::v16i8:
23083 BCVT = MVT::v8i16;
23084 HalfVT = MVT::v8i8;
23085 break;
23086 case MVT::v8i16:
23087 BCVT = MVT::v4i32;
23088 HalfVT = MVT::v4i16;
23089 break;
23090 case MVT::v4i32:
23091 BCVT = MVT::v2i64;
23092 HalfVT = MVT::v2i32;
23093 break;
23094 }
23095 if (BCVT != MVT::Other) {
23096 SDValue BC = DAG.getBitcast(BCVT, Op0);
23097 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
23098 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
23099 DAG.getUNDEF(HalfVT));
23100 }
23101 }
23102
23103 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
23104 return Urshr;
23105
23106 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
23107 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
23108 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
23109 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
23110 }
23111 }
23112
23113 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
23114 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
23115 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
23116 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
23117 }
23118 }
23119
23120 // uzp1<ty>(nvcast(unpklo(uzp1<ty>(x, y))), z) => uzp1<ty>(x, z)
23121 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
23122 if (PreCast.getOpcode() == AArch64ISD::UUNPKLO) {
23123 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
23124 SDValue X = PreCast.getOperand(0).getOperand(0);
23125 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
23126 }
23127 }
23128 }
23129
23130 // uzp1<ty>(x, nvcast(unpkhi(uzp1<ty>(y, z)))) => uzp1<ty>(x, z)
23131 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
23132 if (PreCast.getOpcode() == AArch64ISD::UUNPKHI) {
23133 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
23134 SDValue Z = PreCast.getOperand(0).getOperand(1);
23135 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
23136 }
23137 }
23138 }
23139
23140 // These optimizations only work on little endian.
23141 if (!DAG.getDataLayout().isLittleEndian())
23142 return SDValue();
23143
23144 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
23145 // Example:
23146 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
23147 // to
23148 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
23149 if (isHalvingTruncateAndConcatOfLegalIntScalableType(N) &&
23150 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
23151 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
23152 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
23153 Op1.getOperand(0));
23154 }
23155 }
23156
23157 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
23158 return SDValue();
23159
23160 SDValue SourceOp0 = peekThroughBitcasts(Op0);
23161 SDValue SourceOp1 = peekThroughBitcasts(Op1);
23162
23163 // truncating uzp1(x, y) -> xtn(concat (x, y))
23164 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
23165 EVT Op0Ty = SourceOp0.getValueType();
23166 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
23167 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
23168 SDValue Concat =
23169 DAG.getNode(ISD::CONCAT_VECTORS, DL,
23170 Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
23171 SourceOp0, SourceOp1);
23172 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
23173 }
23174 }
23175
23176 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
23177 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
23178 SourceOp1.getOpcode() != ISD::TRUNCATE)
23179 return SDValue();
23180 SourceOp0 = SourceOp0.getOperand(0);
23181 SourceOp1 = SourceOp1.getOperand(0);
23182
23183 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
23184 !SourceOp0.getValueType().isSimple())
23185 return SDValue();
23186
23187 EVT ResultTy;
23188
23189 switch (SourceOp0.getSimpleValueType().SimpleTy) {
23190 case MVT::v2i64:
23191 ResultTy = MVT::v4i32;
23192 break;
23193 case MVT::v4i32:
23194 ResultTy = MVT::v8i16;
23195 break;
23196 case MVT::v8i16:
23197 ResultTy = MVT::v16i8;
23198 break;
23199 default:
23200 return SDValue();
23201 }
23202
23203 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
23204 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
23205 SDValue UzpResult =
23206 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
23207
23208 EVT BitcastResultTy;
23209
23210 switch (ResVT.getSimpleVT().SimpleTy) {
23211 case MVT::v2i32:
23212 BitcastResultTy = MVT::v2i64;
23213 break;
23214 case MVT::v4i16:
23215 BitcastResultTy = MVT::v4i32;
23216 break;
23217 case MVT::v8i8:
23218 BitcastResultTy = MVT::v8i16;
23219 break;
23220 default:
23221 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
23222 }
23223
23224 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
23225 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
23226}
23227
23228static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
23229 unsigned Opc = N->getOpcode();
23230
23231 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
23232 Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
23233 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
23234 Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
23235 "Invalid opcode.");
23236
23237 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
23238 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23239 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
23240 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23241 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
23242 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
23243 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
23244 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
23245
23246 SDLoc DL(N);
23247 SDValue Chain = N->getOperand(0);
23248 SDValue Pg = N->getOperand(1);
23249 SDValue Base = N->getOperand(2);
23250 SDValue Offset = N->getOperand(3);
23251 SDValue Ty = N->getOperand(4);
23252
23253 EVT ResVT = N->getValueType(0);
23254
23255 const auto OffsetOpc = Offset.getOpcode();
23256 const bool OffsetIsZExt =
23257 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
23258 const bool OffsetIsSExt =
23259 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
23260
23261 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
23262 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
23263 SDValue ExtPg = Offset.getOperand(0);
23264 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
23265 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
23266
23267 // If the predicate for the sign- or zero-extended offset is the
23268 // same as the predicate used for this load and the sign-/zero-extension
23269 // was from a 32-bit type...
23270 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
23271 SDValue UnextendedOffset = Offset.getOperand(1);
23272
23273 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
23274 if (Signed)
23275 NewOpc = getSignExtendedGatherOpcode(NewOpc);
23276
23277 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
23278 {Chain, Pg, Base, UnextendedOffset, Ty});
23279 }
23280 }
23281
23282 return SDValue();
23283}
23284
23285/// Optimize a vector shift instruction and its operand if shifted out
23286/// bits are not used.
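/// For example (illustrative), for v8i16 "vashr(vshl(x, 8), 8)" is a per-lane
/// sign extension of the low byte; if x already has more than 8 sign bits per
/// lane the shift pair is redundant and x can be used directly. Otherwise only
/// the bits that are not shifted out are demanded from the shift's operand.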
23287static SDValue performVectorShiftCombine(SDNode *N,
23288 const AArch64TargetLowering &TLI,
23289 TargetLowering::DAGCombinerInfo &DCI) {
23290 assert(N->getOpcode() == AArch64ISD::VASHR ||
23291 N->getOpcode() == AArch64ISD::VLSHR);
23292
23293 SDValue Op = N->getOperand(0);
23294 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
23295
23296 unsigned ShiftImm = N->getConstantOperandVal(1);
23297 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
23298
23299 // Remove sign_extend_inreg (ashr(shl(x))) based on the number of sign bits.
23300 if (N->getOpcode() == AArch64ISD::VASHR &&
23301 Op.getOpcode() == AArch64ISD::VSHL &&
23302 N->getOperand(1) == Op.getOperand(1))
23303 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
23304 return Op.getOperand(0);
23305
23306 // If the shift is exact, the shifted out bits matter.
23307 if (N->getFlags().hasExact())
23308 return SDValue();
23309
23310 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
23311 APInt DemandedMask = ~ShiftedOutBits;
23312
23313 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
23314 return SDValue(N, 0);
23315
23316 return SDValue();
23317}
23318
23319static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
23320 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
23321 // This transform works in partnership with performSetCCPunpkCombine to
23322 // remove unnecessary transfer of predicates into standard registers and back
23323 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
23324 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
23325 MVT::i1) {
23326 SDValue CC = N->getOperand(0)->getOperand(0);
23327 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
23328 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
23329 DAG.getVectorIdxConstant(0, SDLoc(N)));
23330 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
23331 }
23332
23333 return SDValue();
23334}
23335
23336/// Target-specific DAG combine function for post-increment LD1 (lane) and
23337/// post-increment LD1R.
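/// For example (illustrative, registers arbitrary):
///   ld1r { v0.4s }, [x0]
///   add  x0, x0, #4
/// is merged into the post-indexed form:
///   ld1r { v0.4s }, [x0], #4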
23338static SDValue performPostLD1Combine(SDNode *N,
23339 TargetLowering::DAGCombinerInfo &DCI,
23340 bool IsLaneOp) {
23341 if (DCI.isBeforeLegalizeOps())
23342 return SDValue();
23343
23344 SelectionDAG &DAG = DCI.DAG;
23345 EVT VT = N->getValueType(0);
23346
23347 if (!VT.is128BitVector() && !VT.is64BitVector())
23348 return SDValue();
23349
23350 // If it is not LOAD, can not do such combine.
23351 unsigned LoadIdx = IsLaneOp ? 1 : 0;
23352 LoadSDNode *LD = dyn_cast<LoadSDNode>(N->getOperand(LoadIdx).getNode());
23353 if (!LD)
23354 return SDValue();
23355
23356 // If the Generic combiner already helped form a pre- or post-indexed load,
23357 // skip forming one here.
23358 if (LD->isIndexed())
23359 return SDValue();
23360
23361 // The vector lane must be a constant in the LD1LANE opcode.
23362 SDValue Lane;
23363 if (IsLaneOp) {
23364 Lane = N->getOperand(2);
23365 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
23366 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
23367 return SDValue();
23368 }
23369
23370 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
23371 EVT MemVT = LoadSDN->getMemoryVT();
23372 // Check if memory operand is the same type as the vector element.
23373 if (MemVT != VT.getVectorElementType())
23374 return SDValue();
23375
23376 // Check if there are other uses. If so, do not combine as it will introduce
23377 // an extra load.
23378 for (SDUse &U : LD->uses()) {
23379 if (U.getResNo() == 1) // Ignore uses of the chain result.
23380 continue;
23381 if (U.getUser() != N)
23382 return SDValue();
23383 }
23384
23385 // If there is one use and it can splat the value, prefer that operation.
23386 // TODO: This could be expanded to more operations if they reliably use the
23387 // index variants.
23388 if (N->hasOneUse()) {
23389 unsigned UseOpc = N->user_begin()->getOpcode();
23390 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
23391 return SDValue();
23392 }
23393
23394 SDValue Addr = LD->getOperand(1);
23395 SDValue Vector = N->getOperand(0);
23396 // Search for a use of the address operand that is an increment.
23397 for (SDUse &Use : Addr->uses()) {
23398 SDNode *User = Use.getUser();
23399 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
23400 continue;
23401
23402 // If the increment is a constant, it must match the memory ref size.
23403 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
23404 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
23405 uint32_t IncVal = CInc->getZExtValue();
23406 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
23407 if (IncVal != NumBytes)
23408 continue;
23409 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
23410 }
23411
23412 // To avoid cycle construction make sure that neither the load nor the add
23413 // are predecessors to each other or the Vector.
23414 SmallPtrSet<const SDNode *, 32> Visited;
23415 SmallVector<const SDNode *, 16> Worklist;
23416 Visited.insert(Addr.getNode());
23417 Worklist.push_back(User);
23418 Worklist.push_back(LD);
23419 Worklist.push_back(Vector.getNode());
23420 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
23421 SDNode::hasPredecessorHelper(User, Visited, Worklist))
23422 continue;
23423
23424 SmallVector<SDValue, 8> Ops;
23425 Ops.push_back(LD->getOperand(0)); // Chain
23426 if (IsLaneOp) {
23427 Ops.push_back(Vector); // The vector to be inserted
23428 Ops.push_back(Lane); // The lane to be inserted in the vector
23429 }
23430 Ops.push_back(Addr);
23431 Ops.push_back(Inc);
23432
23433 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
23434 SDVTList SDTys = DAG.getVTList(Tys);
23435 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
23436 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
23437 MemVT,
23438 LoadSDN->getMemOperand());
23439
23440 // Update the uses.
23441 SDValue NewResults[] = {
23442 SDValue(LD, 0), // The result of load
23443 SDValue(UpdN.getNode(), 2) // Chain
23444 };
23445 DCI.CombineTo(LD, NewResults);
23446 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
23447 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
23448
23449 break;
23450 }
23451 return SDValue();
23452}
23453
23454/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
23455/// address translation.
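/// For example (illustrative), an "and Addr, #0x00ffffffffffffff" that merely
/// clears a tag in the top byte can be removed, because only bits [55:0] of
/// the address are demanded here.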
23456static bool performTBISimplification(SDValue Addr,
23457 TargetLowering::DAGCombinerInfo &DCI,
23458 SelectionDAG &DAG) {
23459 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
23460 KnownBits Known;
23461 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
23462 !DCI.isBeforeLegalizeOps());
23463 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23464 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
23465 DCI.CommitTargetLoweringOpt(TLO);
23466 return true;
23467 }
23468 return false;
23469}
23470
23471static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
23472 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
23473 "Expected STORE dag node in input!");
23474
23475 if (auto Store = dyn_cast<StoreSDNode>(N)) {
23476 if (!Store->isTruncatingStore() || Store->isIndexed())
23477 return SDValue();
23478 SDValue Ext = Store->getValue();
23479 auto ExtOpCode = Ext.getOpcode();
23480 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
23481 ExtOpCode != ISD::ANY_EXTEND)
23482 return SDValue();
23483 SDValue Orig = Ext->getOperand(0);
23484 if (Store->getMemoryVT() != Orig.getValueType())
23485 return SDValue();
23486 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
23487 Store->getBasePtr(), Store->getMemOperand());
23488 }
23489
23490 return SDValue();
23491}
23492
23493// A custom combine to lower load <3 x i8> as the more efficient sequence
23494// below:
23495// ldrb wX, [x0, #2]
23496// ldrh wY, [x0]
23497// orr wX, wY, wX, lsl #16
23498// fmov s0, wX
23499//
23500// Note that an alternative sequence with even fewer (although usually more
23501// complex/expensive) instructions would be:
23502// ld1r.4h { v0 }, [x0], #2
23503// ld1.b { v0 }[2], [x0]
23504//
23505// Generating this sequence unfortunately results in noticeably worse codegen
23506// for code that extends the loaded v3i8, due to legalization breaking vector
23507// shuffle detection in a way that is very difficult to work around.
23508// TODO: Revisit once v3i8 legalization has been improved in general.
23509static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
23510 EVT MemVT = LD->getMemoryVT();
23511 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
23512 LD->getOriginalAlign() >= 4)
23513 return SDValue();
23514
23515 SDLoc DL(LD);
23516 MachineFunction &MF = DAG.getMachineFunction();
23517 SDValue Chain = LD->getChain();
23518 SDValue BasePtr = LD->getBasePtr();
23519 MachineMemOperand *MMO = LD->getMemOperand();
23520 assert(LD->getOffset().isUndef() && "undef offset expected");
23521
23522 // Load 2 x i8, then 1 x i8.
23523 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
23524 TypeSize Offset2 = TypeSize::getFixed(2);
23525 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
23526 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
23527 MF.getMachineMemOperand(MMO, 2, 1));
23528
23529 // Extend to i32.
23530 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
23531 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
23532
23533 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
23534 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
23535 DAG.getConstant(16, DL, MVT::i32));
23536 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
23537 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
23538
23539 // Extract v3i8 again.
23540 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
23541 DAG.getConstant(0, DL, MVT::i64));
23542 SDValue TokenFactor = DAG.getNode(
23543 ISD::TokenFactor, DL, MVT::Other,
23544 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
23545 return DAG.getMergeValues({Extract, TokenFactor}, DL);
23546}
23547
23548// Perform TBI simplification if supported by the target and try to break up
23549// nontemporal loads larger than 256 bits for odd types so that LDNP Q-form
23550// 256-bit load instructions can be selected.
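// For example (illustrative), a 384-bit non-temporal load of v24i16 can be
// emitted as one 256-bit LDNP-friendly v16i16 load plus a 128-bit load of the
// remaining v8i16, instead of being split into many smaller loads.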
23551static SDValue performLOADCombine(SDNode *N,
23552 TargetLowering::DAGCombinerInfo &DCI,
23553 SelectionDAG &DAG,
23554 const AArch64Subtarget *Subtarget) {
23555 if (Subtarget->supportsAddressTopByteIgnored())
23556 performTBISimplification(N->getOperand(1), DCI, DAG);
23557
23558 LoadSDNode *LD = cast<LoadSDNode>(N);
23559 if (LD->isVolatile() || !Subtarget->isLittleEndian())
23560 return SDValue(N, 0);
23561
23562 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
23563 return Res;
23564
23565 if (!LD->isNonTemporal())
23566 return SDValue(N, 0);
23567
23568 EVT MemVT = LD->getMemoryVT();
23569 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
23570 MemVT.getSizeInBits() % 256 == 0 ||
23571 256 % MemVT.getScalarSizeInBits() != 0)
23572 return SDValue(N, 0);
23573
23574 SDLoc DL(LD);
23575 SDValue Chain = LD->getChain();
23576 SDValue BasePtr = LD->getBasePtr();
23577 SDNodeFlags Flags = LD->getFlags();
23578 SmallVector<SDValue, 4> LoadOps;
23579 SmallVector<SDValue, 4> LoadOpsChain;
23580 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
23581 // and a final scalar/vector load of less than 256 bits. This way we can use
23582 // 256-bit loads and reduce the number of load instructions generated.
23583 MVT NewVT =
23584 MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
23585 256 / MemVT.getVectorElementType().getSizeInBits());
23586 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
23587 // Create all 256-bit loads starting from offset 0 and up to (Num256Loads - 1) * 32.
23588 for (unsigned I = 0; I < Num256Loads; I++) {
23589 unsigned PtrOffset = I * 32;
23590 SDValue NewPtr = DAG.getMemBasePlusOffset(
23591 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
23592 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
23593 SDValue NewLoad = DAG.getLoad(
23594 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
23595 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
23596 LoadOps.push_back(NewLoad);
23597 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
23598 }
23599
23600 // Process remaining bits of the load operation.
23601 // This is done by creating an UNDEF vector to match the size of the
23602 // 256-bit loads and inserting the remaining load to it. We extract the
23603 // original load type at the end using EXTRACT_SUBVECTOR instruction.
23604 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
23605 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
23606 MVT RemainingVT = MVT::getVectorVT(
23607 MemVT.getVectorElementType().getSimpleVT(),
23608 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
23609 SDValue NewPtr = DAG.getMemBasePlusOffset(
23610 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
23611 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
23612 SDValue RemainingLoad =
23613 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
23614 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
23615 LD->getMemOperand()->getFlags(), LD->getAAInfo());
23616 SDValue UndefVector = DAG.getUNDEF(NewVT);
23617 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
23618 SDValue ExtendedRemainingLoad =
23619 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
23620 {UndefVector, RemainingLoad, InsertIdx});
23621 LoadOps.push_back(ExtendedRemainingLoad);
23622 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
23623 EVT ConcatVT =
23624 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
23625 LoadOps.size() * NewVT.getVectorNumElements());
23626 SDValue ConcatVectors =
23627 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
23628 // Extract the original vector type size.
23629 SDValue ExtractSubVector =
23630 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
23631 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
23632 SDValue TokenFactor =
23633 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
23634 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
23635}
23636
23637static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
23638 EVT VecVT = Op.getValueType();
23639 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
23640 "Need boolean vector type.");
23641
23642 if (Depth > 3)
23643 return MVT::INVALID_SIMPLE_VALUE_TYPE;
23644
23645 // We can get the base type from a vector compare or truncate.
23646 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
23647 return Op.getOperand(0).getValueType();
23648
23649 // If an operand is a bool vector, continue looking.
23650 EVT BaseVT(MVT::INVALID_SIMPLE_VALUE_TYPE);
23651 for (SDValue Operand : Op->op_values()) {
23652 if (Operand.getValueType() != VecVT)
23653 continue;
23654
23655 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
23656 if (!BaseVT.isSimple())
23657 BaseVT = OperandVT;
23658 else if (OperandVT != BaseVT)
23659 return MVT::INVALID_SIMPLE_VALUE_TYPE;
23660
23661
23662 return BaseVT;
23663}
23664
23665// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
23666// iN, we can use a trick that extracts the i^th bit from the i^th element and
23667// then performs a vector add to get a scalar bitmask. This requires that each
23668// element's bits are either all 1 or all 0.
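// For example (illustrative), a v4i32 compare result with lanes
// {-1, 0, -1, -1} ANDed with the mask {1, 2, 4, 8} gives {1, 0, 4, 8};
// an add-across-vector reduction then yields the scalar bitmask 0b1101.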
23669static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
23670 SDLoc DL(N);
23671 SDValue ComparisonResult(N, 0);
23672 EVT VecVT = ComparisonResult.getValueType();
23673 assert(VecVT.isVector() && "Must be a vector type");
23674
23675 unsigned NumElts = VecVT.getVectorNumElements();
23676 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
23677 return SDValue();
23678
23679 if (VecVT.getVectorElementType() != MVT::i1 &&
23680 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
23681 return SDValue();
23682
23683 // If we can find the original types to work on instead of a vector of i1,
23684 // we can avoid extend/extract conversion instructions.
23685 if (VecVT.getVectorElementType() == MVT::i1) {
23686 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
23687 if (!VecVT.isSimple()) {
23688 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
23689 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
23690 }
23691 }
23692 VecVT = VecVT.changeVectorElementTypeToInteger();
23693
23694 // Large vectors don't map directly to this conversion, so to avoid too many
23695 // edge cases, we don't apply it here. The conversion will likely still be
23696 // applied later via multiple smaller vectors, whose results are concatenated.
23697 if (VecVT.getSizeInBits() > 128)
23698 return SDValue();
23699
23700 // Ensure that all elements' bits are either 0s or 1s.
23701 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
23702
23703 SmallVector<SDValue, 16> MaskConstants;
23704 if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
23705 VecVT == MVT::v16i8) {
23706 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
23707 // per entry. We split it into two halves, apply the mask, zip the halves to
23708 // create 8x 16-bit values, and then perform the vector reduce.
23709 for (unsigned Half = 0; Half < 2; ++Half) {
23710 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
23711 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
23712 }
23713 }
23714 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
23715 SDValue RepresentativeBits =
23716 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
23717
23718 SDValue UpperRepresentativeBits =
23719 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
23720 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
23721 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
23722 RepresentativeBits, UpperRepresentativeBits);
23723 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
23724 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
23725 }
23726
23727 // All other vector sizes.
23728 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
23729 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
23730 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
23731 }
23732
23733 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
23734 SDValue RepresentativeBits =
23735 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
23736 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
23737 NumElts, VecVT.getVectorElementType().getSizeInBits()));
23738 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
23739}
23740
23741static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
23742 StoreSDNode *Store) {
23743 if (!Store->isTruncatingStore())
23744 return SDValue();
23745
23746 SDLoc DL(Store);
23747 SDValue VecOp = Store->getValue();
23748 EVT VT = VecOp.getValueType();
23749 EVT MemVT = Store->getMemoryVT();
23750
23751 if (!MemVT.isVector() || !VT.isVector() ||
23752 MemVT.getVectorElementType() != MVT::i1)
23753 return SDValue();
23754
23755 // If we are storing a vector that we are currently building, let
23756 // `scalarizeVectorStore()` handle this more efficiently.
23757 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
23758 return SDValue();
23759
23760 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
23761 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
23762 if (!VectorBits)
23763 return SDValue();
23764
23765 EVT StoreVT =
23766 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
23767 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
23768 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
23769 Store->getMemOperand());
23770}
23771
23772static bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
23773 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
23774 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
23775 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
23776}
23777
23778// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
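// For example (illustrative), when the truncate's source has i32 elements this
// extracts bytes 8, 4 and 0 of the bitcast v16i8 (the low byte of each lane)
// and stores them at offsets 2, 1 and 0 respectively, so no wider-than-needed
// memory access is generated.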
23779static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
23780 const AArch64Subtarget *Subtarget) {
23781 SDValue Value = ST->getValue();
23782 EVT ValueVT = Value.getValueType();
23783
23784 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
23785 Value.getOpcode() != ISD::TRUNCATE ||
23786 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
23787 return SDValue();
23788
23789 assert(ST->getOffset().isUndef() && "undef offset expected");
23790 SDLoc DL(ST);
23791 auto WideVT = EVT::getVectorVT(
23792 *DAG.getContext(),
23793 Value->getOperand(0).getValueType().getVectorElementType(), 4);
23794 SDValue UndefVector = DAG.getUNDEF(WideVT);
23795 SDValue WideTrunc = DAG.getNode(
23796 ISD::INSERT_SUBVECTOR, DL, WideVT,
23797 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
23798 SDValue Cast = DAG.getNode(
23799 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
23800 WideTrunc);
23801
23802 MachineFunction &MF = DAG.getMachineFunction();
23803 SDValue Chain = ST->getChain();
23804 MachineMemOperand *MMO = ST->getMemOperand();
23805 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
23806 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
23807 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
23808 TypeSize Offset2 = TypeSize::getFixed(2);
23809 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
23810 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
23811
23812 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
23813 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
23814 TypeSize Offset1 = TypeSize::getFixed(1);
23815 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
23816 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
23817
23818 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
23819 DAG.getConstant(0, DL, MVT::i64));
23820 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
23821 MF.getMachineMemOperand(MMO, 0, 1));
23822 return Chain;
23823}
23824
23825static SDValue performSTORECombine(SDNode *N,
23826 TargetLowering::DAGCombinerInfo &DCI,
23827 SelectionDAG &DAG,
23828 const AArch64Subtarget *Subtarget) {
23829 StoreSDNode *ST = cast<StoreSDNode>(N);
23830 SDValue Chain = ST->getChain();
23831 SDValue Value = ST->getValue();
23832 SDValue Ptr = ST->getBasePtr();
23833 EVT ValueVT = Value.getValueType();
23834
23835 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
23836 EVT EltVT = VT.getVectorElementType();
23837 return EltVT == MVT::f32 || EltVT == MVT::f64;
23838 };
23839
23840 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
23841 return Res;
23842
23843 // If this is an FP_ROUND followed by a store, fold this into a truncating
23844 // store. We can do this even if this is already a truncstore.
23845 // We purposefully don't care about legality of the nodes here as we know
23846 // they can be split down into something legal.
23847 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
23848 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
23849 Subtarget->useSVEForFixedLengthVectors() &&
23850 ValueVT.isFixedLengthVector() &&
23851 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
23852 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
23853 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
23854 ST->getMemoryVT(), ST->getMemOperand());
23855
23856 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
23857 return Split;
23858
23859 if (Subtarget->supportsAddressTopByteIgnored() &&
23860 performTBISimplification(N->getOperand(2), DCI, DAG))
23861 return SDValue(N, 0);
23862
23863 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
23864 return Store;
23865
23866 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
23867 return Store;
23868
23869 if (ST->isTruncatingStore()) {
23870 EVT StoreVT = ST->getMemoryVT();
23871 if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
23872 return SDValue();
23873 if (SDValue Rshrnb =
23874 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
23875 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
23876 StoreVT, ST->getMemOperand());
23877 }
23878 }
23879
23880 return SDValue();
23881}
23882
23883static SDValue performMSTORECombine(SDNode *N,
23884 TargetLowering::DAGCombinerInfo &DCI,
23885 SelectionDAG &DAG,
23886 const AArch64Subtarget *Subtarget) {
23887 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
23888 SDValue Value = MST->getValue();
23889 SDValue Mask = MST->getMask();
23890 SDLoc DL(N);
23891
23892 // If this is a UZP1 followed by a masked store, fold this into a masked
23893 // truncating store. We can do this even if this is already a masked
23894 // truncstore.
23895 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
23896 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
23897 Value.getValueType().isInteger()) {
23898 Value = Value.getOperand(0);
23899 if (Value.getOpcode() == ISD::BITCAST) {
23900 EVT HalfVT =
23901 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
23902 EVT InVT = Value.getOperand(0).getValueType();
23903
23904 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
23905 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
23906 unsigned PgPattern = Mask->getConstantOperandVal(0);
23907
23908 // Ensure we can double the size of the predicate pattern
23909 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
23910 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
23911 MinSVESize) {
23912 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
23913 PgPattern);
23914 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
23915 MST->getBasePtr(), MST->getOffset(), Mask,
23916 MST->getMemoryVT(), MST->getMemOperand(),
23917 MST->getAddressingMode(),
23918 /*IsTruncating=*/true);
23919 }
23920 }
23921 }
23922 }
23923
23924 if (MST->isTruncatingStore()) {
23925 EVT ValueVT = Value->getValueType(0);
23926 EVT MemVT = MST->getMemoryVT();
23927 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
23928 return SDValue();
23929 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
23930 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
23931 MST->getOffset(), MST->getMask(),
23932 MST->getMemoryVT(), MST->getMemOperand(),
23933 MST->getAddressingMode(), true);
23934 }
23935 }
23936
23937 return SDValue();
23938}
23939
23940/// \return true if part of the index was folded into the Base.
23941static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
23942 SDLoc DL, SelectionDAG &DAG) {
23943 // This function assumes a vector of i64 indices.
23944 EVT IndexVT = Index.getValueType();
23945 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
23946 return false;
23947
23948 // Simplify:
23949 // BasePtr = Ptr
23950 // Index = X + splat(Offset)
23951 // ->
23952 // BasePtr = Ptr + Offset * scale.
23953 // Index = X
23954 if (Index.getOpcode() == ISD::ADD) {
23955 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
23956 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
23957 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
23958 Index = Index.getOperand(0);
23959 return true;
23960 }
23961 }
23962
23963 // Simplify:
23964 // BasePtr = Ptr
23965 // Index = (X + splat(Offset)) << splat(Shift)
23966 // ->
23967 // BasePtr = Ptr + (Offset << Shift) * scale
23968 // Index = X << splat(shift)
23969 if (Index.getOpcode() == ISD::SHL &&
23970 Index.getOperand(0).getOpcode() == ISD::ADD) {
23971 SDValue Add = Index.getOperand(0);
23972 SDValue ShiftOp = Index.getOperand(1);
23973 SDValue OffsetOp = Add.getOperand(1);
23974 if (auto Shift = DAG.getSplatValue(ShiftOp))
23975 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
23976 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
23977 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
23978 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
23979 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
23980 Add.getOperand(0), ShiftOp);
23981 return true;
23982 }
23983 }
23984
23985 return false;
23986}
23987
23988// Analyse the specified address returning true if a more optimal addressing
23989// mode is available. When returning true all parameters are updated to reflect
23990// their recommended values.
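// For example (illustrative), an nxv4i64 index of step_vector(8) whose largest
// in-range offset still fits in 32 bits can be rewritten as an nxv4i32 step
// vector, which legalises more cheaply and addresses the same elements.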
23991static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
23992 SDValue &BasePtr, SDValue &Index,
23993 SelectionDAG &DAG) {
23994 // Try to iteratively fold parts of the index into the base pointer to
23995 // simplify the index as much as possible.
23996 bool Changed = false;
23997 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
23998 Changed = true;
23999
24000 // Only consider element types that are pointer sized as smaller types can
24001 // be easily promoted.
24002 EVT IndexVT = Index.getValueType();
24003 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
24004 return Changed;
24005
24006 // Can indices be trivially shrunk?
24007 EVT DataVT = N->getOperand(1).getValueType();
24008 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it
24009 // will later be re-extended to 64 bits in legalization
24010 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
24011 return Changed;
24012 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
24013 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
24014 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
24015 return true;
24016 }
24017
24018 // Match:
24019 // Index = step(const)
24020 int64_t Stride = 0;
24021 if (Index.getOpcode() == ISD::STEP_VECTOR) {
24022 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
24023 }
24024 // Match:
24025 // Index = step(const) << shift(const)
24026 else if (Index.getOpcode() == ISD::SHL &&
24027 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
24028 SDValue RHS = Index.getOperand(1);
24029 if (auto *Shift =
24030 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
24031 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
24032 Stride = Step << Shift->getZExtValue();
24033 }
24034 }
24035
24036 // Return early because no supported pattern is found.
24037 if (Stride == 0)
24038 return Changed;
24039
24040 if (Stride < std::numeric_limits<int32_t>::min() ||
24041 Stride > std::numeric_limits<int32_t>::max())
24042 return Changed;
24043
24044 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
24045 unsigned MaxVScale =
24046 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
24047 int64_t LastElementOffset =
24048 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
24049
24050 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
24051 LastElementOffset > std::numeric_limits<int32_t>::max())
24052 return Changed;
24053
24054 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
24055 // Stride does not scale explicitly by 'Scale', because it happens in
24056 // the gather/scatter addressing mode.
24057 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride, true));
24058 return true;
24059}
24060
24061static SDValue performMaskedGatherScatterCombine(
24062 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
24063 if (!DCI.isBeforeLegalize())
24064 return SDValue();
24065 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
24066
24067 SDLoc DL(MGS);
24068 SDValue Chain = MGS->getChain();
24069 SDValue Scale = MGS->getScale();
24070 SDValue Index = MGS->getIndex();
24071 SDValue Mask = MGS->getMask();
24072 SDValue BasePtr = MGS->getBasePtr();
24073 ISD::MemIndexType IndexType = MGS->getIndexType();
24074
24075 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
24076 return SDValue();
24077
24078 // Here we catch such cases early and change MGATHER's IndexType to allow
24079 // the use of an Index that's more legalisation friendly.
24080 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
24081 SDValue PassThru = MGT->getPassThru();
24082 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
24083 return DAG.getMaskedGather(
24084 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
24085 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
24086 }
24087 if (auto *MSC = dyn_cast<MaskedScatterSDNode>(MGS)) {
24088 SDValue Data = MSC->getValue();
24089 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
24090 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
24091 DL, Ops, MSC->getMemOperand(), IndexType,
24092 MSC->isTruncatingStore());
24093 }
24094 auto *HG = cast<MaskedHistogramSDNode>(MGS);
24095 SDValue Ops[] = {Chain, HG->getInc(), Mask, BasePtr,
24096 Index, Scale, HG->getIntID()};
24097 return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), HG->getMemoryVT(),
24098 DL, Ops, HG->getMemOperand(), IndexType);
24099}
24100
24101/// Target-specific DAG combine function for NEON load/store intrinsics
24102/// to merge base address updates.
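/// For example (illustrative, registers arbitrary):
///   ld2 { v0.4s, v1.4s }, [x0]
///   add x0, x0, #32
/// is merged into the post-indexed form:
///   ld2 { v0.4s, v1.4s }, [x0], #32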
24103static SDValue performNEONPostLDSTCombine(SDNode *N,
24104 TargetLowering::DAGCombinerInfo &DCI,
24105 SelectionDAG &DAG) {
24106 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24107 return SDValue();
24108
24109 unsigned AddrOpIdx = N->getNumOperands() - 1;
24110 SDValue Addr = N->getOperand(AddrOpIdx);
24111
24112 // Search for a use of the address operand that is an increment.
24113 for (SDUse &Use : Addr->uses()) {
24114 SDNode *User = Use.getUser();
24115 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
24116 continue;
24117
24118 // Check that the add is independent of the load/store. Otherwise, folding
24119 // it would create a cycle.
24120 SmallPtrSet<const SDNode *, 32> Visited;
24121 SmallVector<const SDNode *, 16> Worklist;
24122 Visited.insert(Addr.getNode());
24123 Worklist.push_back(N);
24124 Worklist.push_back(User);
24125 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
24126 SDNode::hasPredecessorHelper(User, Visited, Worklist))
24127 continue;
24128
24129 // Find the new opcode for the updating load/store.
24130 bool IsStore = false;
24131 bool IsLaneOp = false;
24132 bool IsDupOp = false;
24133 unsigned NewOpc = 0;
24134 unsigned NumVecs = 0;
24135 unsigned IntNo = N->getConstantOperandVal(1);
24136 switch (IntNo) {
24137 default: llvm_unreachable("unexpected intrinsic for Neon base update");
24138 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
24139 NumVecs = 2; break;
24140 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
24141 NumVecs = 3; break;
24142 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
24143 NumVecs = 4; break;
24144 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
24145 NumVecs = 2; IsStore = true; break;
24146 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
24147 NumVecs = 3; IsStore = true; break;
24148 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
24149 NumVecs = 4; IsStore = true; break;
24150 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
24151 NumVecs = 2; break;
24152 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
24153 NumVecs = 3; break;
24154 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
24155 NumVecs = 4; break;
24156 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
24157 NumVecs = 2; IsStore = true; break;
24158 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
24159 NumVecs = 3; IsStore = true; break;
24160 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
24161 NumVecs = 4; IsStore = true; break;
24162 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
24163 NumVecs = 2; IsDupOp = true; break;
24164 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
24165 NumVecs = 3; IsDupOp = true; break;
24166 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
24167 NumVecs = 4; IsDupOp = true; break;
24168 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
24169 NumVecs = 2; IsLaneOp = true; break;
24170 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
24171 NumVecs = 3; IsLaneOp = true; break;
24172 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
24173 NumVecs = 4; IsLaneOp = true; break;
24174 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
24175 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
24176 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
24177 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
24178 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
24179 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
24180 }
24181
24182 EVT VecTy;
24183 if (IsStore)
24184 VecTy = N->getOperand(2).getValueType();
24185 else
24186 VecTy = N->getValueType(0);
24187
24188 // If the increment is a constant, it must match the memory ref size.
24189 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
24190 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
24191 uint32_t IncVal = CInc->getZExtValue();
24192 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
24193 if (IsLaneOp || IsDupOp)
24194 NumBytes /= VecTy.getVectorNumElements();
24195 if (IncVal != NumBytes)
24196 continue;
24197 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
24198 }
24199 SmallVector<SDValue, 8> Ops;
24200 Ops.push_back(N->getOperand(0)); // Incoming chain
24201 // Load lane and store have vector list as input.
24202 if (IsLaneOp || IsStore)
24203 for (unsigned i = 2; i < AddrOpIdx; ++i)
24204 Ops.push_back(N->getOperand(i));
24205 Ops.push_back(Addr); // Base register
24206 Ops.push_back(Inc);
24207
24208 // Return Types.
24209 EVT Tys[6];
24210 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
24211 unsigned n;
24212 for (n = 0; n < NumResultVecs; ++n)
24213 Tys[n] = VecTy;
24214 Tys[n++] = MVT::i64; // Type of write back register
24215 Tys[n] = MVT::Other; // Type of the chain
24216 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
24217
24218 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
24219 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
24220 MemInt->getMemoryVT(),
24221 MemInt->getMemOperand());
24222
24223 // Update the uses.
24224 std::vector<SDValue> NewResults;
24225 for (unsigned i = 0; i < NumResultVecs; ++i) {
24226 NewResults.push_back(SDValue(UpdN.getNode(), i));
24227 }
24228 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
24229 DCI.CombineTo(N, NewResults);
24230 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
24231
24232 break;
24233 }
24234 return SDValue();
24235}
24236
24237// Checks to see if the value is the prescribed width and returns information
24238// about its extension mode.
24239static
24240bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
24241 ExtType = ISD::NON_EXTLOAD;
24242 switch(V.getNode()->getOpcode()) {
24243 default:
24244 return false;
24245 case ISD::LOAD: {
24246 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
24247 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
24248 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
24249 ExtType = LoadNode->getExtensionType();
24250 return true;
24251 }
24252 return false;
24253 }
24254 case ISD::AssertSext: {
24255 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
24256 if ((TypeNode->getVT() == MVT::i8 && width == 8)
24257 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
24258 ExtType = ISD::SEXTLOAD;
24259 return true;
24260 }
24261 return false;
24262 }
24263 case ISD::AssertZext: {
24264 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
24265 if ((TypeNode->getVT() == MVT::i8 && width == 8)
24266 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
24267 ExtType = ISD::ZEXTLOAD;
24268 return true;
24269 }
24270 return false;
24271 }
24272 case ISD::Constant:
24273 case ISD::TargetConstant: {
24274 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
24275 1LL << (width - 1);
24276 }
24277 }
24278
24279 return true;
24280}
24281
24282// This function does a whole lot of voodoo to determine if the tests are
24283// equivalent without and with a mask. Essentially what happens is that given a
24284// DAG resembling:
24285//
24286// +-------------+ +-------------+ +-------------+ +-------------+
24287// | Input | | AddConstant | | CompConstant| | CC |
24288// +-------------+ +-------------+ +-------------+ +-------------+
24289// | | | |
24290// V V | +----------+
24291// +-------------+ +----+ | |
24292// | ADD | |0xff| | |
24293// +-------------+ +----+ | |
24294// | | | |
24295// V V | |
24296// +-------------+ | |
24297// | AND | | |
24298// +-------------+ | |
24299// | | |
24300// +-----+ | |
24301// | | |
24302// V V V
24303// +-------------+
24304// | CMP |
24305// +-------------+
24306//
24307// The AND node may be safely removed for some combinations of inputs. In
24308// particular we need to take into account the extension type of the Input,
24309// the exact values of AddConstant, CompConstant, and CC, along with the nominal
24310// width of the input (this can work for any width inputs, the above graph is
24311// specific to 8 bits).
24312//
24313// The specific equations were worked out by generating output tables for each
24314// AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The
24315// problem was simplified by working with 4 bit inputs, which means we only
24316// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
24317// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
24318// patterns present in both extensions (0,7). For every distinct set of
24319// AddConstant and CompConstants bit patterns we can consider the masked and
24320// unmasked versions to be equivalent if the result of this function is true for
24321// all 16 distinct bit patterns of for the current extension type of Input (w0).
24322//
24323// sub w8, w0, w1
24324// and w10, w8, #0x0f
24325// cmp w8, w2
24326// cset w9, AArch64CC
24327// cmp w10, w2
24328// cset w11, AArch64CC
24329// cmp w9, w11
24330// cset w0, eq
24331// ret
24332//
24333 // Since the above function shows when the outputs are equivalent, it defines
24334 // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
24335 // would be expensive to execute during compilation. The equations below were
24336 // written in a test harness that confirmed they gave outputs equivalent to the
24337 // above function for all inputs, so they can be used instead to determine
24338 // whether the removal is legal.
24339 //
24340 // isEquivalentMaskless() is the test for whether the AND can be removed,
24341 // factored out of the DAG recognition because the DAG can take several forms.
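//
// For illustration only (a sketch, not part of the original derivation): take
// width == 8 with a zero-extended Input x in [0, 255], AddConstant == -16,
// CompConstant == 200 and CC == LO/HS. The LO/HS equation below holds
// (AddConstant <= 0, CompConstant >= 0 and 200 <= -16 + 256), and indeed:
//   * for x >= 16, (x - 16) and ((x - 16) & 0xff) are the same value in
//     [0, 239], so the masked and unmasked compares agree;
//   * for x < 16, (x - 16) wraps to a large 32-bit value while
//     ((x - 16) & 0xff) lands in [240, 255]; both are >=u 200 and not <u 200,
//     so the compares agree again.
// Hence the AND is removable for this combination of inputs.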
24342
24343static bool isEquivalentMaskless(unsigned CC, unsigned width,
24344 ISD::LoadExtType ExtType, int AddConstant,
24345 int CompConstant) {
24346 // By being careful about our equations and only writing them in terms of
24347 // symbolic values and well-known constants (0, 1, -1, MaxUInt), we can
24348 // make them generally applicable to all bit widths.
24349 int MaxUInt = (1 << width);
24350
24351 // For the purposes of these comparisons sign extending the type is
24352 // equivalent to zero extending the add and displacing it by half the integer
24353 // width. Provided we are careful and make sure our equations are valid over
24354 // the whole range we can just adjust the input and avoid writing equations
24355 // for sign extended inputs.
24356 if (ExtType == ISD::SEXTLOAD)
24357 AddConstant -= (1 << (width-1));
24358
24359 switch(CC) {
24360 case AArch64CC::LE:
24361 case AArch64CC::GT:
24362 if ((AddConstant == 0) ||
24363 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
24364 (AddConstant >= 0 && CompConstant < 0) ||
24365 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
24366 return true;
24367 break;
24368 case AArch64CC::LT:
24369 case AArch64CC::GE:
24370 if ((AddConstant == 0) ||
24371 (AddConstant >= 0 && CompConstant <= 0) ||
24372 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
24373 return true;
24374 break;
24375 case AArch64CC::HI:
24376 case AArch64CC::LS:
24377 if ((AddConstant >= 0 && CompConstant < 0) ||
24378 (AddConstant <= 0 && CompConstant >= -1 &&
24379 CompConstant < AddConstant + MaxUInt))
24380 return true;
24381 break;
24382 case AArch64CC::PL:
24383 case AArch64CC::MI:
24384 if ((AddConstant == 0) ||
24385 (AddConstant > 0 && CompConstant <= 0) ||
24386 (AddConstant < 0 && CompConstant <= AddConstant))
24387 return true;
24388 break;
24389 case AArch64CC::LO:
24390 case AArch64CC::HS:
24391 if ((AddConstant >= 0 && CompConstant <= 0) ||
24392 (AddConstant <= 0 && CompConstant >= 0 &&
24393 CompConstant <= AddConstant + MaxUInt))
24394 return true;
24395 break;
24396 case AArch64CC::EQ:
24397 case AArch64CC::NE:
24398 if ((AddConstant > 0 && CompConstant < 0) ||
24399 (AddConstant < 0 && CompConstant >= 0 &&
24400 CompConstant < AddConstant + MaxUInt) ||
24401 (AddConstant >= 0 && CompConstant >= 0 &&
24402 CompConstant >= AddConstant) ||
24403 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
24404 return true;
24405 break;
24406 case AArch64CC::VS:
24407 case AArch64CC::VC:
24408 case AArch64CC::AL:
24409 case AArch64CC::NV:
24410 return true;
24411 case AArch64CC::Invalid:
24412 break;
24413 }
24414
24415 return false;
24416}
24417
24418 // (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
24419 // (X & C) <u Pow2 --> ((X & (C & ~(Pow2-1))) == 0)
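//
// For illustration (hypothetical values, assuming an i32 X): with C == 0xff,
//   (X & 0xff) >u 0x0f   -->   ANDS X, #0xf0  and test the NE flag
//   (X & 0xff) <u 0x10   -->   ANDS X, #0xf0  and test the EQ flag
// i.e. only the bits of C above the mask / power-of-two boundary need testing.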
24420 static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
24421 SDNode *AndNode, SelectionDAG &DAG,
24422 unsigned CCIndex, unsigned CmpIndex,
24423 unsigned CC) {
24424 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
24425 if (!SubsC)
24426 return SDValue();
24427
24428 APInt SubsAP = SubsC->getAPIntValue();
24429 if (CC == AArch64CC::HI) {
24430 if (!SubsAP.isMask())
24431 return SDValue();
24432 } else if (CC == AArch64CC::LO) {
24433 if (!SubsAP.isPowerOf2())
24434 return SDValue();
24435 } else
24436 return SDValue();
24437
24438 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
24439 if (!AndC)
24440 return SDValue();
24441
24442 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
24443
24444 SDLoc DL(N);
24445 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
24446 SDValue ANDS = DAG.getNode(
24447 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
24448 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
24449 SDValue AArch64_CC =
24450 DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
24451 N->getOperand(CCIndex)->getValueType(0));
24452
24453 // For now, only performCSELCombine and performBRCONDCombine call this
24454 // function, and both of them pass 2 for CCIndex and 3 for CmpIndex, with 4
24455 // operands. So just initialize the operands directly to simplify the code. If
24456 // some other caller uses a different CCIndex or CmpIndex, this will need to
24457 // be rewritten to build the operand list in a loop.
24458 // TODO: Do we need to assert that the number of operands is 4 here?
24459 assert((CCIndex == 2 && CmpIndex == 3) &&
24460 "Expected CCIndex to be 2 and CmpIndex to be 3.");
24461 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
24462 ANDS.getValue(1)};
24463 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
24464}
24465
24466static
24467 SDValue performCONDCombine(SDNode *N,
24468 TargetLowering::DAGCombinerInfo &DCI,
24469 SelectionDAG &DAG, unsigned CCIndex,
24470 unsigned CmpIndex) {
24471 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
24472 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
24473 unsigned CondOpcode = SubsNode->getOpcode();
24474
24475 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
24476 !SubsNode->hasOneUse())
24477 return SDValue();
24478
24479 // There is a SUBS feeding this condition. Is it fed by a mask we can
24480 // use?
24481
24482 SDNode *AndNode = SubsNode->getOperand(0).getNode();
24483 unsigned MaskBits = 0;
24484
24485 if (AndNode->getOpcode() != ISD::AND)
24486 return SDValue();
24487
24488 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
24489 CmpIndex, CC))
24490 return Val;
24491
24492 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
24493 uint32_t CNV = CN->getZExtValue();
24494 if (CNV == 255)
24495 MaskBits = 8;
24496 else if (CNV == 65535)
24497 MaskBits = 16;
24498 }
24499
24500 if (!MaskBits)
24501 return SDValue();
24502
24503 SDValue AddValue = AndNode->getOperand(0);
24504
24505 if (AddValue.getOpcode() != ISD::ADD)
24506 return SDValue();
24507
24508 // The basic dag structure is correct, grab the inputs and validate them.
24509
24510 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
24511 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
24512 SDValue SubsInputValue = SubsNode->getOperand(1);
24513
24514 // The mask is present and the provenance of all the values is a smaller type,
24515 // so let's see if the mask is superfluous.
24516
24517 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
24518 !isa<ConstantSDNode>(SubsInputValue.getNode()))
24519 return SDValue();
24520
24521 ISD::LoadExtType ExtType;
24522
24523 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
24524 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
24525 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
24526 return SDValue();
24527
24528 if (!isEquivalentMaskless(CC, MaskBits, ExtType,
24529 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
24530 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
24531 return SDValue();
24532
24533 // The AND is not necessary, remove it.
24534
24535 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
24536 SubsNode->getValueType(1));
24537 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
24538
24539 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
24540 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
24541
24542 return SDValue(N, 0);
24543}
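// For illustration (a sketch of the effect of performCONDCombine): a sequence
// such as
//   add w8, w0, #A ; and w9, w8, #0xff ; cmp w9, #C ; b.<cc> ...
// becomes
//   add w8, w0, #A ; cmp w8, #C ; b.<cc> ...
// when checkValueWidth() proves the inputs already fit in 8/16 bits and
// isEquivalentMaskless() approves the (cc, A, C) combination.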
24544
24545// Optimize compare with zero and branch.
24546 static SDValue performBRCONDCombine(SDNode *N,
24547 TargetLowering::DAGCombinerInfo &DCI,
24548 SelectionDAG &DAG) {
24549 MachineFunction &MF = DAG.getMachineFunction();
24550 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
24551 // will not be produced, as they are conditional branch instructions that do
24552 // not set flags.
24553 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
24554 return SDValue();
24555
24556 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
24557 N = NV.getNode();
24558 SDValue Chain = N->getOperand(0);
24559 SDValue Dest = N->getOperand(1);
24560 SDValue CCVal = N->getOperand(2);
24561 SDValue Cmp = N->getOperand(3);
24562
24563 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
24564 unsigned CC = CCVal->getAsZExtVal();
24565 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
24566 return SDValue();
24567
24568 unsigned CmpOpc = Cmp.getOpcode();
24569 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
24570 return SDValue();
24571
24572 // Only attempt folding if there is only one use of the flag and no use of the
24573 // value.
24574 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
24575 return SDValue();
24576
24577 SDValue LHS = Cmp.getOperand(0);
24578 SDValue RHS = Cmp.getOperand(1);
24579
24580 assert(LHS.getValueType() == RHS.getValueType() &&
24581 "Expected the value type to be the same for both operands!");
24582 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
24583 return SDValue();
24584
24585 if (isNullConstant(LHS))
24586 std::swap(LHS, RHS);
24587
24588 if (!isNullConstant(RHS))
24589 return SDValue();
24590
24591 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
24592 LHS.getOpcode() == ISD::SRL)
24593 return SDValue();
24594
24595 // Fold the compare into the branch instruction.
24596 SDValue BR;
24597 if (CC == AArch64CC::EQ)
24598 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
24599 else
24600 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
24601
24602 // Do not add new nodes to DAG combiner worklist.
24603 DCI.CombineTo(N, BR, false);
24604
24605 return SDValue();
24606}
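// For illustration (a sketch of the intent of performBRCONDCombine):
//   subs wzr, w0, #0 ; b.eq .Ltarget   -->   cbz  w0, .Ltarget
//   subs wzr, w0, #0 ; b.ne .Ltarget   -->   cbnz w0, .Ltarget
// assuming the flags produced by the SUBS have no other users and its value
// result is unused.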
24607
24608 static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
24609 unsigned CC = N->getConstantOperandVal(2);
24610 SDValue SUBS = N->getOperand(3);
24611 SDValue Zero, CTTZ;
24612
24613 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
24614 Zero = N->getOperand(0);
24615 CTTZ = N->getOperand(1);
24616 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
24617 Zero = N->getOperand(1);
24618 CTTZ = N->getOperand(0);
24619 } else
24620 return SDValue();
24621
24622 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
24623 (CTTZ.getOpcode() == ISD::TRUNCATE &&
24624 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
24625 return SDValue();
24626
24627 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
24628 "Illegal type in CTTZ folding");
24629
24630 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
24631 return SDValue();
24632
24633 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
24634 ? CTTZ.getOperand(0).getOperand(0)
24635 : CTTZ.getOperand(0);
24636
24637 if (X != SUBS.getOperand(0))
24638 return SDValue();
24639
24640 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
24641 ? CTTZ.getOperand(0).getValueSizeInBits()
24642 : CTTZ.getValueSizeInBits();
24643 SDValue BitWidthMinusOne =
24644 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
24645 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
24646 BitWidthMinusOne);
24647}
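// For illustration (a sketch, assuming a 32-bit X): the fold above turns
//   CSEL 0, (CTTZ X), eq, (SUBS X, 0)   -->   AND (CTTZ X), 31
// because CTTZ of zero yields the bitwidth (32), and 32 & 31 == 0, so masking
// with bitwidth-1 reproduces the "select 0 when X == 0" behaviour without a
// conditional select.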
24648
24649// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
24650// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
24651// Where x and y are constants and x != y
24652
24653// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
24654// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
24655// Where x and y are constants and x != y
24656 static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
24657 SDValue L = Op->getOperand(0);
24658 SDValue R = Op->getOperand(1);
24659 AArch64CC::CondCode OpCC =
24660 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
24661
24662 SDValue OpCmp = Op->getOperand(3);
24663 if (!isCMP(OpCmp))
24664 return SDValue();
24665
24666 SDValue CmpLHS = OpCmp.getOperand(0);
24667 SDValue CmpRHS = OpCmp.getOperand(1);
24668
24669 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
24670 std::swap(CmpLHS, CmpRHS);
24671 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
24672 return SDValue();
24673
24674 SDValue X = CmpLHS->getOperand(0);
24675 SDValue Y = CmpLHS->getOperand(1);
24676 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
24677 return SDValue();
24678 }
24679
24680 // If one of the constants is an opaque constant, the X and Y SDNodes can
24681 // still be different even though the real values are the same. So check the
24682 // APInts here to make sure the code is correct.
24683 ConstantSDNode *CX = cast<ConstantSDNode>(X);
24684 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
24685 if (CX->getAPIntValue() == CY->getAPIntValue())
24686 return SDValue();
24687
24688 AArch64CC::CondCode CC =
24689 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
24690 SDValue Cond = CmpLHS->getOperand(3);
24691
24692 if (CmpRHS == Y)
24693 CC = AArch64CC::getInvertedCondCode(CC);
24694 else if (CmpRHS != X)
24695 return SDValue();
24696
24697 if (OpCC == AArch64CC::NE)
24698 CC = AArch64CC::getInvertedCondCode(CC);
24699 else if (OpCC != AArch64CC::EQ)
24700 return SDValue();
24701
24702 SDLoc DL(Op);
24703 EVT VT = Op->getValueType(0);
24704
24705 SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
24706 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
24707}
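// For illustration (hypothetical constants x = 1, y = 0): given
//   t = CSEL 1, 0, cc2, cond
//   CSEL l, r, EQ, (CMP t, 1)
// the outer compare succeeds exactly when cc2 held, so this folds to
//   CSEL l, r, cc2, cond
// matching the first pattern documented above foldCSELOfCSEL.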
24708
24709// Reassociate the true/false expressions of a CSEL instruction to obtain a
24710// common subexpression with the comparison instruction. For example, change
24711// (CSEL (ADD (ADD x y) -c) f LO (SUBS x c)) to
24712// (CSEL (ADD (SUBS x c) y) f LO (SUBS x c)) such that (SUBS x c) is a common
24713// subexpression.
24714 static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
24715 SDValue SubsNode = N->getOperand(3);
24716 if (SubsNode.getOpcode() != AArch64ISD::SUBS || !SubsNode.hasOneUse())
24717 return SDValue();
24718
24719 SDValue CmpOpToMatch = SubsNode.getOperand(1);
24720 SDValue CmpOpOther = SubsNode.getOperand(0);
24721 EVT VT = N->getValueType(0);
24722
24723 unsigned ExpectedOpcode;
24724 SDValue ExpectedOp;
24725 SDValue SubsOp;
24726 auto *CmpOpConst = dyn_cast<ConstantSDNode>(CmpOpToMatch);
24727 if (CmpOpConst) {
24728 ExpectedOpcode = ISD::ADD;
24729 ExpectedOp =
24730 DAG.getConstant(-CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
24731 CmpOpConst->getValueType(0));
24732 SubsOp = DAG.getConstant(CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
24733 CmpOpConst->getValueType(0));
24734 } else {
24735 ExpectedOpcode = ISD::SUB;
24736 ExpectedOp = CmpOpToMatch;
24737 SubsOp = CmpOpToMatch;
24738 }
24739
24740 // Get the operand that can be reassociated with the SUBS instruction.
24741 auto GetReassociationOp = [&](SDValue Op, SDValue ExpectedOp) {
24742 if (Op.getOpcode() != ExpectedOpcode)
24743 return SDValue();
24744 if (Op.getOperand(0).getOpcode() != ISD::ADD ||
24745 !Op.getOperand(0).hasOneUse())
24746 return SDValue();
24747 SDValue X = Op.getOperand(0).getOperand(0);
24748 SDValue Y = Op.getOperand(0).getOperand(1);
24749 if (X != CmpOpOther)
24750 std::swap(X, Y);
24751 if (X != CmpOpOther)
24752 return SDValue();
24753 if (ExpectedOp != Op.getOperand(1))
24754 return SDValue();
24755 return Y;
24756 };
24757
24758 // Try the reassociation using the given constant and condition code.
24759 auto Fold = [&](AArch64CC::CondCode NewCC, SDValue ExpectedOp,
24760 SDValue SubsOp) {
24761 SDValue TReassocOp = GetReassociationOp(N->getOperand(0), ExpectedOp);
24762 SDValue FReassocOp = GetReassociationOp(N->getOperand(1), ExpectedOp);
24763 if (!TReassocOp && !FReassocOp)
24764 return SDValue();
24765
24766 SDValue NewCmp = DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
24767 DAG.getVTList(VT, MVT_CC), CmpOpOther, SubsOp);
24768
24769 auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) {
24770 if (!ReassocOp)
24771 return N->getOperand(OpNum);
24772 SDValue Res = DAG.getNode(ISD::ADD, SDLoc(N->getOperand(OpNum)), VT,
24773 NewCmp.getValue(0), ReassocOp);
24774 DAG.ReplaceAllUsesWith(N->getOperand(OpNum), Res);
24775 return Res;
24776 };
24777
24778 SDValue TValReassoc = Reassociate(TReassocOp, 0);
24779 SDValue FValReassoc = Reassociate(FReassocOp, 1);
24780 return DAG.getNode(AArch64ISD::CSEL, SDLoc(N), VT, TValReassoc, FValReassoc,
24781 DAG.getConstant(NewCC, SDLoc(N->getOperand(2)), MVT_CC),
24782 NewCmp.getValue(1));
24783 };
24784
24785 auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
24786
24787 // First, try to eliminate the compare instruction by searching for a
24788 // subtraction with the same constant.
24789 if (SDValue R = Fold(CC, ExpectedOp, SubsOp))
24790 return R;
24791
24792 if (!CmpOpConst) {
24793 // Try again with the operands of the SUBS instruction and the condition
24794 // swapped. Due to canonicalization, this only helps for non-constant
24795 // operands of the SUBS instruction.
24796 std::swap(CmpOpToMatch, CmpOpOther);
24797 if (SDValue R = Fold(getSwappedCondition(CC), CmpOpToMatch, CmpOpToMatch))
24798 return R;
24799 return SDValue();
24800 }
24801
24802 if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && !CmpOpConst->isZero())
24803 return SDValue();
24804
24805 // Next, search for a subtraction with a slightly different constant. By
24806 // adjusting the condition code, we can still eliminate the compare
24807 // instruction. Adjusting the constant is only valid if it does not result
24808 // in signed/unsigned wrap for signed/unsigned comparisons, respectively.
24809 // Since such comparisons are trivially true/false, we should not encounter
24810 // them here but check for them nevertheless to be on the safe side.
24811 auto CheckedFold = [&](bool Check, APInt NewCmpConst,
24812 AArch64CC::CondCode NewCC) {
24813 auto ExpectedOp = DAG.getConstant(-NewCmpConst, SDLoc(CmpOpConst),
24814 CmpOpConst->getValueType(0));
24815 auto SubsOp = DAG.getConstant(NewCmpConst, SDLoc(CmpOpConst),
24816 CmpOpConst->getValueType(0));
24817 return Check ? Fold(NewCC, ExpectedOp, SubsOp) : SDValue();
24818 };
24819 switch (CC) {
24820 case AArch64CC::EQ:
24821 case AArch64CC::LS:
24822 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
24823 CmpOpConst->getAPIntValue() + 1, AArch64CC::LO);
24824 case AArch64CC::NE:
24825 case AArch64CC::HI:
24826 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
24827 CmpOpConst->getAPIntValue() + 1, AArch64CC::HS);
24828 case AArch64CC::LO:
24829 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
24830 CmpOpConst->getAPIntValue() - 1, AArch64CC::LS);
24831 case AArch64CC::HS:
24832 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
24833 CmpOpConst->getAPIntValue() - 1, AArch64CC::HI);
24834 case AArch64CC::LT:
24835 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
24836 CmpOpConst->getAPIntValue() - 1, AArch64CC::LE);
24837 case AArch64CC::LE:
24838 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
24839 CmpOpConst->getAPIntValue() + 1, AArch64CC::LT);
24840 case AArch64CC::GT:
24841 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
24842 CmpOpConst->getAPIntValue() + 1, AArch64CC::GE);
24843 case AArch64CC::GE:
24844 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
24845 CmpOpConst->getAPIntValue() - 1, AArch64CC::GT);
24846 default:
24847 return SDValue();
24848 }
24849}
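// For illustration (a sketch of the constant adjustment above): the unsigned
// compare "x <u 10" (LO against 10) decides exactly like "x <=u 9" (LS against
// 9), so a CSEL conditioned on LO with constant 10 can instead be conditioned
// on LS over (SUBS x, 9), letting that SUBS also feed a true/false expression
// of the form (x - 9) + y. The adjustment is only valid when the new constant
// does not wrap.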
24850
24851// Optimize CSEL instructions
24852 static SDValue performCSELCombine(SDNode *N,
24853 TargetLowering::DAGCombinerInfo &DCI,
24854 SelectionDAG &DAG) {
24855 // CSEL x, x, cc -> x
24856 if (N->getOperand(0) == N->getOperand(1))
24857 return N->getOperand(0);
24858
24859 if (SDValue R = foldCSELOfCSEL(N, DAG))
24860 return R;
24861
24862 // Try to reassociate the true/false expressions so that we can do CSE with
24863 // a SUBS instruction used to perform the comparison.
24864 if (SDValue R = reassociateCSELOperandsForCSE(N, DAG))
24865 return R;
24866
24867 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
24868 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
24869 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
24870 return Folded;
24871
24872 // CSEL a, b, cc, SUBS(x, y) -> CSEL a, b, swapped(cc), SUBS(y, x)
24873 // if SUB(y, x) already exists and we can produce a swapped predicate for cc.
24874 SDValue Cond = N->getOperand(3);
24875 if (DCI.isAfterLegalizeDAG() && Cond.getOpcode() == AArch64ISD::SUBS &&
24876 Cond.hasOneUse() && Cond->hasNUsesOfValue(0, 0) &&
24877 DAG.doesNodeExist(ISD::SUB, N->getVTList(),
24878 {Cond.getOperand(1), Cond.getOperand(0)}) &&
24879 !DAG.doesNodeExist(ISD::SUB, N->getVTList(),
24880 {Cond.getOperand(0), Cond.getOperand(1)}) &&
24881 !isNullConstant(Cond.getOperand(1))) {
24882 AArch64CC::CondCode OldCond =
24883 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
24884 AArch64CC::CondCode NewCond = getSwappedCondition(OldCond);
24885 if (NewCond != AArch64CC::AL) {
24886 SDLoc DL(N);
24887 SDValue Sub = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
24888 Cond.getOperand(1), Cond.getOperand(0));
24889 return DAG.getNode(AArch64ISD::CSEL, DL, N->getVTList(), N->getOperand(0),
24890 N->getOperand(1),
24891 DAG.getConstant(NewCond, DL, MVT::i32),
24892 Sub.getValue(1));
24893 }
24894 }
24895
24896 return performCONDCombine(N, DCI, DAG, 2, 3);
24897}
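// For illustration (a sketch of the swapped-SUBS fold above): if the DAG
// already contains (SUB y, x) but not (SUB x, y), then
//   CSEL a, b, LT, (SUBS x, y)   -->   CSEL a, b, GT, (SUBS y, x)
// so the new SUBS can be CSE'd with the existing subtraction.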
24898
24899 // Try to re-use an already extended operand of a vector SetCC feeding an
24900// extended select. Doing so avoids requiring another full extension of the
24901// SET_CC result when lowering the select.
24903 EVT Op0MVT = Op->getOperand(0).getValueType();
24904 if (!Op0MVT.isVector() || Op->use_empty())
24905 return SDValue();
24906
24907 // Make sure that all uses of Op are VSELECTs with result matching types where
24908 // the result type has a larger element type than the SetCC operand.
24909 SDNode *FirstUse = *Op->user_begin();
24910 if (FirstUse->getOpcode() != ISD::VSELECT)
24911 return SDValue();
24912 EVT UseMVT = FirstUse->getValueType(0);
24913 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
24914 return SDValue();
24915 if (any_of(Op->users(), [&UseMVT](const SDNode *N) {
24916 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
24917 }))
24918 return SDValue();
24919
24920 APInt V;
24921 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
24922 return SDValue();
24923
24924 SDLoc DL(Op);
24925 SDValue Op0ExtV;
24926 SDValue Op1ExtV;
24927 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
24928 // Check if the first operand of the SET_CC is already extended. If it is,
24929 // split the SET_CC and re-use the extended version of the operand.
24930 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
24931 Op->getOperand(0));
24932 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
24933 Op->getOperand(0));
24934 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
24935 Op0ExtV = SDValue(Op0SExt, 0);
24936 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
24937 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
24938 Op0ExtV = SDValue(Op0ZExt, 0);
24939 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
24940 } else
24941 return SDValue();
24942
24943 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
24944 Op0ExtV, Op1ExtV, Op->getOperand(2));
24945}
24946
24947static SDValue
24949 SelectionDAG &DAG) {
24950 SDValue Vec = N->getOperand(0);
24951 if (DCI.isBeforeLegalize() &&
24952 Vec.getValueType().getVectorElementType() == MVT::i1 &&
24955 SDLoc DL(N);
24956 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
24957 DAG);
24958 }
24959
24960 return SDValue();
24961}
24962
24965 SelectionDAG &DAG) {
24966 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
24967 SDValue LHS = N->getOperand(0);
24968 SDValue RHS = N->getOperand(1);
24969 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
24970 SDLoc DL(N);
24971 EVT VT = N->getValueType(0);
24972
24973 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
24974 return V;
24975
24976 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
24977 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
24978 LHS->getOpcode() == AArch64ISD::CSEL &&
24979 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
24980 LHS->hasOneUse()) {
24981 // Invert CSEL's condition.
24982 auto OldCond =
24983 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
24984 auto NewCond = getInvertedCondCode(OldCond);
24985
24986 // csel 0, 1, !cond, X
24987 SDValue CSEL =
24988 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
24989 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
24990 LHS.getOperand(3));
24991 return DAG.getZExtOrTrunc(CSEL, DL, VT);
24992 }
24993
24994 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
24995 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
24996 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
24997 LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() &&
24998 LHS->hasOneUse()) {
24999 EVT TstVT = LHS->getValueType(0);
25000 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
25001 // This pattern will be optimized better in emitComparison.
25002 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
25003 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
25004 DAG.getSignedConstant(TstImm, DL, TstVT));
25005 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
25006 }
25007 }
25008
25009 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
25010 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
25011 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
25012 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
25013 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
25014 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
25016 LHS->getOpcode() == ISD::BITCAST) {
25017 EVT ToVT = LHS->getValueType(0);
25018 EVT FromVT = LHS->getOperand(0).getValueType();
25019 if (FromVT.isFixedLengthVector() &&
25020 FromVT.getVectorElementType() == MVT::i1) {
25021 bool IsNull = isNullConstant(RHS);
25022 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
25023 DL, MVT::i1, LHS->getOperand(0));
25024 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
25025 LHS);
25026 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
25027 }
25028 }
25029
25030 // Try to perform the memcmp when the result is tested for [in]equality with 0
25031 if (SDValue V = performOrXorChainCombine(N, DAG))
25032 return V;
25033
25034 return SDValue();
25035}
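// For illustration (hypothetical i32 value x): the SRL rewrite above turns
//   setcc (srl x, 16), 0, ne
// into
//   setcc (and x, 0xffff0000), 0, ne
// which emitComparison can select as a single TST with a logical immediate
// instead of a shift followed by a compare.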
25036
25037 // Replace a flag-setting operator (e.g. ANDS) with the generic version
25038 // (e.g. AND) if the flag is unused.
25041 unsigned GenericOpcode) {
25042 SDLoc DL(N);
25043 SDValue LHS = N->getOperand(0);
25044 SDValue RHS = N->getOperand(1);
25045 EVT VT = N->getValueType(0);
25046
25047 // If the flag result isn't used, convert back to a generic opcode.
25048 if (!N->hasAnyUseOfValue(1)) {
25049 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
25050 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
25051 DL);
25052 }
25053
25054 // Combine identical generic nodes into this node, re-using the result.
25055 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
25056 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
25057 DCI.CombineTo(Generic, SDValue(N, 0));
25058
25059 return SDValue();
25060}
25061
25063 // setcc_merge_zero pred
25064 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
25065 // => extract_subvector (inner setcc_merge_zero)
25066 SDValue Pred = N->getOperand(0);
25067 SDValue LHS = N->getOperand(1);
25068 SDValue RHS = N->getOperand(2);
25069 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
25070
25071 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
25072 LHS->getOpcode() != ISD::SIGN_EXTEND)
25073 return SDValue();
25074
25075 SDValue Extract = LHS->getOperand(0);
25076 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
25077 Extract->getValueType(0) != N->getValueType(0) ||
25078 Extract->getConstantOperandVal(1) != 0)
25079 return SDValue();
25080
25081 SDValue InnerSetCC = Extract->getOperand(0);
25082 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
25083 return SDValue();
25084
25085 // By this point we've effectively got
25086 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
25087 // lanes are already zero then the trunc(sext()) sequence is redundant and we
25088 // can operate on A directly.
25089 SDValue InnerPred = InnerSetCC.getOperand(0);
25090 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
25091 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
25092 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
25093 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
25094 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
25095 return Extract;
25096
25097 return SDValue();
25098}
25099
25100static SDValue
25102 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
25103 "Unexpected opcode!");
25104
25105 SelectionDAG &DAG = DCI.DAG;
25106 SDValue Pred = N->getOperand(0);
25107 SDValue LHS = N->getOperand(1);
25108 SDValue RHS = N->getOperand(2);
25109 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
25110
25111 if (SDValue V = performSetCCPunpkCombine(N, DAG))
25112 return V;
25113
25114 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
25115 LHS->getOpcode() == ISD::SIGN_EXTEND &&
25116 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
25117 // setcc_merge_zero(
25118 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
25119 // => setcc_merge_zero(pred, ...)
25120 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
25121 LHS->getOperand(0)->getOperand(0) == Pred)
25122 return LHS->getOperand(0);
25123
25124 // setcc_merge_zero(
25125 // all_active, extend(nxvNi1 ...), != splat(0))
25126 // -> nxvNi1 ...
25127 if (isAllActivePredicate(DAG, Pred))
25128 return LHS->getOperand(0);
25129
25130 // setcc_merge_zero(
25131 // pred, extend(nxvNi1 ...), != splat(0))
25132 // -> nxvNi1 and(pred, ...)
25133 if (DCI.isAfterLegalizeDAG())
25134 // Do this after legalization to allow more folds on setcc_merge_zero
25135 // to be recognized.
25136 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
25137 LHS->getOperand(0), Pred);
25138 }
25139
25140 return SDValue();
25141}
25142
25143// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
25144// as well as whether the test should be inverted. This code is required to
25145// catch these cases (as opposed to standard dag combines) because
25146// AArch64ISD::TBZ is matched during legalization.
25147static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
25148 SelectionDAG &DAG) {
25149
25150 if (!Op->hasOneUse())
25151 return Op;
25152
25153 // We don't handle undef/constant-fold cases below, as they should have
25154 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
25155 // etc.)
25156
25157 // (tbz (trunc x), b) -> (tbz x, b)
25158 // This case is just here to enable more of the below cases to be caught.
25159 if (Op->getOpcode() == ISD::TRUNCATE &&
25160 Bit < Op->getValueType(0).getSizeInBits()) {
25161 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25162 }
25163
25164 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
25165 if (Op->getOpcode() == ISD::ANY_EXTEND &&
25166 Bit < Op->getOperand(0).getValueSizeInBits()) {
25167 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25168 }
25169
25170 if (Op->getNumOperands() != 2)
25171 return Op;
25172
25173 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
25174 if (!C)
25175 return Op;
25176
25177 switch (Op->getOpcode()) {
25178 default:
25179 return Op;
25180
25181 // (tbz (and x, m), b) -> (tbz x, b)
25182 case ISD::AND:
25183 if ((C->getZExtValue() >> Bit) & 1)
25184 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25185 return Op;
25186
25187 // (tbz (shl x, c), b) -> (tbz x, b-c)
25188 case ISD::SHL:
25189 if (C->getZExtValue() <= Bit &&
25190 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
25191 Bit = Bit - C->getZExtValue();
25192 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25193 }
25194 return Op;
25195
25196 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
25197 case ISD::SRA:
25198 Bit = Bit + C->getZExtValue();
25199 if (Bit >= Op->getValueType(0).getSizeInBits())
25200 Bit = Op->getValueType(0).getSizeInBits() - 1;
25201 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25202
25203 // (tbz (srl x, c), b) -> (tbz x, b+c)
25204 case ISD::SRL:
25205 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
25206 Bit = Bit + C->getZExtValue();
25207 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25208 }
25209 return Op;
25210
25211 // (tbz (xor x, -1), b) -> (tbnz x, b)
25212 case ISD::XOR:
25213 if ((C->getZExtValue() >> Bit) & 1)
25214 Invert = !Invert;
25215 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
25216 }
25217}
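// For illustration (a sketch): the rewrites in getTestBitOperand compose, e.g.
//   tbz (and (srl x, 3), 0x4), #2   -->   tbz (srl x, 3), #2   -->   tbz x, #5
// and an XOR whose constant has the tested bit set flips the polarity:
//   tbz (xor x, -1), #b             -->   tbnz x, #b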
25218
25219// Optimize test single bit zero/non-zero and branch.
25222 SelectionDAG &DAG) {
25223 unsigned Bit = N->getConstantOperandVal(2);
25224 bool Invert = false;
25225 SDValue TestSrc = N->getOperand(1);
25226 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
25227
25228 if (TestSrc == NewTestSrc)
25229 return SDValue();
25230
25231 unsigned NewOpc = N->getOpcode();
25232 if (Invert) {
25233 if (NewOpc == AArch64ISD::TBZ)
25234 NewOpc = AArch64ISD::TBNZ;
25235 else {
25236 assert(NewOpc == AArch64ISD::TBNZ);
25237 NewOpc = AArch64ISD::TBZ;
25238 }
25239 }
25240
25241 SDLoc DL(N);
25242 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
25243 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
25244}
25245
25246// Swap vselect operands where it may allow a predicated operation to achieve
25247// the `sel`.
25248//
25249// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
25250// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
25252 auto SelectA = N->getOperand(1);
25253 auto SelectB = N->getOperand(2);
25254 auto NTy = N->getValueType(0);
25255
25256 if (!NTy.isScalableVector())
25257 return SDValue();
25258 SDValue SetCC = N->getOperand(0);
25259 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
25260 return SDValue();
25261
25262 switch (SelectB.getOpcode()) {
25263 default:
25264 return SDValue();
25265 case ISD::FMUL:
25266 case ISD::FSUB:
25267 case ISD::FADD:
25268 break;
25269 }
25270 if (SelectA != SelectB.getOperand(0))
25271 return SDValue();
25272
25273 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
25274 ISD::CondCode InverseCC =
25275 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
25276 auto InverseSetCC =
25277 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
25278 SetCC.getOperand(1), InverseCC);
25279
25280 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
25281 {InverseSetCC, SelectB, SelectA});
25282}
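// For illustration (a sketch, assuming scalable vectors): trySwapVSelectOperands
// rewrites
//   vselect (setcc a, b, SETLT), a, (fadd a, c)
// into
//   vselect (setcc a, b, SETGE), (fadd a, c), a
// so the FADD can later be selected as a predicated FADD that merges "a" into
// the inactive lanes.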
25283
25284// vselect (v1i1 setcc) ->
25285// vselect (v1iXX setcc) (XX is the size of the compared operand type)
25286// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
25287// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
25288// such VSELECT.
25290 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
25291 return SwapResult;
25292
25293 SDValue N0 = N->getOperand(0);
25294 EVT CCVT = N0.getValueType();
25295
25296 if (isAllActivePredicate(DAG, N0))
25297 return N->getOperand(1);
25298
25299 if (isAllInactivePredicate(N0))
25300 return N->getOperand(2);
25301
25302 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
25303 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
25304 // supported types.
25305 SDValue SetCC = N->getOperand(0);
25306 if (SetCC.getOpcode() == ISD::SETCC &&
25307 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
25308 SDValue CmpLHS = SetCC.getOperand(0);
25309 EVT VT = CmpLHS.getValueType();
25310 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
25311 SDNode *SplatLHS = N->getOperand(1).getNode();
25312 SDNode *SplatRHS = N->getOperand(2).getNode();
25313 APInt SplatLHSVal;
25314 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
25315 VT.isSimple() &&
25316 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
25317 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
25318 VT.getSimpleVT().SimpleTy) &&
25319 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
25320 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
25322 unsigned NumElts = VT.getVectorNumElements();
25324 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
25325 VT.getScalarType()));
25326 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
25327
25328 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
25329 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
25330 return Or;
25331 }
25332 }
25333
25334 EVT CmpVT = N0.getOperand(0).getValueType();
25335 if (N0.getOpcode() != ISD::SETCC ||
25337 CCVT.getVectorElementType() != MVT::i1 ||
25339 return SDValue();
25340
25341 EVT ResVT = N->getValueType(0);
25342 // Only combine when the result type is of the same size as the compared
25343 // operands.
25344 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
25345 return SDValue();
25346
25347 SDValue IfTrue = N->getOperand(1);
25348 SDValue IfFalse = N->getOperand(2);
25349 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
25350 N0.getOperand(0), N0.getOperand(1),
25351 cast<CondCodeSDNode>(N0.getOperand(2))->get());
25352 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
25353 IfTrue, IfFalse);
25354}
25355
25356/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
25357/// the compare-mask instructions rather than going via NZCV, even if LHS and
25358/// RHS are really scalar. This replaces any scalar setcc in the above pattern
25359/// with a vector one followed by a DUP shuffle on the result.
25362 SelectionDAG &DAG = DCI.DAG;
25363 SDValue N0 = N->getOperand(0);
25364 EVT ResVT = N->getValueType(0);
25365
25366 if (N0.getOpcode() != ISD::SETCC)
25367 return SDValue();
25368
25369 if (ResVT.isScalableVT())
25370 return SDValue();
25371
25372 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
25373 // scalar SetCCResultType. We also don't expect vectors, because we assume
25374 // that selects fed by vector SETCCs are canonicalized to VSELECT.
25375 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
25376 "Scalar-SETCC feeding SELECT has unexpected result type!");
25377
25378 // If NumMaskElts == 0, the comparison is larger than the select result. The
25379 // largest real NEON comparison is 64 bits per lane, which means the result is
25380 // at most 32 bits and an illegal vector. Just bail out for now.
25381 EVT SrcVT = N0.getOperand(0).getValueType();
25382
25383 // Don't try to do this optimization when the setcc itself has i1 operands.
25384 // There are no legal vectors of i1, so this would be pointless. v1f16 is
25385 // ruled out to prevent the creation of setcc that need to be scalarized.
25386 if (SrcVT == MVT::i1 ||
25387 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
25388 return SDValue();
25389
25390 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
25391 if (!ResVT.isVector() || NumMaskElts == 0)
25392 return SDValue();
25393
25394 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
25395 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
25396
25397 // Also bail out if the vector CCVT isn't the same size as ResVT.
25398 // This can happen if the SETCC operand size doesn't divide the ResVT size
25399 // (e.g., f64 vs v3f32).
25400 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
25401 return SDValue();
25402
25403 // Make sure we didn't create illegal types, if we're not supposed to.
25404 assert(DCI.isBeforeLegalize() ||
25405 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
25406
25407 // First perform a vector comparison, where lane 0 is the one we're interested
25408 // in.
25409 SDLoc DL(N0);
25410 SDValue LHS =
25411 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
25412 SDValue RHS =
25413 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
25414 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
25415
25416 // Now duplicate the comparison mask we want across all other lanes.
25417 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
25418 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
25419 Mask = DAG.getNode(ISD::BITCAST, DL,
25420 ResVT.changeVectorElementTypeToInteger(), Mask);
25421
25422 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
25423}
25424
25427 EVT VT = N->getValueType(0);
25428 SDLoc DL(N);
25429 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
25430 // 128bit vector version.
25431 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
25433 SmallVector<SDValue> Ops(N->ops());
25434 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
25435 DCI.DAG.getVTList(LVT), Ops)) {
25436 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
25437 DCI.DAG.getConstant(0, DL, MVT::i64));
25438 }
25439 }
25440
25441 if (N->getOpcode() == AArch64ISD::DUP) {
25442 if (DCI.isAfterLegalizeDAG()) {
25443 // If scalar dup's operand is extract_vector_elt, try to combine them into
25444 // duplane. For example,
25445 //
25446 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
25447 // t18: v4i32 = AArch64ISD::DUP t21
25448 // ==>
25449 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
25450 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
25451 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
25452 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
25453 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
25454 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
25455 EXTRACT_VEC_ELT.getOperand(1));
25456 }
25457 }
25458 }
25459
25460 return performPostLD1Combine(N, DCI, false);
25461 }
25462
25463 return SDValue();
25464}
25465
25466/// Get rid of unnecessary NVCASTs (that don't change the type).
25468 if (N->getValueType(0) == N->getOperand(0).getValueType())
25469 return N->getOperand(0);
25470 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
25471 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
25472 N->getOperand(0).getOperand(0));
25473
25474 return SDValue();
25475}
25476
25477// If all users of the globaladdr are of the form (globaladdr + constant), find
25478// the smallest constant, fold it into the globaladdr's offset and rewrite the
25479// globaladdr as (globaladdr + constant) - constant.
25481 const AArch64Subtarget *Subtarget,
25482 const TargetMachine &TM) {
25483 auto *GN = cast<GlobalAddressSDNode>(N);
25484 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
25486 return SDValue();
25487
25488 uint64_t MinOffset = -1ull;
25489 for (SDNode *N : GN->users()) {
25490 if (N->getOpcode() != ISD::ADD)
25491 return SDValue();
25492 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
25493 if (!C)
25494 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
25495 if (!C)
25496 return SDValue();
25497 MinOffset = std::min(MinOffset, C->getZExtValue());
25498 }
25499 uint64_t Offset = MinOffset + GN->getOffset();
25500
25501 // Require that the new offset is larger than the existing one. Otherwise, we
25502 // can end up oscillating between two possible DAGs, for example,
25503 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
25504 if (Offset <= uint64_t(GN->getOffset()))
25505 return SDValue();
25506
25507 // Check whether folding this offset is legal. It must not go out of bounds of
25508 // the referenced object to avoid violating the code model, and must be
25509 // smaller than 2^20 because this is the largest offset expressible in all
25510 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
25511 // stores an immediate signed 21 bit offset.)
25512 //
25513 // This check also prevents us from folding negative offsets, which will end
25514 // up being treated in the same way as large positive ones. They could also
25515 // cause code model violations, and aren't really common enough to matter.
25516 if (Offset >= (1 << 20))
25517 return SDValue();
25518
25519 const GlobalValue *GV = GN->getGlobal();
25520 Type *T = GV->getValueType();
25521 if (!T->isSized() ||
25523 return SDValue();
25524
25525 SDLoc DL(GN);
25526 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
25527 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
25528 DAG.getConstant(MinOffset, DL, MVT::i64));
25529}
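// For illustration (hypothetical global g with two users, g+16 and g+40):
// MinOffset is 16, so the globaladdr is rewritten as (globaladdr g, #16) - 16.
// The g+16 user then simplifies to the offset globaladdr itself and the g+40
// user to (globaladdr g, #16) + 24, folding the smallest constant into the
// relocation.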
25530
25532 const AArch64Subtarget *Subtarget) {
25533 SDValue BR = N->getOperand(0);
25534 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
25535 !BR.getValueType().isScalarInteger())
25536 return SDValue();
25537
25538 SDLoc DL(N);
25539 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
25540}
25541
25542 // Turns the vector of indices into a vector of byte offsets by scaling Offset
25543// by (BitWidth / 8).
25545 SDLoc DL, unsigned BitWidth) {
25546 assert(Offset.getValueType().isScalableVector() &&
25547 "This method is only for scalable vectors of offsets");
25548
25549 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
25550 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
25551
25552 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
25553}
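// For illustration (a sketch): for 32-bit elements, BitWidth / 8 == 4, so the
// index vector is shifted left by 2; e.g. indices <0, 1, 2, 3> become byte
// offsets <0, 4, 8, 12>.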
25554
25555/// Check if the value of \p OffsetInBytes can be used as an immediate for
25556/// the gather load/prefetch and scatter store instructions with vector base and
25557/// immediate offset addressing mode:
25558///
25559/// [<Zn>.[S|D]{, #<imm>}]
25560///
25561/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
25562inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
25563 unsigned ScalarSizeInBytes) {
25564 // The immediate is not a multiple of the scalar size.
25565 if (OffsetInBytes % ScalarSizeInBytes)
25566 return false;
25567
25568 // The immediate is out of range.
25569 if (OffsetInBytes / ScalarSizeInBytes > 31)
25570 return false;
25571
25572 return true;
25573}
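// For illustration (a sketch): for 64-bit elements (ScalarSizeInBytes == 8) the
// valid immediates are 0, 8, 16, ..., 248, i.e. 8 * k for k in [0, 31]; an
// offset of 12 fails the multiple-of-size check and 256 fails the range check.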
25574
25575/// Check if the value of \p Offset represents a valid immediate for the SVE
25576 /// gather load/prefetch and scatter store instructions with vector base and
25577/// immediate offset addressing mode:
25578///
25579/// [<Zn>.[S|D]{, #<imm>}]
25580///
25581/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
25583 unsigned ScalarSizeInBytes) {
25584 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
25585 return OffsetConst && isValidImmForSVEVecImmAddrMode(
25586 OffsetConst->getZExtValue(), ScalarSizeInBytes);
25587}
25588
25590 unsigned Opcode,
25591 bool OnlyPackedOffsets = true) {
25592 const SDValue Src = N->getOperand(2);
25593 const EVT SrcVT = Src->getValueType(0);
25594 assert(SrcVT.isScalableVector() &&
25595 "Scatter stores are only possible for SVE vectors");
25596
25597 SDLoc DL(N);
25598 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
25599
25600 // Make sure that source data will fit into an SVE register
25602 return SDValue();
25603
25604 // For FPs, ACLE only supports _packed_ single and double precision types.
25605 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
25606 if (SrcElVT.isFloatingPoint())
25607 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
25608 ((Opcode != AArch64ISD::SST1Q_PRED &&
25609 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
25610 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
25611 return SDValue();
25612
25613 // Depending on the addressing mode, this is either a pointer or a vector of
25614 // pointers (that fits into one register)
25615 SDValue Base = N->getOperand(4);
25616 // Depending on the addressing mode, this is either a single offset or a
25617 // vector of offsets (that fits into one register)
25618 SDValue Offset = N->getOperand(5);
25619
25620 // For "scalar + vector of indices", just scale the indices. This only
25621 // applies to non-temporal scatters because there's no instruction that takes
25622 // indices.
25623 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
25624 Offset =
25626 Opcode = AArch64ISD::SSTNT1_PRED;
25627 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
25628 Offset =
25630 Opcode = AArch64ISD::SST1Q_PRED;
25631 }
25632
25633 // In the case of non-temporal scatter stores there's only one SVE instruction
25634 // per data-size: "scalar + vector", i.e.
25635 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
25636 // Since we do have intrinsics that allow the arguments to be in a different
25637 // order, we may need to swap them to match the spec.
25638 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
25639 Offset.getValueType().isVector())
25641
25642 // SST1_IMM requires that the offset is an immediate that is:
25643 // * a multiple of #SizeInBytes,
25644 // * in the range [0, 31 x #SizeInBytes],
25645 // where #SizeInBytes is the size in bytes of the stored items. For
25646 // immediates outside that range and non-immediate scalar offsets use SST1 or
25647 // SST1_UXTW instead.
25648 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
25650 SrcVT.getScalarSizeInBits() / 8)) {
25651 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
25653 else
25654 Opcode = AArch64ISD::SST1_PRED;
25655
25657 }
25658 }
25659
25660 auto &TLI = DAG.getTargetLoweringInfo();
25661 if (!TLI.isTypeLegal(Base.getValueType()))
25662 return SDValue();
25663
25664 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
25665 // vectors. These are implicitly sign- (sxtw) or zero- (uxtw) extended to
25666 // nxv2i64. Legalize accordingly.
25667 if (!OnlyPackedOffsets &&
25668 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
25669 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
25670
25671 if (!TLI.isTypeLegal(Offset.getValueType()))
25672 return SDValue();
25673
25674 // Source value type that is representable in hardware
25675 EVT HwSrcVt = getSVEContainerType(SrcVT);
25676
25677 // Keep the original type of the input data to store - this is needed to be
25678 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
25679 // FP values we want the integer equivalent, so just use HwSrcVt.
25680 SDValue InputVT = DAG.getValueType(SrcVT);
25681 if (SrcVT.isFloatingPoint())
25682 InputVT = DAG.getValueType(HwSrcVt);
25683
25684 SDVTList VTs = DAG.getVTList(MVT::Other);
25685 SDValue SrcNew;
25686
25687 if (Src.getValueType().isFloatingPoint())
25688 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
25689 else
25690 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
25691
25692 SDValue Ops[] = {N->getOperand(0), // Chain
25693 SrcNew,
25694 N->getOperand(3), // Pg
25695 Base,
25696 Offset,
25697 InputVT};
25698
25699 return DAG.getNode(Opcode, DL, VTs, Ops);
25700}
25701
25703 unsigned Opcode,
25704 bool OnlyPackedOffsets = true) {
25705 const EVT RetVT = N->getValueType(0);
25706 assert(RetVT.isScalableVector() &&
25707 "Gather loads are only possible for SVE vectors");
25708
25709 SDLoc DL(N);
25710
25711 // Make sure that the loaded data will fit into an SVE register
25713 return SDValue();
25714
25715 // Depending on the addressing mode, this is either a pointer or a vector of
25716 // pointers (that fits into one register)
25717 SDValue Base = N->getOperand(3);
25718 // Depending on the addressing mode, this is either a single offset or a
25719 // vector of offsets (that fits into one register)
25720 SDValue Offset = N->getOperand(4);
25721
25722 // For "scalar + vector of indices", scale the indices to obtain unscaled
25723 // offsets. This applies to non-temporal and quadword gathers, which do not
25724 // have an addressing mode with scaled offset.
25727 RetVT.getScalarSizeInBits());
25729 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
25731 RetVT.getScalarSizeInBits());
25733 }
25734
25735 // In the case of non-temporal gather loads and quadword gather loads there's
25736 // only one addressing mode: "vector + scalar", e.g.
25737 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
25738 // Since we do have intrinsics that allow the arguments to be in a different
25739 // order, we may need to swap them to match the spec.
25740 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
25741 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
25742 Offset.getValueType().isVector())
25744
25745 // GLD{FF}1_IMM requires that the offset is an immediate that is:
25746 // * a multiple of #SizeInBytes,
25747 // * in the range [0, 31 x #SizeInBytes],
25748 // where #SizeInBytes is the size in bytes of the loaded items. For
25749 // immediates outside that range and non-immediate scalar offsets use
25750 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
25751 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
25754 RetVT.getScalarSizeInBits() / 8)) {
25755 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
25756 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
25759 else
25760 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
25763
25765 }
25766 }
25767
25768 auto &TLI = DAG.getTargetLoweringInfo();
25769 if (!TLI.isTypeLegal(Base.getValueType()))
25770 return SDValue();
25771
25772 // Some gather load variants allow unpacked offsets, but only as nxv2i32
25773 // vectors. These are implicitly sign- (sxtw) or zero- (uxtw) extended to
25774 // nxv2i64. Legalize accordingly.
25775 if (!OnlyPackedOffsets &&
25776 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
25777 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
25778
25779 // Return value type that is representable in hardware
25780 EVT HwRetVt = getSVEContainerType(RetVT);
25781
25782 // Keep the original output value type around - this is needed to be able to
25783 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
25784 // values we want the integer equivalent, so just use HwRetVT.
25785 SDValue OutVT = DAG.getValueType(RetVT);
25786 if (RetVT.isFloatingPoint())
25787 OutVT = DAG.getValueType(HwRetVt);
25788
25789 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
25790 SDValue Ops[] = {N->getOperand(0), // Chain
25791 N->getOperand(2), // Pg
25792 Base, Offset, OutVT};
25793
25794 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
25795 SDValue LoadChain = SDValue(Load.getNode(), 1);
25796
25797 if (RetVT.isInteger() && (RetVT != HwRetVt))
25798 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
25799
25800 // If the original return value was FP, bitcast accordingly. Doing it here
25801 // means that we can avoid adding TableGen patterns for FPs.
25802 if (RetVT.isFloatingPoint())
25803 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
25804
25805 return DAG.getMergeValues({Load, LoadChain}, DL);
25806}
25807
25808static SDValue
25810 SelectionDAG &DAG) {
25811 SDLoc DL(N);
25812 SDValue Src = N->getOperand(0);
25813 unsigned Opc = Src->getOpcode();
25814
25815 // Sign extend of an unsigned unpack -> signed unpack
25816 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
25817
25818 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
25820
25821 // Push the sign extend to the operand of the unpack
25822 // This is necessary where, for example, the operand of the unpack
25823 // is another unpack:
25824 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
25825 // ->
25826 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
25827 // ->
25828 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
25829 SDValue ExtOp = Src->getOperand(0);
25830 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
25831 EVT EltTy = VT.getVectorElementType();
25832 (void)EltTy;
25833
25834 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
25835 "Sign extending from an invalid type");
25836
25837 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
25838
25840 ExtOp, DAG.getValueType(ExtVT));
25841
25842 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
25843 }
25844
25845 if (DCI.isBeforeLegalizeOps())
25846 return SDValue();
25847
25849 return SDValue();
25850
25851 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
25852 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
25853 unsigned NewOpc;
25854 unsigned MemVTOpNum = 4;
25855 switch (Opc) {
25858 MemVTOpNum = 3;
25859 break;
25862 MemVTOpNum = 3;
25863 break;
25866 MemVTOpNum = 3;
25867 break;
25870 break;
25873 break;
25876 break;
25879 break;
25882 break;
25885 break;
25888 break;
25891 break;
25894 break;
25897 break;
25900 break;
25903 break;
25906 break;
25909 break;
25912 break;
25913 default:
25914 return SDValue();
25915 }
25916
25917 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
25918 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
25919
25920 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
25921 return SDValue();
25922
25923 EVT DstVT = N->getValueType(0);
25924 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
25925
25926 SmallVector<SDValue, 5> Ops;
25927 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
25928 Ops.push_back(Src->getOperand(I));
25929
25930 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
25931 DCI.CombineTo(N, ExtLoad);
25932 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
25933
25934 // Return N so it doesn't get rechecked
25935 return SDValue(N, 0);
25936}
25937
25938/// Legalize the gather prefetch (scalar + vector addressing mode) when the
25939/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
25940/// != nxv2i32) do not need legalization.
25942 const unsigned OffsetPos = 4;
25943 SDValue Offset = N->getOperand(OffsetPos);
25944
25945 // Not an unpacked vector, bail out.
25946 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
25947 return SDValue();
25948
25949 // Extend the unpacked offset vector to 64-bit lanes.
25950 SDLoc DL(N);
25951 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
25952 SmallVector<SDValue, 5> Ops(N->ops());
25953 // Replace the offset operand with the 64-bit one.
25954 Ops[OffsetPos] = Offset;
25955
25956 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
25957}
25958
25959/// Combines a node carrying the intrinsic
25960/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
25961/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
25962/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
25963/// sve gather prefetch instruction with vector plus immediate addressing mode.
25964static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
25965 unsigned ScalarSizeInBytes) {
25966 const unsigned ImmPos = 4, OffsetPos = 3;
25967 // No need to combine the node if the immediate is valid...
25968 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
25969 return SDValue();
25970
25971 // ...otherwise swap the offset base with the offset...
25972 SmallVector<SDValue, 5> Ops(N->ops());
25973 std::swap(Ops[ImmPos], Ops[OffsetPos]);
25974 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
25975 // `aarch64_sve_prfb_gather_uxtw_index`.
25976 SDLoc DL(N);
25977 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
25978 MVT::i64);
25979
25980 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
25981}
25982
25983// Return true if the vector operation can guarantee only the first lane of its
25984// result contains data, with all bits in other lanes set to zero.
25986 switch (Op.getOpcode()) {
25987 default:
25988 return false;
26004 return true;
26005 }
26006}
26007
26009 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
26010 SDValue InsertVec = N->getOperand(0);
26011 SDValue InsertElt = N->getOperand(1);
26012 SDValue InsertIdx = N->getOperand(2);
26013
26014 // We only care about inserts into the first element...
26015 if (!isNullConstant(InsertIdx))
26016 return SDValue();
26017 // ...of a zero'd vector...
26019 return SDValue();
26020 // ...where the inserted data was previously extracted...
26021 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
26022 return SDValue();
26023
26024 SDValue ExtractVec = InsertElt.getOperand(0);
26025 SDValue ExtractIdx = InsertElt.getOperand(1);
26026
26027 // ...from the first element of a vector.
26028 if (!isNullConstant(ExtractIdx))
26029 return SDValue();
26030
26031 // If we get here we are effectively trying to zero lanes 1-N of a vector.
26032
26033 // Ensure there's no type conversion going on.
26034 if (N->getValueType(0) != ExtractVec.getValueType())
26035 return SDValue();
26036
26037 if (!isLanes1toNKnownZero(ExtractVec))
26038 return SDValue();
26039
26040 // The explicit zeroing is redundant.
26041 return ExtractVec;
26042}
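// A minimal example of the redundant pattern removed above (types illustrative):
//   t1: v4i32 = <op whose lanes 1-3 are known zero>
//   t2: i32   = extract_vector_elt t1, Constant:i64<0>
//   t3: v4i32 = insert_vector_elt <zeroed vector>, t2, Constant:i64<0>
// Here t3 can be replaced by t1 directly.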
26043
26044static SDValue
26047 return Res;
26048
26049 return performPostLD1Combine(N, DCI, true);
26050}
26051
26054 const AArch64Subtarget *Subtarget) {
26055 SDValue N0 = N->getOperand(0);
26056 EVT VT = N->getValueType(0);
26057
26058 // If this is fp_round(fpextend), don't fold it; allow ourselves to be folded instead.
26059 if (N->hasOneUse() && N->user_begin()->getOpcode() == ISD::FP_ROUND)
26060 return SDValue();
26061
26062 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
26063 EVT EltVT = VT.getVectorElementType();
26064 return EltVT == MVT::f32 || EltVT == MVT::f64;
26065 };
26066
26067 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
26068 // We purposefully don't care about legality of the nodes here as we know
26069 // they can be split down into something legal.
26070 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
26071 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
26072 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
26073 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
26074 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
26075 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
26076 LN0->getChain(), LN0->getBasePtr(),
26077 N0.getValueType(), LN0->getMemOperand());
26078 DCI.CombineTo(N, ExtLoad);
26079 DCI.CombineTo(
26080 N0.getNode(),
26081 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
26082 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
26083 ExtLoad.getValue(1));
26084 return SDValue(N, 0); // Return N so it doesn't get rechecked!
26085 }
26086
26087 return SDValue();
26088}
26089
26091 const AArch64Subtarget *Subtarget) {
26092 EVT VT = N->getValueType(0);
26093
26094 // Don't expand for NEON, SVE2 or SME
26095 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
26096 return SDValue();
26097
26098 SDLoc DL(N);
26099
26100 SDValue Mask = N->getOperand(0);
26101 SDValue In1 = N->getOperand(1);
26102 SDValue In2 = N->getOperand(2);
26103
26104 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
26105 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
26106 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
26107 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
26108}
26109
26111 EVT VT = N->getValueType(0);
26112
26113 SDValue Insert = N->getOperand(0);
26114 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
26115 return SDValue();
26116
26117 if (!Insert.getOperand(0).isUndef())
26118 return SDValue();
26119
26120 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
26121 uint64_t IdxDupLane = N->getConstantOperandVal(1);
26122 if (IdxInsert != 0 || IdxDupLane != 0)
26123 return SDValue();
26124
26125 SDValue Bitcast = Insert.getOperand(1);
26126 if (Bitcast.getOpcode() != ISD::BITCAST)
26127 return SDValue();
26128
26129 SDValue Subvec = Bitcast.getOperand(0);
26130 EVT SubvecVT = Subvec.getValueType();
26131 if (!SubvecVT.is128BitVector())
26132 return SDValue();
26133 EVT NewSubvecVT =
26135
26136 SDLoc DL(N);
26137 SDValue NewInsert =
26138 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
26139 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
26140 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
26141 NewInsert, N->getOperand(1));
26142 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
26143}
26144
26145// Try to combine mull with uzp1.
26148 SelectionDAG &DAG) {
26149 if (DCI.isBeforeLegalizeOps())
26150 return SDValue();
26151
26152 SDValue LHS = N->getOperand(0);
26153 SDValue RHS = N->getOperand(1);
26154
26155 SDValue ExtractHigh;
26156 SDValue ExtractLow;
26157 SDValue TruncHigh;
26158 SDValue TruncLow;
26159 SDLoc DL(N);
26160
26161 // Check the operands are trunc and extract_high.
26163 RHS.getOpcode() == ISD::TRUNCATE) {
26164 TruncHigh = RHS;
26165 if (LHS.getOpcode() == ISD::BITCAST)
26166 ExtractHigh = LHS.getOperand(0);
26167 else
26168 ExtractHigh = LHS;
26170 LHS.getOpcode() == ISD::TRUNCATE) {
26171 TruncHigh = LHS;
26172 if (RHS.getOpcode() == ISD::BITCAST)
26173 ExtractHigh = RHS.getOperand(0);
26174 else
26175 ExtractHigh = RHS;
26176 } else
26177 return SDValue();
26178
26179 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
26180 // with uzp1.
26181 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
26182 SDValue TruncHighOp = TruncHigh.getOperand(0);
26183 EVT TruncHighOpVT = TruncHighOp.getValueType();
26184 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
26185 DAG.isSplatValue(TruncHighOp, false))
26186 return SDValue();
26187
26188 // Check there is other extract_high with same source vector.
26189 // For example,
26190 //
26191 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
26192 // t12: v4i16 = truncate t11
26193 // t31: v4i32 = AArch64ISD::SMULL t18, t12
26194 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
26195 // t16: v4i16 = truncate t15
26196 // t30: v4i32 = AArch64ISD::SMULL t23, t1
26197 //
26198 // This DAG combine assumes the two extract_high nodes use the same source
26199 // vector in order to detect the pair of MULLs. If they use different source
26200 // vectors, this code will not work.
26201 // TODO: Should also try to look through a bitcast.
26202 bool HasFoundMULLow = true;
26203 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
26204 if (ExtractHighSrcVec->use_size() != 2)
26205 HasFoundMULLow = false;
26206
26207 // Find ExtractLow.
26208 for (SDNode *User : ExtractHighSrcVec.getNode()->users()) {
26209 if (User == ExtractHigh.getNode())
26210 continue;
26211
26212 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
26214 HasFoundMULLow = false;
26215 break;
26216 }
26217
26218 ExtractLow.setNode(User);
26219 }
26220
26221 if (!ExtractLow || !ExtractLow->hasOneUse())
26222 HasFoundMULLow = false;
26223
26224 // Check ExtractLow's user.
26225 if (HasFoundMULLow) {
26226 SDNode *ExtractLowUser = *ExtractLow.getNode()->user_begin();
26227 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
26228 HasFoundMULLow = false;
26229 } else {
26230 if (ExtractLowUser->getOperand(0) == ExtractLow) {
26231 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
26232 TruncLow = ExtractLowUser->getOperand(1);
26233 else
26234 HasFoundMULLow = false;
26235 } else {
26236 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
26237 TruncLow = ExtractLowUser->getOperand(0);
26238 else
26239 HasFoundMULLow = false;
26240 }
26241 }
26242 }
26243
26244 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
26245 // with uzp1.
26246 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
26247 EVT TruncHighVT = TruncHigh.getValueType();
26248 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
26249 SDValue TruncLowOp =
26250 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
26251 EVT TruncLowOpVT = TruncLowOp.getValueType();
26252 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
26253 DAG.isSplatValue(TruncLowOp, false)))
26254 return SDValue();
26255
26256 // Create uzp1, extract_high and extract_low.
26257 if (TruncHighOpVT != UZP1VT)
26258 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
26259 if (TruncLowOpVT != UZP1VT)
26260 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
26261
26262 SDValue UZP1 =
26263 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
26264 SDValue HighIdxCst =
26265 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
26266 SDValue NewTruncHigh =
26267 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
26268 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
26269
26270 if (HasFoundMULLow) {
26271 EVT TruncLowVT = TruncLow.getValueType();
26272 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
26273 UZP1, ExtractLow.getOperand(1));
26274 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
26275 }
26276
26277 return SDValue(N, 0);
26278}
26279
26282 SelectionDAG &DAG) {
26283 if (SDValue Val =
26285 return Val;
26286
26287 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
26288 return Val;
26289
26290 return SDValue();
26291}
26292
26293static SDValue
26295 SelectionDAG &DAG) {
26296 // Let's do the transform below.
26297 //
26298 // t34: v4i32 = AArch64ISD::UADDLV t2
26299 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
26300 // t7: i64 = zero_extend t35
26301 // t20: v1i64 = scalar_to_vector t7
26302 // ==>
26303 // t34: v4i32 = AArch64ISD::UADDLV t2
26304 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
26305 // t40: v1i64 = AArch64ISD::NVCAST t39
26306 if (DCI.isBeforeLegalizeOps())
26307 return SDValue();
26308
26309 EVT VT = N->getValueType(0);
26310 if (VT != MVT::v1i64)
26311 return SDValue();
26312
26313 SDValue ZEXT = N->getOperand(0);
26314 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
26315 return SDValue();
26316
26317 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
26318 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
26319 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
26320 return SDValue();
26321
26322 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
26323 return SDValue();
26324
26325 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
26326 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
26327 UADDLV.getValueType() != MVT::v4i32 ||
26328 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
26329 return SDValue();
26330
26331 // Let's generate a new sequence with AArch64ISD::NVCAST.
26332 SDLoc DL(N);
26333 SDValue EXTRACT_SUBVEC =
26334 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
26335 DAG.getConstant(0, DL, MVT::i64));
26336 SDValue NVCAST =
26337 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
26338
26339 return NVCAST;
26340}
26341
26343 DAGCombinerInfo &DCI) const {
26344 SelectionDAG &DAG = DCI.DAG;
26345 switch (N->getOpcode()) {
26346 default:
26347 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
26348 break;
26349 case ISD::VECREDUCE_AND:
26350 case ISD::VECREDUCE_OR:
26351 case ISD::VECREDUCE_XOR:
26352 return performVecReduceBitwiseCombine(N, DCI, DAG);
26353 case ISD::ADD:
26354 case ISD::SUB:
26355 return performAddSubCombine(N, DCI);
26356 case ISD::BUILD_VECTOR:
26357 return performBuildVectorCombine(N, DCI, DAG);
26358 case ISD::TRUNCATE:
26359 return performTruncateCombine(N, DAG, DCI);
26360 case AArch64ISD::ANDS:
26361 return performFlagSettingCombine(N, DCI, ISD::AND);
26362 case AArch64ISD::ADC:
26363 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
26364 return R;
26365 return foldADCToCINC(N, DAG);
26366 case AArch64ISD::SBC:
26367 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
26368 case AArch64ISD::ADCS:
26369 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
26370 return R;
26372 case AArch64ISD::SBCS:
26373 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
26374 return R;
26376 case AArch64ISD::BICi: {
26377 APInt DemandedBits =
26378 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
26379 APInt DemandedElts =
26380 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
26381
26383 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
26384 return SDValue();
26385
26386 break;
26387 }
26388 case ISD::XOR:
26389 return performXorCombine(N, DAG, DCI, Subtarget);
26390 case ISD::MUL:
26391 return performMulCombine(N, DAG, DCI, Subtarget);
26392 case ISD::SINT_TO_FP:
26393 case ISD::UINT_TO_FP:
26394 return performIntToFpCombine(N, DAG, DCI, Subtarget);
26395 case ISD::FP_TO_SINT:
26396 case ISD::FP_TO_UINT:
26399 return performFpToIntCombine(N, DAG, DCI, Subtarget);
26400 case ISD::OR:
26401 return performORCombine(N, DCI, Subtarget, *this);
26402 case ISD::AND:
26403 return performANDCombine(N, DCI);
26404 case ISD::FADD:
26405 return performFADDCombine(N, DCI);
26407 return performIntrinsicCombine(N, DCI, Subtarget);
26408 case ISD::ANY_EXTEND:
26409 case ISD::ZERO_EXTEND:
26410 case ISD::SIGN_EXTEND:
26411 return performExtendCombine(N, DCI, DAG);
26413 return performSignExtendInRegCombine(N, DCI, DAG);
26415 return performConcatVectorsCombine(N, DCI, DAG);
26417 return performExtractSubvectorCombine(N, DCI, DAG);
26419 return performInsertSubvectorCombine(N, DCI, DAG);
26420 case ISD::SELECT:
26421 return performSelectCombine(N, DCI);
26422 case ISD::VSELECT:
26423 return performVSelectCombine(N, DCI.DAG);
26424 case ISD::SETCC:
26425 return performSETCCCombine(N, DCI, DAG);
26426 case ISD::LOAD:
26427 return performLOADCombine(N, DCI, DAG, Subtarget);
26428 case ISD::STORE:
26429 return performSTORECombine(N, DCI, DAG, Subtarget);
26430 case ISD::MSTORE:
26431 return performMSTORECombine(N, DCI, DAG, Subtarget);
26432 case ISD::MGATHER:
26433 case ISD::MSCATTER:
26435 return performMaskedGatherScatterCombine(N, DCI, DAG);
26436 case ISD::FP_EXTEND:
26437 return performFPExtendCombine(N, DAG, DCI, Subtarget);
26438 case AArch64ISD::BRCOND:
26439 return performBRCONDCombine(N, DCI, DAG);
26440 case AArch64ISD::TBNZ:
26441 case AArch64ISD::TBZ:
26442 return performTBZCombine(N, DCI, DAG);
26443 case AArch64ISD::CSEL:
26444 return performCSELCombine(N, DCI, DAG);
26445 case AArch64ISD::DUP:
26450 return performDUPCombine(N, DCI);
26452 return performDupLane128Combine(N, DAG);
26453 case AArch64ISD::NVCAST:
26454 return performNVCASTCombine(N, DAG);
26455 case AArch64ISD::SPLICE:
26456 return performSpliceCombine(N, DAG);
26459 return performUnpackCombine(N, DAG, Subtarget);
26460 case AArch64ISD::UZP1:
26461 case AArch64ISD::UZP2:
26462 return performUzpCombine(N, DAG, Subtarget);
26464 return performSetccMergeZeroCombine(N, DCI);
26481 return performGLD1Combine(N, DAG);
26482 case AArch64ISD::VASHR:
26483 case AArch64ISD::VLSHR:
26484 return performVectorShiftCombine(N, *this, DCI);
26486 return performSunpkloCombine(N, DAG);
26487 case AArch64ISD::BSP:
26488 return performBSPExpandForSVE(N, DAG, Subtarget);
26490 return performInsertVectorEltCombine(N, DCI);
26492 return performExtractVectorEltCombine(N, DCI, Subtarget);
26493 case ISD::VECREDUCE_ADD:
26494 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
26495 case AArch64ISD::UADDV:
26496 return performUADDVCombine(N, DAG);
26497 case AArch64ISD::SMULL:
26498 case AArch64ISD::UMULL:
26499 case AArch64ISD::PMULL:
26500 return performMULLCombine(N, DCI, DAG);
26503 switch (N->getConstantOperandVal(1)) {
26504 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
26505 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
26506 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
26507 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
26508 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
26509 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
26510 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
26511 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
26512 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
26513 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
26514 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
26515 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
26516 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
26517 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
26518 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
26519 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
26521 case Intrinsic::aarch64_neon_ld2:
26522 case Intrinsic::aarch64_neon_ld3:
26523 case Intrinsic::aarch64_neon_ld4:
26524 case Intrinsic::aarch64_neon_ld1x2:
26525 case Intrinsic::aarch64_neon_ld1x3:
26526 case Intrinsic::aarch64_neon_ld1x4:
26527 case Intrinsic::aarch64_neon_ld2lane:
26528 case Intrinsic::aarch64_neon_ld3lane:
26529 case Intrinsic::aarch64_neon_ld4lane:
26530 case Intrinsic::aarch64_neon_ld2r:
26531 case Intrinsic::aarch64_neon_ld3r:
26532 case Intrinsic::aarch64_neon_ld4r:
26533 case Intrinsic::aarch64_neon_st2:
26534 case Intrinsic::aarch64_neon_st3:
26535 case Intrinsic::aarch64_neon_st4:
26536 case Intrinsic::aarch64_neon_st1x2:
26537 case Intrinsic::aarch64_neon_st1x3:
26538 case Intrinsic::aarch64_neon_st1x4:
26539 case Intrinsic::aarch64_neon_st2lane:
26540 case Intrinsic::aarch64_neon_st3lane:
26541 case Intrinsic::aarch64_neon_st4lane:
26542 return performNEONPostLDSTCombine(N, DCI, DAG);
26543 case Intrinsic::aarch64_sve_ldnt1:
26544 return performLDNT1Combine(N, DAG);
26545 case Intrinsic::aarch64_sve_ld1rq:
26546 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
26547 case Intrinsic::aarch64_sve_ld1ro:
26548 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
26549 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
26551 case Intrinsic::aarch64_sve_ldnt1_gather:
26553 case Intrinsic::aarch64_sve_ldnt1_gather_index:
26554 return performGatherLoadCombine(N, DAG,
26556 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
26558 case Intrinsic::aarch64_sve_ld1:
26560 case Intrinsic::aarch64_sve_ldnf1:
26562 case Intrinsic::aarch64_sve_ldff1:
26564 case Intrinsic::aarch64_sve_st1:
26565 return performST1Combine(N, DAG);
26566 case Intrinsic::aarch64_sve_stnt1:
26567 return performSTNT1Combine(N, DAG);
26568 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
26570 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
26572 case Intrinsic::aarch64_sve_stnt1_scatter:
26574 case Intrinsic::aarch64_sve_stnt1_scatter_index:
26576 case Intrinsic::aarch64_sve_ld1_gather:
26578 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
26579 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
26581 case Intrinsic::aarch64_sve_ld1q_gather_index:
26582 return performGatherLoadCombine(N, DAG,
26584 case Intrinsic::aarch64_sve_ld1_gather_index:
26585 return performGatherLoadCombine(N, DAG,
26587 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
26589 /*OnlyPackedOffsets=*/false);
26590 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
26592 /*OnlyPackedOffsets=*/false);
26593 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
26594 return performGatherLoadCombine(N, DAG,
26596 /*OnlyPackedOffsets=*/false);
26597 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
26598 return performGatherLoadCombine(N, DAG,
26600 /*OnlyPackedOffsets=*/false);
26601 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
26603 case Intrinsic::aarch64_sve_ldff1_gather:
26605 case Intrinsic::aarch64_sve_ldff1_gather_index:
26606 return performGatherLoadCombine(N, DAG,
26608 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
26609 return performGatherLoadCombine(N, DAG,
26611 /*OnlyPackedOffsets=*/false);
26612 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
26613 return performGatherLoadCombine(N, DAG,
26615 /*OnlyPackedOffsets=*/false);
26616 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
26617 return performGatherLoadCombine(N, DAG,
26619 /*OnlyPackedOffsets=*/false);
26620 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
26621 return performGatherLoadCombine(N, DAG,
26623 /*OnlyPackedOffsets=*/false);
26624 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
26625 return performGatherLoadCombine(N, DAG,
26627 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
26628 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
26630 case Intrinsic::aarch64_sve_st1q_scatter_index:
26632 case Intrinsic::aarch64_sve_st1_scatter:
26634 case Intrinsic::aarch64_sve_st1_scatter_index:
26636 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
26638 /*OnlyPackedOffsets=*/false);
26639 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
26641 /*OnlyPackedOffsets=*/false);
26642 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
26643 return performScatterStoreCombine(N, DAG,
26645 /*OnlyPackedOffsets=*/false);
26646 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
26647 return performScatterStoreCombine(N, DAG,
26649 /*OnlyPackedOffsets=*/false);
26650 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
26652 case Intrinsic::aarch64_rndr:
26653 case Intrinsic::aarch64_rndrrs: {
26654 unsigned IntrinsicID = N->getConstantOperandVal(1);
26655 auto Register =
26656 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
26657 : AArch64SysReg::RNDRRS);
26658 SDLoc DL(N);
26659 SDValue A = DAG.getNode(
26660 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::i32, MVT::Other),
26661 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
26662 SDValue B = DAG.getNode(
26663 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
26664 DAG.getConstant(0, DL, MVT::i32),
26665 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
26666 return DAG.getMergeValues(
26667 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
26668 }
26669 case Intrinsic::aarch64_sme_ldr_zt:
26671 DAG.getVTList(MVT::Other), N->getOperand(0),
26672 N->getOperand(2), N->getOperand(3));
26673 case Intrinsic::aarch64_sme_str_zt:
26674 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
26675 DAG.getVTList(MVT::Other), N->getOperand(0),
26676 N->getOperand(2), N->getOperand(3));
26677 default:
26678 break;
26679 }
26680 break;
26681 case ISD::GlobalAddress:
26682 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
26683 case ISD::CTLZ:
26684 return performCTLZCombine(N, DAG, Subtarget);
26686 return performScalarToVectorCombine(N, DCI, DAG);
26687 }
26688 return SDValue();
26689}
26690
26691// Check if the return value is used as only a return value, as otherwise
26692// we can't perform a tail-call. In particular, we need to check for
26693// target ISD nodes that are returns and any other "odd" constructs
26694// that the generic analysis code won't necessarily catch.
26695bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
26696 SDValue &Chain) const {
26697 if (N->getNumValues() != 1)
26698 return false;
26699 if (!N->hasNUsesOfValue(1, 0))
26700 return false;
26701
26702 SDValue TCChain = Chain;
26703 SDNode *Copy = *N->user_begin();
26704 if (Copy->getOpcode() == ISD::CopyToReg) {
26705 // If the copy has a glue operand, we conservatively assume it isn't safe to
26706 // perform a tail call.
26707 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
26708 MVT::Glue)
26709 return false;
26710 TCChain = Copy->getOperand(0);
26711 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
26712 return false;
26713
26714 bool HasRet = false;
26715 for (SDNode *Node : Copy->users()) {
26716 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
26717 return false;
26718 HasRet = true;
26719 }
26720
26721 if (!HasRet)
26722 return false;
26723
26724 Chain = TCChain;
26725 return true;
26726}
26727
26728// Return whether an instruction can potentially be optimized to a tail
26729// call. This will cause the optimizers to attempt to move, or duplicate,
26730// return instructions to help enable tail call optimizations for this
26731// instruction.
26732bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
26733 return CI->isTailCall();
26734}
26735
26736bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
26737 Register Offset, bool IsPre,
26738 MachineRegisterInfo &MRI) const {
26739 auto CstOffset = getIConstantVRegVal(Offset, MRI);
26740 if (!CstOffset || CstOffset->isZero())
26741 return false;
26742
26743 // All of the indexed addressing mode instructions take a signed 9 bit
26744 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
26745 // encodes the sign/indexing direction.
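// For example, both extremes of the signed 9-bit range are representable
// (registers illustrative):
//   ldr x0, [x1, #-256]!  (pre-indexed)
//   ldr x0, [x1], #255    (post-indexed)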
26746 return isInt<9>(CstOffset->getSExtValue());
26747}
26748
26749bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
26750 SDValue &Base,
26751 SDValue &Offset,
26752 SelectionDAG &DAG) const {
26753 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
26754 return false;
26755
26756 // Non-null if there is exactly one user of the loaded value (ignoring chain).
26757 SDNode *ValOnlyUser = nullptr;
26758 for (SDUse &U : N->uses()) {
26759 if (U.getResNo() == 1)
26760 continue; // Ignore chain.
26761 if (ValOnlyUser == nullptr)
26762 ValOnlyUser = U.getUser();
26763 else {
26764 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
26765 break;
26766 }
26767 }
26768
26769 auto IsUndefOrZero = [](SDValue V) {
26770 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
26771 };
26772
26773 // If the only user of the value is a scalable vector splat, it is
26774 // preferable to do a replicating load (ld1r*).
26775 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
26776 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
26777 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
26778 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
26779 return false;
26780
26781 Base = Op->getOperand(0);
26782 // All of the indexed addressing mode instructions take a signed
26783 // 9 bit immediate offset.
26784 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
26785 int64_t RHSC = RHS->getSExtValue();
26786 if (Op->getOpcode() == ISD::SUB)
26787 RHSC = -(uint64_t)RHSC;
26788 if (!isInt<9>(RHSC))
26789 return false;
26790 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
26791 // when dealing with subtraction.
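// e.g. for (sub base, 16) the recorded offset is -16, which later selects the
// pre/post-decrement form of the instruction.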
26792 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
26793 return true;
26794 }
26795 return false;
26796}
26797
26798bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
26799 SDValue &Offset,
26801 SelectionDAG &DAG) const {
26802 EVT VT;
26803 SDValue Ptr;
26804 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
26805 VT = LD->getMemoryVT();
26806 Ptr = LD->getBasePtr();
26807 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
26808 VT = ST->getMemoryVT();
26809 Ptr = ST->getBasePtr();
26810 } else
26811 return false;
26812
26813 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
26814 return false;
26815 AM = ISD::PRE_INC;
26816 return true;
26817}
26818
26819bool AArch64TargetLowering::getPostIndexedAddressParts(
26821 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
26822 EVT VT;
26823 SDValue Ptr;
26824 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
26825 VT = LD->getMemoryVT();
26826 Ptr = LD->getBasePtr();
26827 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
26828 VT = ST->getMemoryVT();
26829 Ptr = ST->getBasePtr();
26830 } else
26831 return false;
26832
26833 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
26834 return false;
26835 // Post-indexing updates the base, so it's not a valid transform
26836 // if that's not the same as the load's pointer.
26837 if (Ptr != Base)
26838 return false;
26839 AM = ISD::POST_INC;
26840 return true;
26841}
26842
26845 SelectionDAG &DAG) {
26846 SDLoc DL(N);
26847 SDValue Op = N->getOperand(0);
26848 EVT VT = N->getValueType(0);
26849 [[maybe_unused]] EVT SrcVT = Op.getValueType();
26850 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
26851 "Must be bool vector.");
26852
26853 // Special handling for Clang's __builtin_convertvector. For vectors with <8
26854 // elements, it adds a vector concatenation with undef(s). If we encounter
26855 // this here, we can skip the concat.
26856 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
26857 bool AllUndef = true;
26858 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
26859 AllUndef &= Op.getOperand(I).isUndef();
26860
26861 if (AllUndef)
26862 Op = Op.getOperand(0);
26863 }
26864
26865 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
26866 if (VectorBits)
26867 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
26868}
26869
26872 SelectionDAG &DAG, EVT ExtendVT,
26873 EVT CastVT) {
26874 SDLoc DL(N);
26875 SDValue Op = N->getOperand(0);
26876 EVT VT = N->getValueType(0);
26877
26878 // Use SCALAR_TO_VECTOR for lane zero
26879 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
26880 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
26881 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
26882 Results.push_back(
26883 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
26884}
26885
26886void AArch64TargetLowering::ReplaceBITCASTResults(
26888 SDLoc DL(N);
26889 SDValue Op = N->getOperand(0);
26890 EVT VT = N->getValueType(0);
26891 EVT SrcVT = Op.getValueType();
26892
26893 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
26894 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
26895 return;
26896 }
26897
26898 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
26899 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
26900 return;
26901 }
26902
26903 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
26904 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
26905 return;
26906 }
26907
26908 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
26909 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
26910 "Expected fp->int bitcast!");
26911
26912 // Bitcasting between unpacked vector types of different element counts is
26913 // not a NOP because the live elements are laid out differently.
26914 // 01234567
26915 // e.g. nxv2i32 = XX??XX??
26916 // nxv4f16 = X?X?X?X?
26917 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
26918 return;
26919
26920 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
26921 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
26922 return;
26923 }
26924
26925 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
26926 !VT.isVector())
26927 return replaceBoolVectorBitcast(N, Results, DAG);
26928
26929 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
26930 return;
26931
26932 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
26933 DAG.getUNDEF(MVT::i32), Op);
26934 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
26935 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
26936}
26937
26939 SelectionDAG &DAG,
26940 const AArch64Subtarget *Subtarget) {
26941 EVT VT = N->getValueType(0);
26942 if (!VT.is256BitVector() ||
26944 !N->getFlags().hasAllowReassociation()) ||
26945 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
26946 VT.getScalarType() == MVT::bf16)
26947 return;
26948
26949 SDValue X = N->getOperand(0);
26950 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
26951 if (!Shuf) {
26952 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
26953 X = N->getOperand(1);
26954 if (!Shuf)
26955 return;
26956 }
26957
26958 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
26959 return;
26960
26961 // Check the mask is 1,0,3,2,5,4,...
26962 ArrayRef<int> Mask = Shuf->getMask();
26963 for (int I = 0, E = Mask.size(); I < E; I++)
26964 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
26965 return;
26966
26967 SDLoc DL(N);
26968 auto LoHi = DAG.SplitVector(X, DL);
26969 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
26970 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
26971 LoHi.first, LoHi.second);
26972
26973 // Shuffle the elements back into order.
26974 SmallVector<int> NMask;
26975 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
26976 NMask.push_back(I);
26977 NMask.push_back(I);
26978 }
26979 Results.push_back(
26980 DAG.getVectorShuffle(VT, DL,
26981 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
26982 DAG.getUNDEF(LoHi.first.getValueType())),
26983 DAG.getUNDEF(VT), NMask));
26984}
26985
26988 SelectionDAG &DAG, unsigned InterOp,
26989 unsigned AcrossOp) {
26990 EVT LoVT, HiVT;
26991 SDValue Lo, Hi;
26992 SDLoc dl(N);
26993 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
26994 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
26995 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
26996 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
26997 Results.push_back(SplitVal);
26998}
26999
27000void AArch64TargetLowering::ReplaceExtractSubVectorResults(
27002 SDValue In = N->getOperand(0);
27003 EVT InVT = In.getValueType();
27004
27005 // Common code will handle these just fine.
27006 if (!InVT.isScalableVector() || !InVT.isInteger())
27007 return;
27008
27009 SDLoc DL(N);
27010 EVT VT = N->getValueType(0);
27011
27012 // The following checks bail if this is not a halving operation.
27013
27014 ElementCount ResEC = VT.getVectorElementCount();
27015
27016 if (InVT.getVectorElementCount() != (ResEC * 2))
27017 return;
27018
27019 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
27020 if (!CIndex)
27021 return;
27022
27023 unsigned Index = CIndex->getZExtValue();
27024 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
27025 return;
27026
27027 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
27028 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
27029
27030 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
27031 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
27032}
27033
27034// Create an even/odd pair of X registers holding integer value V.
27036 SDLoc dl(V.getNode());
27037 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
27038 if (DAG.getDataLayout().isBigEndian())
27039 std::swap (VLo, VHi);
27040 SDValue RegClass =
27041 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
27042 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
27043 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
27044 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
27045 return SDValue(
27046 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
27047}
27048
27051 SelectionDAG &DAG,
27052 const AArch64Subtarget *Subtarget) {
27053 assert(N->getValueType(0) == MVT::i128 &&
27054 "AtomicCmpSwap on types less than 128 should be legal");
27055
27056 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
27057 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
27058 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
27059 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
27060 SDValue Ops[] = {
27061 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
27062 createGPRPairNode(DAG, N->getOperand(3)), // Store value
27063 N->getOperand(1), // Ptr
27064 N->getOperand(0), // Chain in
27065 };
27066
27067 unsigned Opcode;
27068 switch (MemOp->getMergedOrdering()) {
27070 Opcode = AArch64::CASPX;
27071 break;
27073 Opcode = AArch64::CASPAX;
27074 break;
27076 Opcode = AArch64::CASPLX;
27077 break;
27080 Opcode = AArch64::CASPALX;
27081 break;
27082 default:
27083 llvm_unreachable("Unexpected ordering!");
27084 }
27085
27086 MachineSDNode *CmpSwap = DAG.getMachineNode(
27087 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
27088 DAG.setNodeMemRefs(CmpSwap, {MemOp});
27089
27090 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
27091 if (DAG.getDataLayout().isBigEndian())
27092 std::swap(SubReg1, SubReg2);
27093 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
27094 SDValue(CmpSwap, 0));
27095 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
27096 SDValue(CmpSwap, 0));
27097 Results.push_back(
27098 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
27099 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
27100 return;
27101 }
27102
27103 unsigned Opcode;
27104 switch (MemOp->getMergedOrdering()) {
27106 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
27107 break;
27109 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
27110 break;
27112 Opcode = AArch64::CMP_SWAP_128_RELEASE;
27113 break;
27116 Opcode = AArch64::CMP_SWAP_128;
27117 break;
27118 default:
27119 llvm_unreachable("Unexpected ordering!");
27120 }
27121
27122 SDLoc DL(N);
27123 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
27124 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
27125 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
27126 New.first, New.second, N->getOperand(0)};
27127 SDNode *CmpSwap = DAG.getMachineNode(
27128 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
27129 Ops);
27130 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
27131
27132 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
27133 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
27134 Results.push_back(SDValue(CmpSwap, 3));
27135}
27136
27137static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
27138 AtomicOrdering Ordering) {
27139 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
27140 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
27141 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
27142 // ATOMIC_LOAD_CLR at any point.
27143 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
27144 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
27145 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
27146 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
27147
27148 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
27149 // The operand will need to be XORed in a separate step.
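// (LDCLRP atomically computes Mem &= ~data, so ATOMIC_LOAD_AND passes in the
// inverted value; see the XOR in ReplaceATOMIC_LOAD_128Results below.)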
27150 switch (Ordering) {
27152 return AArch64::LDCLRP;
27153 break;
27155 return AArch64::LDCLRPA;
27156 break;
27158 return AArch64::LDCLRPL;
27159 break;
27162 return AArch64::LDCLRPAL;
27163 break;
27164 default:
27165 llvm_unreachable("Unexpected ordering!");
27166 }
27167 }
27168
27169 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
27170 switch (Ordering) {
27172 return AArch64::LDSETP;
27173 break;
27175 return AArch64::LDSETPA;
27176 break;
27178 return AArch64::LDSETPL;
27179 break;
27182 return AArch64::LDSETPAL;
27183 break;
27184 default:
27185 llvm_unreachable("Unexpected ordering!");
27186 }
27187 }
27188
27189 if (ISDOpcode == ISD::ATOMIC_SWAP) {
27190 switch (Ordering) {
27192 return AArch64::SWPP;
27193 break;
27195 return AArch64::SWPPA;
27196 break;
27198 return AArch64::SWPPL;
27199 break;
27202 return AArch64::SWPPAL;
27203 break;
27204 default:
27205 llvm_unreachable("Unexpected ordering!");
27206 }
27207 }
27208
27209 llvm_unreachable("Unexpected ISDOpcode!");
27210}
27211
27214 SelectionDAG &DAG,
27215 const AArch64Subtarget *Subtarget) {
27216 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
27217 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
27218 // rather than the CASP instructions, because CASP has register classes for
27219 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
27220 // to present them as single operands. LSE128 instructions use the GPR64
27221 // register class (because the pair does not have to be sequential), like
27222 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
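// The machine opcodes used are SWPP, LDSETP and LDCLRP, chosen by
// getAtomicLoad128Opcode above.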
27223
27224 assert(N->getValueType(0) == MVT::i128 &&
27225 "AtomicLoadXXX on types less than 128 should be legal");
27226
27227 if (!Subtarget->hasLSE128())
27228 return;
27229
27230 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
27231 const SDValue &Chain = N->getOperand(0);
27232 const SDValue &Ptr = N->getOperand(1);
27233 const SDValue &Val128 = N->getOperand(2);
27234 std::pair<SDValue, SDValue> Val2x64 =
27235 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
27236
27237 const unsigned ISDOpcode = N->getOpcode();
27238 const unsigned MachineOpcode =
27239 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
27240
27241 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
27242 SDLoc dl(Val128);
27243 Val2x64.first =
27244 DAG.getNode(ISD::XOR, dl, MVT::i64,
27245 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
27246 Val2x64.second =
27247 DAG.getNode(ISD::XOR, dl, MVT::i64,
27248 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
27249 }
27250
27251 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
27252 if (DAG.getDataLayout().isBigEndian())
27253 std::swap(Ops[0], Ops[1]);
27254
27255 MachineSDNode *AtomicInst =
27256 DAG.getMachineNode(MachineOpcode, SDLoc(N),
27257 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
27258
27259 DAG.setNodeMemRefs(AtomicInst, {MemOp});
27260
27261 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
27262 if (DAG.getDataLayout().isBigEndian())
27263 std::swap(Lo, Hi);
27264
27265 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
27266 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
27267}
27268
27269void AArch64TargetLowering::ReplaceNodeResults(
27271 switch (N->getOpcode()) {
27272 default:
27273 llvm_unreachable("Don't know how to custom expand this");
27274 case ISD::BITCAST:
27275 ReplaceBITCASTResults(N, Results, DAG);
27276 return;
27277 case ISD::VECREDUCE_ADD:
27282 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
27283 return;
27285 if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
27286 Results.push_back(Res);
27287 return;
27288 case ISD::ADD:
27289 case ISD::FADD:
27290 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
27291 return;
27292
27293 case ISD::CTPOP:
27294 case ISD::PARITY:
27295 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
27296 Results.push_back(Result);
27297 return;
27298 case AArch64ISD::SADDV:
27300 return;
27301 case AArch64ISD::UADDV:
27303 return;
27304 case AArch64ISD::SMINV:
27306 return;
27307 case AArch64ISD::UMINV:
27309 return;
27310 case AArch64ISD::SMAXV:
27312 return;
27313 case AArch64ISD::UMAXV:
27315 return;
27316 case ISD::MULHS:
27318 Results.push_back(
27319 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
27320 return;
27321 case ISD::MULHU:
27323 Results.push_back(
27324 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
27325 return;
27326 case ISD::FP_TO_UINT:
27327 case ISD::FP_TO_SINT:
27330 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
27331 // Let normal code take care of it by not adding anything to Results.
27332 return;
27334 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
27335 return;
27337 assert(N->getValueType(0) != MVT::i128 &&
27338 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
27339 break;
27342 case ISD::ATOMIC_SWAP: {
27343 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
27344 "Expected 128-bit atomicrmw.");
27345 // These need custom type legalisation so we go directly to instruction.
27346 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
27347 return;
27348 }
27349 case ISD::ATOMIC_LOAD:
27350 case ISD::LOAD: {
27351 MemSDNode *LoadNode = cast<MemSDNode>(N);
27352 EVT MemVT = LoadNode->getMemoryVT();
27353 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
27354 // targets.
27355 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
27356 MemVT.getSizeInBits() == 256u &&
27357 (MemVT.getScalarSizeInBits() == 8u ||
27358 MemVT.getScalarSizeInBits() == 16u ||
27359 MemVT.getScalarSizeInBits() == 32u ||
27360 MemVT.getScalarSizeInBits() == 64u)) {
27361
27364 DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
27365 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
27366 MVT::Other}),
27367 {LoadNode->getChain(), LoadNode->getBasePtr()},
27368 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
27369
27370 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
27371 Result.getValue(0), Result.getValue(1));
27372 Results.append({Pair, Result.getValue(2) /* Chain */});
27373 return;
27374 }
27375
27376 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
27377 LoadNode->getMemoryVT() != MVT::i128) {
27378 // Non-volatile, non-atomic loads are optimized later in AArch64's load/store
27379 // optimizer.
27380 return;
27381 }
27382
27383 if (SDValue(N, 0).getValueType() == MVT::i128) {
27384 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
27385 bool isLoadAcquire =
27387 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
27388
27389 if (isLoadAcquire)
27390 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
27391
27393 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
27394 {LoadNode->getChain(), LoadNode->getBasePtr()},
27395 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
27396
27397 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
27398
27399 SDValue Pair =
27400 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
27401 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
27402 Results.append({Pair, Result.getValue(2) /* Chain */});
27403 }
27404 return;
27405 }
27407 ReplaceExtractSubVectorResults(N, Results, DAG);
27408 return;
27411 // Custom lowering has been requested for INSERT_SUBVECTOR and
27412 // CONCAT_VECTORS -- but delegate to common code for result type
27413 // legalisation
27414 return;
27416 EVT VT = N->getValueType(0);
27417
27418 Intrinsic::ID IntID =
27419 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
27420 switch (IntID) {
27421 default:
27422 return;
27423 case Intrinsic::aarch64_sve_clasta_n: {
27424 assert((VT == MVT::i8 || VT == MVT::i16) &&
27425 "custom lowering for unexpected type");
27426 SDLoc DL(N);
27427 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
27428 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
27429 N->getOperand(1), Op2, N->getOperand(3));
27430 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
27431 return;
27432 }
27433 case Intrinsic::aarch64_sve_clastb_n: {
27434 assert((VT == MVT::i8 || VT == MVT::i16) &&
27435 "custom lowering for unexpected type");
27436 SDLoc DL(N);
27437 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
27438 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
27439 N->getOperand(1), Op2, N->getOperand(3));
27440 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
27441 return;
27442 }
27443 case Intrinsic::aarch64_sve_lasta: {
27444 assert((VT == MVT::i8 || VT == MVT::i16) &&
27445 "custom lowering for unexpected type");
27446 SDLoc DL(N);
27447 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
27448 N->getOperand(1), N->getOperand(2));
27449 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
27450 return;
27451 }
27452 case Intrinsic::aarch64_sve_lastb: {
27453 assert((VT == MVT::i8 || VT == MVT::i16) &&
27454 "custom lowering for unexpected type");
27455 SDLoc DL(N);
27456 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
27457 N->getOperand(1), N->getOperand(2));
27458 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
27459 return;
27460 }
27461 case Intrinsic::aarch64_sme_in_streaming_mode: {
27462 SDLoc DL(N);
27463 SDValue Chain = DAG.getEntryNode();
27464 SDValue RuntimePStateSM =
27465 getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
27466 Results.push_back(
27467 DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM));
27468 return;
27469 }
27470 case Intrinsic::experimental_vector_match:
27471 case Intrinsic::get_active_lane_mask: {
27472 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
27473 return;
27474
27475 // NOTE: Only trivial type promotion is supported.
27476 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
27477 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
27478 return;
27479
27480 SDLoc DL(N);
27481 auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
27482 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
27483 return;
27484 }
27485 }
27486 }
27487 case ISD::READ_REGISTER: {
27488 SDLoc DL(N);
27489 assert(N->getValueType(0) == MVT::i128 &&
27490 "READ_REGISTER custom lowering is only for 128-bit sysregs");
27491 SDValue Chain = N->getOperand(0);
27492 SDValue SysRegName = N->getOperand(1);
27493
27494 SDValue Result = DAG.getNode(
27495 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
27496 Chain, SysRegName);
27497
27498 // Sysregs are not endian. Result.getValue(0) always contains the lower half
27499 // of the 128-bit System Register value.
27500 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
27501 Result.getValue(0), Result.getValue(1));
27502 Results.push_back(Pair);
27503 Results.push_back(Result.getValue(2)); // Chain
27504 return;
27505 }
27506 }
27507}
27508
27510 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
27512 return true;
27513}
27514
27515unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
27516 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
27517 // reciprocal if there are three or more FDIVs.
27518 return 3;
27519}
27520
27523 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
27524 // v4i16, v2i32 instead of promoting them.
27525 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
27526 VT == MVT::v1f32)
27527 return TypeWidenVector;
27528
27530}
27531
27532// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
27533// provided the address is 16-byte aligned.
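// For example, a 16-byte aligned atomic i128 load can then be selected as a
// single ldp x0, x1, [x2] (registers illustrative) rather than a CAS loop or
// a libcall.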
27535 if (!Subtarget->hasLSE2())
27536 return false;
27537
27538 if (auto LI = dyn_cast<LoadInst>(I))
27539 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
27540 LI->getAlign() >= Align(16);
27541
27542 if (auto SI = dyn_cast<StoreInst>(I))
27543 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27544 SI->getAlign() >= Align(16);
27545
27546 return false;
27547}
27548
27550 if (!Subtarget->hasLSE128())
27551 return false;
27552
27553 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
27554 // will clobber the two registers.
27555 if (const auto *SI = dyn_cast<StoreInst>(I))
27556 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27557 SI->getAlign() >= Align(16) &&
27558 (SI->getOrdering() == AtomicOrdering::Release ||
27559 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
27560
27561 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
27562 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27563 RMW->getAlign() >= Align(16) &&
27564 (RMW->getOperation() == AtomicRMWInst::Xchg ||
27565 RMW->getOperation() == AtomicRMWInst::And ||
27566 RMW->getOperation() == AtomicRMWInst::Or);
27567
27568 return false;
27569}
27570
27572 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
27573 return false;
27574
27575 if (auto LI = dyn_cast<LoadInst>(I))
27576 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
27577 LI->getAlign() >= Align(16) &&
27578 LI->getOrdering() == AtomicOrdering::Acquire;
27579
27580 if (auto SI = dyn_cast<StoreInst>(I))
27581 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27582 SI->getAlign() >= Align(16) &&
27583 SI->getOrdering() == AtomicOrdering::Release;
27584
27585 return false;
27586}
27587
27589 const Instruction *I) const {
27591 return false;
27593 return false;
27595 return true;
27596 return false;
27597}
27598
27600 const Instruction *I) const {
27601 // Store-Release instructions only provide seq_cst guarantees when paired with
27602 // Load-Acquire instructions. MSVC CRT does not use these instructions to
27603 // implement seq_cst loads and stores, so we need additional explicit fences
27604 // after memory writes.
27605 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
27606 return false;
27607
27608 switch (I->getOpcode()) {
27609 default:
27610 return false;
27611 case Instruction::AtomicCmpXchg:
27612 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
27614 case Instruction::AtomicRMW:
27615 return cast<AtomicRMWInst>(I)->getOrdering() ==
27617 case Instruction::Store:
27618 return cast<StoreInst>(I)->getOrdering() ==
27620 }
27621}
27622
27623// Loads and stores less than 128 bits are already atomic; ones above that
27624// are doomed anyway, so defer to the default libcall and blame the OS when
27625// things go wrong.
27626TargetLowering::AtomicExpansionKind
27627AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
27628 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
27629 if (Size != 128)
27630 return AtomicExpansionKind::None;
27631 if (isOpSuitableForRCPC3(SI))
27632 return AtomicExpansionKind::None;
27633 if (isOpSuitableForLSE128(SI))
27634 return AtomicExpansionKind::Expand;
27635 if (isOpSuitableForLDPSTP(SI))
27636 return AtomicExpansionKind::None;
27637 return AtomicExpansionKind::Expand;
27638}
27639
27640// Loads and stores less than 128-bits are already atomic; ones above that
27641// are doomed anyway, so defer to the default libcall and blame the OS when
27642// things go wrong.
27643TargetLowering::AtomicExpansionKind
27644AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
27645 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
27646
27647 if (Size != 128)
27648 return AtomicExpansionKind::None;
27649 if (isOpSuitableForRCPC3(LI))
27650 return AtomicExpansionKind::None;
27651 // No LSE128 loads
27652 if (isOpSuitableForLDPSTP(LI))
27653 return AtomicExpansionKind::None;
27654
27655 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
27656 // implement atomicrmw without spilling. If the target address is also on the
27657 // stack and close enough to the spill slot, this can lead to a situation
27658 // where the monitor always gets cleared and the atomic operation can never
27659 // succeed. So at -O0 lower this operation to a CAS loop.
27660 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
27661 return AtomicExpansionKind::CmpXChg;
27662
27663 // Using CAS for an atomic load has a better chance of succeeding under high
27664 // contention situations. So use it if available.
27665 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
27666 : AtomicExpansionKind::LLSC;
27667}
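// Rough illustration of the policy above for a hypothetical
//   %v = load atomic i128, ptr %p acquire, align 16
// RCPC3 (LDIAPP) or LSE2 (LDP) keep the load as-is; otherwise AtomicExpandPass
// rewrites it as a CAS loop when LSE is available, or an LDXP/STXP retry loop
// when it is not.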
27668
27669// Return true if the atomic operation expansion will lower to use a library
27670// call, and is thus ineligible to use an LLSC expansion.
27671static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget,
27672 const AtomicRMWInst *RMW) {
27673 if (!RMW->isFloatingPointOperation())
27674 return false;
27675 switch (RMW->getType()->getScalarType()->getTypeID()) {
27676 case Type::FloatTyID:
27677 case Type::DoubleTyID:
27678 case Type::HalfTyID:
27679 case Type::BFloatTyID:
27680 // Will use soft float
27681 return !Subtarget.hasFPARMv8();
27682 default:
27683 // fp128 will emit library calls.
27684 return true;
27685 }
27686
27687 llvm_unreachable("covered type switch");
27688}
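// Hypothetical example of the libcall case above:
//   %old = atomicrmw fadd ptr %p, fp128 %x seq_cst
// fp128 arithmetic is lowered to a runtime call, and a call between LDXR and
// STXR could clear the exclusive monitor, so an LL/SC expansion is not usable.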
27689
27690// The "default" for integer RMW operations is to expand to an LL/SC loop.
27691// However, with the LSE instructions (or outline-atomics mode, which provides
27692// library routines in place of the LSE-instructions), we can directly emit many
27693// operations instead.
27694TargetLowering::AtomicExpansionKind
27695AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
27696 Type *Ty = AI->getType();
27697 unsigned Size = Ty->getPrimitiveSizeInBits();
27698 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
27699
27700 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
27704 if (CanUseLSE128)
27705 return AtomicExpansionKind::None;
27706
27707 // Nand is not supported in LSE.
27708 // Leave 128 bits to LLSC or CmpXChg.
27709 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 &&
27710 !AI->isFloatingPointOperation()) {
27711 if (Subtarget->hasLSE())
27712 return AtomicExpansionKind::None;
27713 if (Subtarget->outlineAtomics()) {
27714 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
27715 // Don't outline them unless
27716 // (1) high level <atomic> support approved:
27717 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
27718 // (2) low level libgcc and compiler-rt support implemented by:
27719 // min/max outline atomics helpers
27720 if (AI->getOperation() != AtomicRMWInst::Min &&
27721 AI->getOperation() != AtomicRMWInst::Max &&
27722 AI->getOperation() != AtomicRMWInst::UMin &&
27723 AI->getOperation() != AtomicRMWInst::UMax) {
27724 return AtomicExpansionKind::None;
27725 }
27726 }
27727 }
27728
27729 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
27730 // implement atomicrmw without spilling. If the target address is also on the
27731 // stack and close enough to the spill slot, this can lead to a situation
27732 // where the monitor always gets cleared and the atomic operation can never
27733 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
27734 // we have a single CAS instruction that can replace the loop.
27735 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
27736 Subtarget->hasLSE() || rmwOpMayLowerToLibcall(*Subtarget, AI))
27737 return AtomicExpansionKind::CmpXChg;
27738
27739 return AtomicExpansionKind::LLSC;
27740}
27741
27742TargetLowering::AtomicExpansionKind
27743AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
27744 AtomicCmpXchgInst *AI) const {
27745 // If subtarget has LSE, leave cmpxchg intact for codegen.
27746 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
27747 return AtomicExpansionKind::None;
27748 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
27749 // implement cmpxchg without spilling. If the address being exchanged is also
27750 // on the stack and close enough to the spill slot, this can lead to a
27751 // situation where the monitor always gets cleared and the atomic operation
27752 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
27753 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
27754 return AtomicExpansionKind::None;
27755
27756 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
27757 // it.
27758 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
27759 if (Size > 64)
27760 return AtomicExpansionKind::None;
27761
27762 return AtomicExpansionKind::LLSC;
27763}
27764
27766 Type *ValueTy, Value *Addr,
27767 AtomicOrdering Ord) const {
27768 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
27769 bool IsAcquire = isAcquireOrStronger(Ord);
27770
27771 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp
27772 // intrinsic must return {i64, i64} and we have to recombine them into a
27773 // single i128 here.
27774 if (ValueTy->getPrimitiveSizeInBits() == 128) {
27776 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
27777
27778 Value *LoHi =
27779 Builder.CreateIntrinsic(Int, {}, Addr, /*FMFSource=*/nullptr, "lohi");
27780
27781 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
27782 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
27783
27784 auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
27785 Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
27786 Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
27787
27788 Value *Or = Builder.CreateOr(
27789 Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
27790 return Builder.CreateBitCast(Or, ValueTy);
27791 }
27792
27793 Type *Tys[] = { Addr->getType() };
27795 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
27796
27797 const DataLayout &DL = M->getDataLayout();
27798 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
27799 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
27800 CI->addParamAttr(0, Attribute::get(Builder.getContext(),
27801 Attribute::ElementType, IntEltTy));
27802 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
27803
27804 return Builder.CreateBitCast(Trunc, ValueTy);
27805}
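// Sketch of the IR produced by the 128-bit path above for an acquire load
// (illustrative, the final reassembly abbreviated):
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
//   %lo   = extractvalue { i64, i64 } %lohi, 0
//   %hi   = extractvalue { i64, i64 } %lohi, 1
//   ...zext/shl/or to recombine the two halves into an i128...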
27806
27808 IRBuilderBase &Builder) const {
27809 Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {}, {});
27810}
27811
27813 Value *Val, Value *Addr,
27814 AtomicOrdering Ord) const {
27815 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
27816 bool IsRelease = isReleaseOrStronger(Ord);
27817
27818 // Since the intrinsics must have legal type, the i128 intrinsics take two
27819 // parameters: "i64, i64". We must marshal Val into the appropriate form
27820 // before the call.
27821 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
27823 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
27825 Type *Int64Ty = Type::getInt64Ty(M->getContext());
27826 Type *Int128Ty = Type::getInt128Ty(M->getContext());
27827
27828 Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
27829
27830 Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
27831 Value *Hi =
27832 Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
27833 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
27834 }
27835
27837 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
27838 Type *Tys[] = { Addr->getType() };
27840
27841 const DataLayout &DL = M->getDataLayout();
27842 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
27843 Val = Builder.CreateBitCast(Val, IntValTy);
27844
27845 CallInst *CI = Builder.CreateCall(
27846 Stxr, {Builder.CreateZExtOrBitCast(
27847 Val, Stxr->getFunctionType()->getParamType(0)),
27848 Addr});
27849 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
27850 Attribute::ElementType, Val->getType()));
27851 return CI;
27852}
27853
27855 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
27856 const DataLayout &DL) const {
27857 if (!Ty->isArrayTy()) {
27858 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
27859 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
27860 }
27861
27862 // All non-aggregate members of the type must have the same type
27863 SmallVector<EVT> ValueVTs;
27864 ComputeValueVTs(*this, DL, Ty, ValueVTs);
27865 return all_equal(ValueVTs);
27866}
27867
27868bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
27869 EVT) const {
27870 return false;
27871}
27872
27873static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
27874 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
27875 Function *ThreadPointerFunc =
27876 Intrinsic::getOrInsertDeclaration(M, Intrinsic::thread_pointer);
27877 return IRB.CreatePointerCast(
27878 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
27879 Offset),
27880 IRB.getPtrTy(0));
27881}
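// Sketch of the effect (hypothetical values): UseTlsOffset(IRB, 0x28) emits
// roughly
//   %tp   = call ptr @llvm.thread.pointer()
//   %slot = getelementptr i8, ptr %tp, i32 40
// i.e. a fixed offset from the thread pointer, which typically selects to an
//   mrs xN, TPIDR_EL0 ; ldr xM, [xN, #40]
// style sequence.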
27882
27884 // Android provides a fixed TLS slot for the stack cookie. See the definition
27885 // of TLS_SLOT_STACK_GUARD in
27886 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
27887 if (Subtarget->isTargetAndroid())
27888 return UseTlsOffset(IRB, 0x28);
27889
27890 // Fuchsia is similar.
27891 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
27892 if (Subtarget->isTargetFuchsia())
27893 return UseTlsOffset(IRB, -0x10);
27894
27896}
27897
27899 // MSVC CRT provides functionalities for stack protection.
27900 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
27901 // MSVC CRT has a global variable holding security cookie.
27902 M.getOrInsertGlobal("__security_cookie",
27903 PointerType::getUnqual(M.getContext()));
27904
27905 // MSVC CRT has a function to validate security cookie.
27906 FunctionCallee SecurityCheckCookie =
27907 M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
27908 Type::getVoidTy(M.getContext()),
27909 PointerType::getUnqual(M.getContext()));
27910 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
27911 F->setCallingConv(CallingConv::Win64);
27912 F->addParamAttr(0, Attribute::AttrKind::InReg);
27913 }
27914 return;
27915 }
27917}
27918
27920 // MSVC CRT has a global variable holding security cookie.
27921 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
27922 return M.getGlobalVariable("__security_cookie");
27924}
27925
27927 // MSVC CRT has a function to validate security cookie.
27928 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
27929 return M.getFunction(Subtarget->getSecurityCheckCookieName());
27931}
27932
27933Value *
27935 // Android provides a fixed TLS slot for the SafeStack pointer. See the
27936 // definition of TLS_SLOT_SAFESTACK in
27937 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
27938 if (Subtarget->isTargetAndroid())
27939 return UseTlsOffset(IRB, 0x48);
27940
27941 // Fuchsia is similar.
27942 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
27943 if (Subtarget->isTargetFuchsia())
27944 return UseTlsOffset(IRB, -0x8);
27945
27947}
27948
27949/// If a physical register, this returns the register that receives the
27950/// exception address on entry to an EH pad.
27952 const Constant *PersonalityFn) const {
27953 // FIXME: This is a guess. Has this been defined yet?
27954 return AArch64::X0;
27955}
27956
27957/// If a physical register, this returns the register that receives the
27958/// exception typeid on entry to a landing pad.
27960 const Constant *PersonalityFn) const {
27961 // FIXME: This is a guess. Has this been defined yet?
27962 return AArch64::X1;
27963}
27964
27966 const Instruction &AndI) const {
27967 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
27968 // this is likely to fold the and/cmp/br into a single tbz instruction. It
27969 // may be beneficial to sink in other cases, but we would have to check that
27970 // the cmp would not get folded into the br to form a cbz for these to be
27971 // beneficial.
27972 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
27973 if (!Mask)
27974 return false;
27975 return Mask->getValue().isPowerOf2();
27976}
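// Hypothetical example of the single-bit case above: for
//   if ((x & 0x10) == 0) { ... }
// keeping the AND in the compare's block lets instruction selection form one
// TBZ/TBNZ on bit 4, rather than a separate AND plus CBZ/CBNZ.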
27977
27981 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
27982 SelectionDAG &DAG) const {
27983 // Does baseline recommend not to perform the fold by default?
27985 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
27986 return false;
27987 // Else, if this is a vector shift, prefer 'shl'.
27988 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
27989}
27990
27993 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
27995 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
27998 ExpansionFactor);
27999}
28000
28002 // Update IsSplitCSR in AArch64FunctionInfo.
28003 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
28004 AFI->setIsSplitCSR(true);
28005}
28006
28008 MachineBasicBlock *Entry,
28009 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
28010 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
28011 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
28012 if (!IStart)
28013 return;
28014
28015 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
28016 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
28017 MachineBasicBlock::iterator MBBI = Entry->begin();
28018 for (const MCPhysReg *I = IStart; *I; ++I) {
28019 const TargetRegisterClass *RC = nullptr;
28020 if (AArch64::GPR64RegClass.contains(*I))
28021 RC = &AArch64::GPR64RegClass;
28022 else if (AArch64::FPR64RegClass.contains(*I))
28023 RC = &AArch64::FPR64RegClass;
28024 else
28025 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
28026
28027 Register NewVR = MRI->createVirtualRegister(RC);
28028 // Create copy from CSR to a virtual register.
28029 // FIXME: this currently does not emit CFI pseudo-instructions, it works
28030 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
28031 // nounwind. If we want to generalize this later, we may need to emit
28032 // CFI pseudo-instructions.
28033 assert(Entry->getParent()->getFunction().hasFnAttribute(
28034 Attribute::NoUnwind) &&
28035 "Function should be nounwind in insertCopiesSplitCSR!");
28036 Entry->addLiveIn(*I);
28037 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
28038 .addReg(*I);
28039
28040 // Insert the copy-back instructions right before the terminator.
28041 for (auto *Exit : Exits)
28042 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
28043 TII->get(TargetOpcode::COPY), *I)
28044 .addReg(NewVR);
28045 }
28046}
28047
28049 // Integer division on AArch64 is expensive. However, when aggressively
28050 // optimizing for code size, we prefer to use a div instruction, as it is
28051 // usually smaller than the alternative sequence.
28052 // The exception to this is vector division. Since AArch64 doesn't have vector
28053 // integer division, leaving the division as-is is a loss even in terms of
28054 // size, because it will have to be scalarized, while the alternative code
28055 // sequence can be performed in vector form.
28056 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
28057 return OptSize && !VT.isVector();
28058}
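// Illustration (hypothetical): under minsize, a scalar 'x / 7' stays as a
// single SDIV rather than the usual multiply-by-magic-constant expansion,
// while a v4i32 'x / 7' is still rewritten into the multiply-based sequence,
// since a vector sdiv would otherwise have to be scalarized.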
28059
28061 const MachineFunction &MF) const {
28062 // Avoid merging stores into fixed-length vectors when Neon is unavailable.
28063 // In future, we could allow this when SVE is available, but currently,
28064 // the SVE lowerings for BUILD_VECTOR are limited to a few specific cases (and
28065 // the general lowering may introduce stack spills/reloads).
28066 if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
28067 return false;
28068
28069 // Do not merge to float value size (128 bits) if no implicit float attribute
28070 // is set.
28071 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
28072 return !NoFloat || MemVT.getSizeInBits() <= 64;
28073}
28074
28076 // We want inc-of-add for scalars and sub-of-not for vectors.
28077 return VT.isScalarInteger();
28078}
28079
28081 EVT VT) const {
28082 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
28083 // legalize.
28084 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
28085 return false;
28086 if (FPVT == MVT::v8bf16)
28087 return false;
28088 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
28089}
28090
28092 // Expand scalar and SVE operations using selects. Neon vectors prefer sub to
28093 // avoid vselect becoming bsl / unrolling.
28094 return !VT.isFixedLengthVector();
28095}
28096
28100 const TargetInstrInfo *TII) const {
28101 assert(MBBI->isCall() && MBBI->getCFIType() &&
28102 "Invalid call instruction for a KCFI check");
28103
28104 switch (MBBI->getOpcode()) {
28105 case AArch64::BLR:
28106 case AArch64::BLRNoIP:
28107 case AArch64::TCRETURNri:
28108 case AArch64::TCRETURNrix16x17:
28109 case AArch64::TCRETURNrix17:
28110 case AArch64::TCRETURNrinotx16:
28111 break;
28112 default:
28113 llvm_unreachable("Unexpected CFI call opcode");
28114 }
28115
28116 MachineOperand &Target = MBBI->getOperand(0);
28117 assert(Target.isReg() && "Invalid target operand for an indirect call");
28118 Target.setIsRenamable(false);
28119
28120 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
28121 .addReg(Target.getReg())
28122 .addImm(MBBI->getCFIType())
28123 .getInstr();
28124}
28125
28127 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
28128}
28129
28130unsigned
28132 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
28133 return getPointerTy(DL).getSizeInBits();
28134
28135 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
28136}
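// For reference, the "3 pointers + 2 ints" figure corresponds to the AAPCS64
// va_list layout, roughly:
//   struct va_list { void *__stack; void *__gr_top; void *__vr_top;
//                    int __gr_offs; int __vr_offs; };
// whereas Darwin and Windows use a single pointer-sized va_list.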
28137
28138void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
28139 MachineFrameInfo &MFI = MF.getFrameInfo();
28140 // If we have any vulnerable SVE stack objects then the stack protector
28141 // needs to be placed at the top of the SVE stack area, as the SVE locals
28142 // are placed above the other locals, so we allocate it as if it were a
28143 // scalable vector.
28144 // FIXME: It may be worthwhile having a specific interface for this rather
28145 // than doing it here in finalizeLowering.
28146 if (MFI.hasStackProtectorIndex()) {
28147 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
28153 break;
28154 }
28155 }
28156 }
28159}
28160
28161// Unlike X86, we let frame lowering assign offsets to all catch objects.
28163 return false;
28164}
28165
28166bool AArch64TargetLowering::shouldLocalize(
28167 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
28168 auto &MF = *MI.getMF();
28169 auto &MRI = MF.getRegInfo();
28170 auto maxUses = [](unsigned RematCost) {
28171 // A cost of 1 means remats are basically free.
28172 if (RematCost == 1)
28173 return std::numeric_limits<unsigned>::max();
28174 if (RematCost == 2)
28175 return 2U;
28176
28177 // Remat is too expensive, only sink if there's one user.
28178 if (RematCost > 2)
28179 return 1U;
28180 llvm_unreachable("Unexpected remat cost");
28181 };
28182
28183 unsigned Opc = MI.getOpcode();
28184 switch (Opc) {
28185 case TargetOpcode::G_GLOBAL_VALUE: {
28186 // On Darwin, TLS global vars get selected into function calls, which
28187 // we don't want localized, as they can get moved into the middle of
28188 // another call sequence.
28189 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
28190 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
28191 return false;
28192 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
28193 }
28194 case TargetOpcode::G_FCONSTANT:
28195 case TargetOpcode::G_CONSTANT: {
28196 const ConstantInt *CI;
28197 unsigned AdditionalCost = 0;
28198
28199 if (Opc == TargetOpcode::G_CONSTANT)
28200 CI = MI.getOperand(1).getCImm();
28201 else {
28202 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
28203 // We try to estimate cost of 32/64b fpimms, as they'll likely be
28204 // materialized as integers.
28205 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
28206 break;
28207 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
28208 bool OptForSize =
28211 OptForSize))
28212 return true; // Constant should be cheap.
28213 CI =
28214 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
28215 // FP materialization also costs an extra move, from gpr to fpr.
28216 AdditionalCost = 1;
28217 }
28218 APInt Imm = CI->getValue();
28221 assert(Cost.isValid() && "Expected a valid imm cost");
28222
28223 unsigned RematCost = *Cost.getValue();
28224 RematCost += AdditionalCost;
28225 Register Reg = MI.getOperand(0).getReg();
28226 unsigned MaxUses = maxUses(RematCost);
28227 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
28228 if (MaxUses == std::numeric_limits<unsigned>::max())
28229 --MaxUses;
28230 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
28231 }
28232 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
28233 // localizable.
28234 case AArch64::ADRP:
28235 case AArch64::G_ADD_LOW:
28236 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
28237 case TargetOpcode::G_PTR_ADD:
28238 return true;
28239 default:
28240 break;
28241 }
28243}
28244
28246 // Fallback for scalable vectors.
28247 // Note that if EnableSVEGISel is true, we allow scalable vector types for
28248 // all instructions, regardless of whether they are actually supported.
28249 if (!EnableSVEGISel) {
28250 if (Inst.getType()->isScalableTy()) {
28251 return true;
28252 }
28253
28254 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
28255 if (Inst.getOperand(i)->getType()->isScalableTy())
28256 return true;
28257
28258 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
28259 if (AI->getAllocatedType()->isScalableTy())
28260 return true;
28261 }
28262 }
28263
28264 // Checks to allow the use of SME instructions
28265 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
28266 auto CallerAttrs = SMEAttrs(*Inst.getFunction());
28267 auto CalleeAttrs = SMEAttrs(*Base);
28268 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
28269 CallerAttrs.requiresLazySave(CalleeAttrs) ||
28270 CallerAttrs.requiresPreservingZT0(CalleeAttrs) ||
28271 CallerAttrs.requiresPreservingAllZAState(CalleeAttrs))
28272 return true;
28273 }
28274 return false;
28275}
28276
28277// Return the largest legal scalable vector type that matches VT's element type.
28281 "Expected legal fixed length vector!");
28282 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
28283 default:
28284 llvm_unreachable("unexpected element type for SVE container");
28285 case MVT::i8:
28286 return EVT(MVT::nxv16i8);
28287 case MVT::i16:
28288 return EVT(MVT::nxv8i16);
28289 case MVT::i32:
28290 return EVT(MVT::nxv4i32);
28291 case MVT::i64:
28292 return EVT(MVT::nxv2i64);
28293 case MVT::bf16:
28294 return EVT(MVT::nxv8bf16);
28295 case MVT::f16:
28296 return EVT(MVT::nxv8f16);
28297 case MVT::f32:
28298 return EVT(MVT::nxv4f32);
28299 case MVT::f64:
28300 return EVT(MVT::nxv2f64);
28301 }
28302}
28303
28304// Return a PTRUE with active lanes corresponding to the extent of VT.
28306 EVT VT) {
28309 "Expected legal fixed length vector!");
28310
28311 std::optional<unsigned> PgPattern =
28313 assert(PgPattern && "Unexpected element count for SVE predicate");
28314
28315 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
28316 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
28317 // variants of instructions when available.
28318 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
28319 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
28320 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
28321 if (MaxSVESize && MinSVESize == MaxSVESize &&
28322 MaxSVESize == VT.getSizeInBits())
28323 PgPattern = AArch64SVEPredPattern::all;
28324
28325 MVT MaskVT;
28326 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
28327 default:
28328 llvm_unreachable("unexpected element type for SVE predicate");
28329 case MVT::i8:
28330 MaskVT = MVT::nxv16i1;
28331 break;
28332 case MVT::i16:
28333 case MVT::f16:
28334 case MVT::bf16:
28335 MaskVT = MVT::nxv8i1;
28336 break;
28337 case MVT::i32:
28338 case MVT::f32:
28339 MaskVT = MVT::nxv4i1;
28340 break;
28341 case MVT::i64:
28342 case MVT::f64:
28343 MaskVT = MVT::nxv2i1;
28344 break;
28345 }
28346
28347 return getPTrue(DAG, DL, MaskVT, *PgPattern);
28348}
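// Illustrative example (assuming v8i32 is a legal fixed-length type for the
// target): its i32 elements map to an nxv4i1 mask built as
//   ptrue p0.s, vl8
// and if the SVE register size is known to be exactly 256 bits the pattern is
// relaxed to 'all', enabling unpredicated instruction forms.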
28349
28351 EVT VT) {
28353 "Expected legal scalable vector!");
28354 auto PredTy = VT.changeVectorElementType(MVT::i1);
28355 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
28356}
28357
28359 if (VT.isFixedLengthVector())
28360 return getPredicateForFixedLengthVector(DAG, DL, VT);
28361
28362 return getPredicateForScalableVector(DAG, DL, VT);
28363}
28364
28365// Grow V to consume an entire SVE register.
28367 assert(VT.isScalableVector() &&
28368 "Expected to convert into a scalable vector!");
28369 assert(V.getValueType().isFixedLengthVector() &&
28370 "Expected a fixed length vector operand!");
28371 SDLoc DL(V);
28372 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
28373 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
28374}
28375
28376// Shrink V so it's just big enough to maintain a VT's worth of data.
28379 "Expected to convert into a fixed length vector!");
28380 assert(V.getValueType().isScalableVector() &&
28381 "Expected a scalable vector operand!");
28382 SDLoc DL(V);
28383 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
28384 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
28385}
28386
28387// Convert all fixed length vector loads larger than NEON to masked_loads.
28388SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
28389 SDValue Op, SelectionDAG &DAG) const {
28390 auto Load = cast<LoadSDNode>(Op);
28391
28392 SDLoc DL(Op);
28393 EVT VT = Op.getValueType();
28394 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28395 EVT LoadVT = ContainerVT;
28396 EVT MemVT = Load->getMemoryVT();
28397
28398 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
28399
28400 if (VT.isFloatingPoint()) {
28401 LoadVT = ContainerVT.changeTypeToInteger();
28402 MemVT = MemVT.changeTypeToInteger();
28403 }
28404
28405 SDValue NewLoad = DAG.getMaskedLoad(
28406 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
28407 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
28408 Load->getAddressingMode(), Load->getExtensionType());
28409
28410 SDValue Result = NewLoad;
28411 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
28412 EVT ExtendVT = ContainerVT.changeVectorElementType(
28413 Load->getMemoryVT().getVectorElementType());
28414
28415 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
28417 Pg, Result, DAG.getUNDEF(ContainerVT));
28418 } else if (VT.isFloatingPoint()) {
28419 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
28420 }
28421
28422 Result = convertFromScalableVector(DAG, VT, Result);
28423 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
28424 return DAG.getMergeValues(MergedValues, DL);
28425}
28426
28428 SelectionDAG &DAG) {
28429 SDLoc DL(Mask);
28430 EVT InVT = Mask.getValueType();
28431 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
28432
28433 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
28434
28435 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
28436 return Pg;
28437
28438 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
28439 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
28440
28442 {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
28443}
28444
28445// Convert all fixed length vector masked loads larger than NEON to SVE masked loads.
28446SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
28447 SDValue Op, SelectionDAG &DAG) const {
28448 auto Load = cast<MaskedLoadSDNode>(Op);
28449
28450 SDLoc DL(Op);
28451 EVT VT = Op.getValueType();
28452 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28453
28454 SDValue Mask = Load->getMask();
28455 // If this is an extending load and the mask type is not the same as
28456 // the load's type then we have to extend the mask type.
28457 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
28458 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
28459 "Incorrect mask type");
28460 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Mask);
28461 }
28463
28464 SDValue PassThru;
28465 bool IsPassThruZeroOrUndef = false;
28466
28467 if (Load->getPassThru()->isUndef()) {
28468 PassThru = DAG.getUNDEF(ContainerVT);
28469 IsPassThruZeroOrUndef = true;
28470 } else {
28471 if (ContainerVT.isInteger())
28472 PassThru = DAG.getConstant(0, DL, ContainerVT);
28473 else
28474 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
28475 if (isZerosVector(Load->getPassThru().getNode()))
28476 IsPassThruZeroOrUndef = true;
28477 }
28478
28479 SDValue NewLoad = DAG.getMaskedLoad(
28480 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
28481 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
28482 Load->getAddressingMode(), Load->getExtensionType());
28483
28484 SDValue Result = NewLoad;
28485 if (!IsPassThruZeroOrUndef) {
28486 SDValue OldPassThru =
28487 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
28488 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
28489 }
28490
28491 Result = convertFromScalableVector(DAG, VT, Result);
28492 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
28493 return DAG.getMergeValues(MergedValues, DL);
28494}
28495
28496// Convert all fixed length vector stores larger than NEON to masked_stores.
28497SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
28498 SDValue Op, SelectionDAG &DAG) const {
28499 auto Store = cast<StoreSDNode>(Op);
28500
28501 SDLoc DL(Op);
28502 EVT VT = Store->getValue().getValueType();
28503 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28504 EVT MemVT = Store->getMemoryVT();
28505
28506 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
28507 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
28508
28509 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
28510 EVT TruncVT = ContainerVT.changeVectorElementType(
28511 Store->getMemoryVT().getVectorElementType());
28512 MemVT = MemVT.changeTypeToInteger();
28513 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
28514 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
28515 DAG.getUNDEF(TruncVT));
28516 NewValue =
28517 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
28518 } else if (VT.isFloatingPoint()) {
28519 MemVT = MemVT.changeTypeToInteger();
28520 NewValue =
28521 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
28522 }
28523
28524 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
28525 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
28526 Store->getMemOperand(), Store->getAddressingMode(),
28527 Store->isTruncatingStore());
28528}
28529
28530SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
28531 SDValue Op, SelectionDAG &DAG) const {
28532 auto *Store = cast<MaskedStoreSDNode>(Op);
28533
28534 SDLoc DL(Op);
28535 EVT VT = Store->getValue().getValueType();
28536 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28537
28538 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
28540
28541 return DAG.getMaskedStore(
28542 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
28543 Mask, Store->getMemoryVT(), Store->getMemOperand(),
28544 Store->getAddressingMode(), Store->isTruncatingStore());
28545}
28546
28547SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
28548 SDValue Op, SelectionDAG &DAG) const {
28549 SDLoc dl(Op);
28550 EVT VT = Op.getValueType();
28551 EVT EltVT = VT.getVectorElementType();
28552
28553 bool Signed = Op.getOpcode() == ISD::SDIV;
28554 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
28555
28556 bool Negated;
28557 uint64_t SplatVal;
28558 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
28559 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28560 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
28561 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
28562
28563 SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
28564 SDValue Res =
28565 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
28566 if (Negated)
28567 Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
28568 DAG.getConstant(0, dl, ContainerVT), Res);
28569
28570 return convertFromScalableVector(DAG, VT, Res);
28571 }
28572
28573 // Scalable vector i32/i64 DIV is supported.
28574 if (EltVT == MVT::i32 || EltVT == MVT::i64)
28575 return LowerToPredicatedOp(Op, DAG, PredOpcode);
28576
28577 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
28578 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
28579 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
28580 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28581
28582 // If the wider type is legal: extend, op, and truncate.
28583 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
28584 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
28585 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
28586 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
28587 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
28588 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
28589 }
28590
28591 auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
28592 &ExtendOpcode](SDValue Op) {
28593 SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
28594 SDValue IdxHalf =
28595 DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
28596 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
28597 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
28598 return std::pair<SDValue, SDValue>(
28599 {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
28600 DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
28601 };
28602
28603 // If wider type is not legal: split, extend, op, trunc and concat.
28604 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
28605 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
28606 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
28607 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
28608 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
28609 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
28610 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
28611}
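// Rough walk-through of the narrow-element path above using a hypothetical
// v8i16 sdiv: if v8i32 is legal, the operands are sign-extended, divided with
// the predicated i32 divide, and truncated back; otherwise each operand is
// split into two v4i16 halves, extended to v4i32, divided, then truncated and
// concatenated back into v8i16.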
28612
28613SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
28614 SDValue Op, SelectionDAG &DAG) const {
28615 EVT VT = Op.getValueType();
28616 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
28617
28618 SDLoc DL(Op);
28619 SDValue Val = Op.getOperand(0);
28620 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
28621 Val = convertToScalableVector(DAG, ContainerVT, Val);
28622
28623 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
28624 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
28625
28626 // Repeatedly unpack Val until the result is of the desired element type.
28627 switch (ContainerVT.getSimpleVT().SimpleTy) {
28628 default:
28629 llvm_unreachable("unimplemented container type");
28630 case MVT::nxv16i8:
28631 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
28632 if (VT.getVectorElementType() == MVT::i16)
28633 break;
28634 [[fallthrough]];
28635 case MVT::nxv8i16:
28636 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
28637 if (VT.getVectorElementType() == MVT::i32)
28638 break;
28639 [[fallthrough]];
28640 case MVT::nxv4i32:
28641 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
28642 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
28643 break;
28644 }
28645
28646 return convertFromScalableVector(DAG, VT, Val);
28647}
28648
28649SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
28650 SDValue Op, SelectionDAG &DAG) const {
28651 EVT VT = Op.getValueType();
28652 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
28653
28654 SDLoc DL(Op);
28655 SDValue Val = Op.getOperand(0);
28656 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
28657 Val = convertToScalableVector(DAG, ContainerVT, Val);
28658
28659 // Repeatedly truncate Val until the result is of the desired element type.
28660 switch (ContainerVT.getSimpleVT().SimpleTy) {
28661 default:
28662 llvm_unreachable("unimplemented container type");
28663 case MVT::nxv2i64:
28664 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
28665 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
28666 if (VT.getVectorElementType() == MVT::i32)
28667 break;
28668 [[fallthrough]];
28669 case MVT::nxv4i32:
28670 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
28671 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
28672 if (VT.getVectorElementType() == MVT::i16)
28673 break;
28674 [[fallthrough]];
28675 case MVT::nxv8i16:
28676 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
28677 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
28678 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
28679 break;
28680 }
28681
28682 return convertFromScalableVector(DAG, VT, Val);
28683}
28684
28685SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
28686 SDValue Op, SelectionDAG &DAG) const {
28687 EVT VT = Op.getValueType();
28688 EVT InVT = Op.getOperand(0).getValueType();
28689 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
28690
28691 SDLoc DL(Op);
28692 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
28693 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
28694
28695 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
28696}
28697
28698SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
28699 SDValue Op, SelectionDAG &DAG) const {
28700 EVT VT = Op.getValueType();
28701 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
28702
28703 SDLoc DL(Op);
28704 EVT InVT = Op.getOperand(0).getValueType();
28705 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
28706 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
28707
28708 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
28709 Op.getOperand(1), Op.getOperand(2));
28710
28711 return convertFromScalableVector(DAG, VT, ScalableRes);
28712}
28713
28714// Convert vector operation 'Op' to an equivalent predicated operation whereby
28715// the original operation's type is used to construct a suitable predicate.
28716// NOTE: The results for inactive lanes are undefined.
28717SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
28718 SelectionDAG &DAG,
28719 unsigned NewOp) const {
28720 EVT VT = Op.getValueType();
28721 SDLoc DL(Op);
28722 auto Pg = getPredicateForVector(DAG, DL, VT);
28723
28724 if (VT.isFixedLengthVector()) {
28725 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
28726 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28727
28728 // Create list of operands by converting existing ones to scalable types.
28730 for (const SDValue &V : Op->op_values()) {
28731 if (isa<CondCodeSDNode>(V)) {
28732 Operands.push_back(V);
28733 continue;
28734 }
28735
28736 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
28737 EVT VTArg = VTNode->getVT().getVectorElementType();
28738 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
28739 Operands.push_back(DAG.getValueType(NewVTArg));
28740 continue;
28741 }
28742
28743 assert(isTypeLegal(V.getValueType()) &&
28744 "Expected only legal fixed-width types");
28745 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
28746 }
28747
28748 if (isMergePassthruOpcode(NewOp))
28749 Operands.push_back(DAG.getUNDEF(ContainerVT));
28750
28751 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
28752 return convertFromScalableVector(DAG, VT, ScalableRes);
28753 }
28754
28755 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
28756
28758 for (const SDValue &V : Op->op_values()) {
28759 assert((!V.getValueType().isVector() ||
28760 V.getValueType().isScalableVector()) &&
28761 "Only scalable vectors are supported!");
28762 Operands.push_back(V);
28763 }
28764
28765 if (isMergePassthruOpcode(NewOp))
28766 Operands.push_back(DAG.getUNDEF(VT));
28767
28768 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
28769}
28770
28771// If a fixed length vector operation has no side effects when applied to
28772// undefined elements, we can safely use scalable vectors to perform the same
28773// operation without needing to worry about predication.
28774SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
28775 SelectionDAG &DAG) const {
28776 EVT VT = Op.getValueType();
28778 "Only expected to lower fixed length vector operation!");
28779 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28780
28781 // Create list of operands by converting existing ones to scalable types.
28783 for (const SDValue &V : Op->op_values()) {
28784 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
28785
28786 // Pass through non-vector operands.
28787 if (!V.getValueType().isVector()) {
28788 Ops.push_back(V);
28789 continue;
28790 }
28791
28792 // "cast" fixed length vector to a scalable vector.
28793 assert(V.getValueType().isFixedLengthVector() &&
28794 isTypeLegal(V.getValueType()) &&
28795 "Only fixed length vectors are supported!");
28796 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
28797 }
28798
28799 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
28800 return convertFromScalableVector(DAG, VT, ScalableRes);
28801}
28802
28803SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
28804 SelectionDAG &DAG) const {
28805 SDLoc DL(ScalarOp);
28806 SDValue AccOp = ScalarOp.getOperand(0);
28807 SDValue VecOp = ScalarOp.getOperand(1);
28808 EVT SrcVT = VecOp.getValueType();
28809 EVT ResVT = SrcVT.getVectorElementType();
28810
28811 EVT ContainerVT = SrcVT;
28812 if (SrcVT.isFixedLengthVector()) {
28813 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
28814 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
28815 }
28816
28817 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
28818 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
28819
28820 // Convert operands to Scalable.
28821 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
28822 DAG.getUNDEF(ContainerVT), AccOp, Zero);
28823
28824 // Perform reduction.
28825 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
28826 Pg, AccOp, VecOp);
28827
28828 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
28829}
28830
28831SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
28832 SelectionDAG &DAG) const {
28833 SDLoc DL(ReduceOp);
28834 SDValue Op = ReduceOp.getOperand(0);
28835 EVT OpVT = Op.getValueType();
28836 EVT VT = ReduceOp.getValueType();
28837
28838 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
28839 return SDValue();
28840
28841 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
28842
28843 switch (ReduceOp.getOpcode()) {
28844 default:
28845 return SDValue();
28846 case ISD::VECREDUCE_OR:
28847 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
28848 // The predicate can be 'Op' because
28849 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
28850 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
28851 else
28852 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
28853 case ISD::VECREDUCE_AND: {
28854 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
28855 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
28856 }
28857 case ISD::VECREDUCE_XOR: {
28858 SDValue ID =
28859 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
28860 if (OpVT == MVT::nxv1i1) {
28861 // Emulate a CNTP on .Q using .D and a different governing predicate.
28862 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
28863 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
28864 }
28865 SDValue Cntp =
28866 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
28867 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
28868 }
28869 }
28870
28871 return SDValue();
28872}
28873
28874SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
28875 SDValue ScalarOp,
28876 SelectionDAG &DAG) const {
28877 SDLoc DL(ScalarOp);
28878 SDValue VecOp = ScalarOp.getOperand(0);
28879 EVT SrcVT = VecOp.getValueType();
28880
28882 SrcVT,
28883 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
28884 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
28885 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
28886 }
28887
28888 // Lower VECREDUCE_ADD of nxv2i1-nxv16i1 to CNTP rather than UADDV.
28889 if (ScalarOp.getOpcode() == ISD::VECREDUCE_ADD &&
28890 VecOp.getOpcode() == ISD::ZERO_EXTEND) {
28891 SDValue BoolVec = VecOp.getOperand(0);
28892 if (BoolVec.getValueType().getVectorElementType() == MVT::i1) {
28893 // CNTP(BoolVec & BoolVec) <=> CNTP(BoolVec & PTRUE)
28894 SDValue CntpOp = DAG.getNode(
28895 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
28896 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64),
28897 BoolVec, BoolVec);
28898 return DAG.getAnyExtOrTrunc(CntpOp, DL, ScalarOp.getValueType());
28899 }
28900 }
28901
28902 // UADDV always returns an i64 result.
28903 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
28904 SrcVT.getVectorElementType();
28905 EVT RdxVT = SrcVT;
28906 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
28907 RdxVT = getPackedSVEVectorVT(ResVT);
28908
28909 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
28910 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
28911 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
28912 Rdx, DAG.getConstant(0, DL, MVT::i64));
28913
28914 // The VEC_REDUCE nodes expect an element size result.
28915 if (ResVT != ScalarOp.getValueType())
28916 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
28917
28918 return Res;
28919}
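// Illustration of the CNTP special case above (hypothetical types): a
//   vecreduce_add (zero_extend nxv16i1 %p to nxv16i8)
// only counts the active lanes of %p, so it becomes a call to
// @llvm.aarch64.sve.cntp with %p used as both governing and source predicate,
// avoiding the widened vector and UADDV.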
28920
28921SDValue
28922AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
28923 SelectionDAG &DAG) const {
28924 EVT VT = Op.getValueType();
28925 SDLoc DL(Op);
28926
28927 EVT InVT = Op.getOperand(1).getValueType();
28928 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
28929 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
28930 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
28931
28932 // Convert the mask to a predicate (NOTE: We don't need to worry about
28933 // inactive lanes since VSELECT is safe when given undefined elements).
28934 EVT MaskVT = Op.getOperand(0).getValueType();
28935 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
28936 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
28938 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
28939
28940 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
28941 Mask, Op1, Op2);
28942
28943 return convertFromScalableVector(DAG, VT, ScalableRes);
28944}
28945
28946SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
28947 SDValue Op, SelectionDAG &DAG) const {
28948 SDLoc DL(Op);
28949 EVT InVT = Op.getOperand(0).getValueType();
28950 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
28951
28952 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
28953 "Only expected to lower fixed length vector operation!");
28954 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
28955 "Expected integer result of the same bit length as the inputs!");
28956
28957 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
28958 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
28959 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
28960
28961 EVT CmpVT = Pg.getValueType();
28962 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
28963 {Pg, Op1, Op2, Op.getOperand(2)});
28964
28965 EVT PromoteVT = ContainerVT.changeTypeToInteger();
28966 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
28967 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
28968}
28969
28970SDValue
28971AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
28972 SelectionDAG &DAG) const {
28973 SDLoc DL(Op);
28974 auto SrcOp = Op.getOperand(0);
28975 EVT VT = Op.getValueType();
28976 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
28977 EVT ContainerSrcVT =
28978 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
28979
28980 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
28981 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
28982 return convertFromScalableVector(DAG, VT, Op);
28983}
28984
28985SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
28986 SDValue Op, SelectionDAG &DAG) const {
28987 SDLoc DL(Op);
28988 unsigned NumOperands = Op->getNumOperands();
28989
28990 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
28991 "Unexpected number of operands in CONCAT_VECTORS");
28992
28993 auto SrcOp1 = Op.getOperand(0);
28994 auto SrcOp2 = Op.getOperand(1);
28995 EVT VT = Op.getValueType();
28996 EVT SrcVT = SrcOp1.getValueType();
28997
28998 if (NumOperands > 2) {
29000 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
29001 for (unsigned I = 0; I < NumOperands; I += 2)
29002 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
29003 Op->getOperand(I), Op->getOperand(I + 1)));
29004
29005 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
29006 }
29007
29008 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29009
29011 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
29012 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
29013
29014 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
29015
29016 return convertFromScalableVector(DAG, VT, Op);
29017}
29018
29019SDValue
29020AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
29021 SelectionDAG &DAG) const {
29022 EVT VT = Op.getValueType();
29023 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29024
29025 SDLoc DL(Op);
29026 SDValue Val = Op.getOperand(0);
29027 SDValue Pg = getPredicateForVector(DAG, DL, VT);
29028 EVT SrcVT = Val.getValueType();
29029 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29030 EVT ExtendVT = ContainerVT.changeVectorElementType(
29031 SrcVT.getVectorElementType());
29032
29033 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
29034 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
29035
29036 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
29037 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
29038 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
29039 Pg, Val, DAG.getUNDEF(ContainerVT));
29040
29041 return convertFromScalableVector(DAG, VT, Val);
29042}
29043
29044SDValue
29045AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
29046 SelectionDAG &DAG) const {
29047 EVT VT = Op.getValueType();
29048 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29049
29050 SDLoc DL(Op);
29051 SDValue Val = Op.getOperand(0);
29052 EVT SrcVT = Val.getValueType();
29053 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
29054 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
29056 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
29057
29058 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
29059 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
29060 Op.getOperand(1), DAG.getUNDEF(RoundVT));
29061 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
29062 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
29063
29064 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
29065 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
29066}
29067
29068SDValue
29069AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
29070 SelectionDAG &DAG) const {
29071 EVT VT = Op.getValueType();
29072 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29073
29074 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
29075 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
29077
29078 SDLoc DL(Op);
29079 SDValue Val = Op.getOperand(0);
29080 EVT SrcVT = Val.getValueType();
29081 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
29082 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
29083
29084 if (VT.bitsGE(SrcVT)) {
29086
29087 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
29088 VT.changeTypeToInteger(), Val);
29089
29090 // Safe to use a larger than specified operand because by promoting the
29091 // value nothing has changed from an arithmetic point of view.
29092 Val =
29093 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
29094 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
29095 DAG.getUNDEF(ContainerDstVT));
29096 return convertFromScalableVector(DAG, VT, Val);
29097 } else {
29098 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
29099 ContainerDstVT.getVectorElementType());
29101
29102 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
29103 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
29104 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
29105 Val = convertFromScalableVector(DAG, SrcVT, Val);
29106
29107 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
29108 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
29109 }
29110}
29111
29112SDValue
29113AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
29114 SelectionDAG &DAG) const {
29115 SDLoc DL(Op);
29116 EVT OpVT = Op.getValueType();
29117 assert(OpVT.isScalableVector() &&
29118 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
29119 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
29120 Op.getOperand(1));
29121 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
29122 Op.getOperand(1));
29123 return DAG.getMergeValues({Even, Odd}, DL);
29124}
29125
29126SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
29127 SelectionDAG &DAG) const {
29128 SDLoc DL(Op);
29129 EVT OpVT = Op.getValueType();
29130 assert(OpVT.isScalableVector() &&
29131 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
29132
29133 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
29134 Op.getOperand(1));
29135 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
29136 Op.getOperand(1));
29137 return DAG.getMergeValues({Lo, Hi}, DL);
29138}
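// For example (hypothetical operands), interleaving two nxv4i32 vectors A and
// B produces
//   Lo = ZIP1 A, B   // A0,B0,A1,B1,... from the low halves
//   Hi = ZIP2 A, B   // the same pattern from the high halves
// which together form the fully interleaved result returned above.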
29139
29140SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
29141 SelectionDAG &DAG) const {
29142 // FIXME: Maybe share some code with LowerMGather/Scatter?
29143 MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
29144 SDLoc DL(HG);
29145 SDValue Chain = HG->getChain();
29146 SDValue Inc = HG->getInc();
29147 SDValue Mask = HG->getMask();
29148 SDValue Ptr = HG->getBasePtr();
29149 SDValue Index = HG->getIndex();
29150 SDValue Scale = HG->getScale();
29151 SDValue IntID = HG->getIntID();
29152
29153 // The Intrinsic ID determines the type of update operation.
29154 [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
29155 // Right now, we only support 'add' as an update.
29156 assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
29157 "Unexpected histogram update operation");
29158
29159 EVT IndexVT = Index.getValueType();
29160 LLVMContext &Ctx = *DAG.getContext();
29162 EVT MemVT = EVT::getVectorVT(Ctx, HG->getMemoryVT(), EC);
29163 EVT IncExtVT =
29164 EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
29165 EVT IncSplatVT = EVT::getVectorVT(Ctx, IncExtVT, EC);
29166 bool ExtTrunc = IncSplatVT != MemVT;
29167
29168 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
29169 SDValue PassThru = DAG.getSplatVector(IncSplatVT, DL, Zero);
29170 SDValue IncSplat = DAG.getSplatVector(
29171 IncSplatVT, DL, DAG.getAnyExtOrTrunc(Inc, DL, IncExtVT));
29172 SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
29173
29174 MachineMemOperand *MMO = HG->getMemOperand();
29175 // Create an MMO for the gather, without load|store flags.
29176 MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
29177 MMO->getPointerInfo(), MachineMemOperand::MOLoad, MMO->getSize(),
29178 MMO->getAlign(), MMO->getAAInfo());
29179 ISD::MemIndexType IndexType = HG->getIndexType();
29180 SDValue Gather = DAG.getMaskedGather(
29181 DAG.getVTList(IncSplatVT, MVT::Other), MemVT, DL, Ops, GMMO, IndexType,
29182 ExtTrunc ? ISD::EXTLOAD : ISD::NON_EXTLOAD);
29183
29184 SDValue GChain = Gather.getValue(1);
29185
29186 // Perform the histcnt, multiply by inc, add to bucket data.
29187 SDValue ID =
29188 DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncExtVT);
29189 SDValue HistCnt =
29190 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
29191 SDValue Mul = DAG.getNode(ISD::MUL, DL, IncSplatVT, HistCnt, IncSplat);
29192 SDValue Add = DAG.getNode(ISD::ADD, DL, IncSplatVT, Gather, Mul);
29193
29194 // Create an MMO for the scatter, without load|store flags.
29195 MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
29196 MMO->getPointerInfo(), MachineMemOperand::MOStore, MMO->getSize(),
29197 MMO->getAlign(), MMO->getAAInfo());
29198
29199 SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
29200 SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
29201 ScatterOps, SMMO, IndexType, ExtTrunc);
29202 return Scatter;
29203}
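
A scalar sketch of the histogram lowering above (illustrative only, not code from this file). It assumes HISTCNT yields, for each active lane, the number of active lanes up to and including it with the same index, and that scatter stores to the same address resolve in lane order (last write wins); under those assumptions, gather + per-lane count * Inc + scatter matches a sequential per-lane bucket update.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

int main() {
  std::vector<int64_t> Index = {3, 1, 3, 3, 7, 1};
  std::vector<bool> Mask = {true, true, true, false, true, true};
  int64_t Inc = 2;
  std::map<int64_t, int64_t> Mem = {{1, 100}, {3, 200}, {7, 300}};

  // Reference: sequential "bucket[Index[i]] += Inc" for each active lane.
  std::map<int64_t, int64_t> Ref = Mem;
  for (std::size_t I = 0; I < Index.size(); ++I)
    if (Mask[I])
      Ref[Index[I]] += Inc;

  // Vectorised form: gather, running per-lane match count, multiply-add,
  // then scatter (later lanes overwrite earlier ones).
  std::map<int64_t, int64_t> Vec = Mem;
  std::vector<int64_t> Gather(Index.size(), 0), HistCnt(Index.size(), 0);
  for (std::size_t I = 0; I < Index.size(); ++I) {
    if (!Mask[I])
      continue;
    Gather[I] = Vec[Index[I]];
    for (std::size_t J = 0; J <= I; ++J)
      if (Mask[J] && Index[J] == Index[I])
        ++HistCnt[I];
  }
  for (std::size_t I = 0; I < Index.size(); ++I)
    if (Mask[I])
      Vec[Index[I]] = Gather[I] + HistCnt[I] * Inc;

  assert(Vec == Ref);
  return 0;
}
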
29204
29205SDValue
29206AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
29207 SelectionDAG &DAG) const {
29208 EVT VT = Op.getValueType();
29209 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29210
29211 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
29212 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
29213 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
29214
29215 SDLoc DL(Op);
29216 SDValue Val = Op.getOperand(0);
29217 EVT SrcVT = Val.getValueType();
29218 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
29219 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
29220
29221 if (VT.bitsGT(SrcVT)) {
29222 EVT CvtVT = ContainerDstVT.changeVectorElementType(
29223 ContainerSrcVT.getVectorElementType());
29224 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29225
29226 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
29227 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
29228
29229 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
29230 Val = getSVESafeBitCast(CvtVT, Val, DAG);
29231 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
29232 DAG.getUNDEF(ContainerDstVT));
29233 return convertFromScalableVector(DAG, VT, Val);
29234 } else {
29235 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
29236 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
29237
29238 // Safe to use a larger than specified result since an fp_to_int where the
29239 // result doesn't fit into the destination is undefined.
29240 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
29241 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
29242 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
29243
29244 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
29245 }
29246}
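
To illustrate the narrowing branch above (a standalone sketch, not code from this file): whenever the floating-point value fits in the narrow destination, converting to a wider integer and then truncating gives the same result, and out-of-range fp-to-int results are undefined anyway.

#include <cassert>
#include <cstdint>

int main() {
  const float Vals[] = {0.0f, 1.5f, -2.75f, 123456.0f, -40000.0f};
  for (float F : Vals) {
    int32_t Direct = static_cast<int32_t>(F);
    int32_t ViaWide = static_cast<int32_t>(static_cast<int64_t>(F));
    assert(Direct == ViaWide);
  }
  return 0;
}
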
29247
29248static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
29249 ArrayRef<int> ShuffleMask, EVT VT,
29250 EVT ContainerVT, SelectionDAG &DAG) {
29251 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
29252 SDLoc DL(Op);
29253 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
29254 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
29255 bool IsSingleOp =
29256 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
29257
29258 if (!Subtarget.isNeonAvailable() && !MinSVESize)
29259 MinSVESize = 128;
29260
29261 // Bail out on two-operand shuffles if SVE2 is unavailable or if not all
29262 // index values can be represented.
29263 if (!IsSingleOp && !Subtarget.hasSVE2())
29264 return SDValue();
29265
29266 EVT VTOp1 = Op.getOperand(0).getValueType();
29267 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
29268 unsigned IndexLen = MinSVESize / BitsPerElt;
29269 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
29270 uint64_t MaxOffset = maxUIntN(BitsPerElt);
29271 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
29272 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
29273 bool MinMaxEqual = (MinSVESize == MaxSVESize);
29274 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
29275 "Incorrectly legalised shuffle operation");
29276
29277 SmallVector<SDValue, 8> TBLMask;
29278 // If MinSVESize is not equal to MaxSVESize then we need to know which
29279 // TBL mask element needs adjustment.
29280 SmallVector<SDValue, 8> AddRuntimeVLMask;
29281
29282 // Bail out for 8-bit element types, because with a 2048-bit SVE register
29283 // size 8 bits is only sufficient to index into the first source vector.
29284 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
29285 return SDValue();
29286
29287 for (int Index : ShuffleMask) {
29288 // Handling poison index value.
29289 if (Index < 0)
29290 Index = 0;
29291 // If the mask refers to elements in the second operand, then we have to
29292 // offset the index by the number of elements in a vector. If this number
29293 // is not known at compile-time, we need to maintain a mask with 'VL' values
29294 // to add at runtime.
29295 if ((unsigned)Index >= ElementsPerVectorReg) {
29296 if (MinMaxEqual) {
29297 Index += IndexLen - ElementsPerVectorReg;
29298 } else {
29299 Index = Index - ElementsPerVectorReg;
29300 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
29301 }
29302 } else if (!MinMaxEqual)
29303 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
29304 // For 8-bit elements and 1024-bit SVE registers, where MaxOffset equals
29305 // 255, this might point to the last element of the second operand of the
29306 // shufflevector, thus we reject this transform.
29307 if ((unsigned)Index >= MaxOffset)
29308 return SDValue();
29309 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
29310 }
29311
29312 // Choosing an out-of-range index leads to the lane being zeroed, whereas a
29313 // zero index value would perform first-lane duplication for the out-of-range
29314 // elements. For i8 elements an out-of-range index could still be a valid
29315 // index with a 2048-bit vector register size.
29316 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
29317 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
29318 if (!MinMaxEqual)
29319 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
29320 }
29321
29322 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
29323 SDValue VecMask =
29324 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
29325 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
29326
29327 SDValue Shuffle;
29328 if (IsSingleOp)
29329 Shuffle =
29330 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
29331 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
29332 Op1, SVEMask);
29333 else if (Subtarget.hasSVE2()) {
29334 if (!MinMaxEqual) {
29335 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
29336 SDValue VScale = (BitsPerElt == 64)
29337 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
29338 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
29339 SDValue VecMask =
29340 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
29341 SDValue MulByMask = DAG.getNode(
29342 ISD::MUL, DL, MaskType,
29343 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
29344 DAG.getBuildVector(MaskType, DL,
29345 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
29346 SDValue UpdatedVecMask =
29347 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
29348 SVEMask = convertToScalableVector(
29349 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
29350 }
29351 Shuffle =
29352 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
29353 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
29354 Op1, Op2, SVEMask);
29355 }
29356 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
29357 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
29358}
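
A simplified model of the TBL mask rewrite above for the case where the exact register size is known (an illustrative sketch, not code from this file; buildTBLMask, EltsPerOp and the other names are invented for the example, and the unknown-VL path is omitted): indices that select from the second shuffle operand are rebased to the second TBL register, and unused trailing lanes receive an out-of-range index so they read as zero.

#include <cassert>
#include <cstdint>
#include <vector>

std::vector<uint64_t> buildTBLMask(const std::vector<int> &ShuffleMask,
                                   unsigned EltsPerOp, unsigned IndexLen,
                                   uint64_t OutOfRange) {
  std::vector<uint64_t> TBLMask;
  for (int Index : ShuffleMask) {
    if (Index < 0) // poison index
      Index = 0;
    if (static_cast<unsigned>(Index) >= EltsPerOp)
      Index += IndexLen - EltsPerOp; // rebase into the second register
    TBLMask.push_back(static_cast<uint64_t>(Index));
  }
  while (TBLMask.size() < IndexLen)
    TBLMask.push_back(OutOfRange); // pad don't-care lanes
  return TBLMask;
}

int main() {
  // Shuffle <1, 4, 7, -1> of two 4-element operands in 8-element registers.
  auto M = buildTBLMask({1, 4, 7, -1}, /*EltsPerOp=*/4, /*IndexLen=*/8,
                        /*OutOfRange=*/255);
  assert((M == std::vector<uint64_t>{1, 8, 11, 0, 255, 255, 255, 255}));
  return 0;
}
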
29359
29360SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
29361 SDValue Op, SelectionDAG &DAG) const {
29362 EVT VT = Op.getValueType();
29363 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29364
29365 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
29366 auto ShuffleMask = SVN->getMask();
29367
29368 SDLoc DL(Op);
29369 SDValue Op1 = Op.getOperand(0);
29370 SDValue Op2 = Op.getOperand(1);
29371
29372 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29373 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
29374 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
29375
29376 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
29377 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
29378 return MVT::i32;
29379 return ScalarTy;
29380 };
29381
29382 if (SVN->isSplat()) {
29383 unsigned Lane = std::max(0, SVN->getSplatIndex());
29384 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
29385 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
29386 DAG.getConstant(Lane, DL, MVT::i64));
29387 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
29388 return convertFromScalableVector(DAG, VT, Op);
29389 }
29390
29391 bool ReverseEXT = false;
29392 unsigned Imm;
29393 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
29394 Imm == VT.getVectorNumElements() - 1) {
29395 if (ReverseEXT)
29396 std::swap(Op1, Op2);
29397 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
29398 SDValue Scalar = DAG.getNode(
29399 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
29400 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
29401 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
29402 return convertFromScalableVector(DAG, VT, Op);
29403 }
29404
29405 unsigned EltSize = VT.getScalarSizeInBits();
29406 for (unsigned LaneSize : {64U, 32U, 16U}) {
29407 if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), LaneSize)) {
29408 EVT NewVT =
29409 getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
29410 unsigned RevOp;
29411 if (EltSize == 8)
29412 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
29413 else if (EltSize == 16)
29414 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
29415 else
29416 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
29417
29418 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
29419 Op = LowerToPredicatedOp(Op, DAG, RevOp);
29420 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
29421 return convertFromScalableVector(DAG, VT, Op);
29422 }
29423 }
29424
29425 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
29426 isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
29427 if (!VT.isFloatingPoint())
29428 return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
29429
29430 EVT NewVT = getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), 64));
29431 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
29432 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
29433 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
29434 return convertFromScalableVector(DAG, VT, Op);
29435 }
29436
29437 unsigned WhichResult;
29438 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
29439 WhichResult == 0)
29440 return convertFromScalableVector(
29441 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
29442
29443 if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
29444 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
29445 return convertFromScalableVector(
29446 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
29447 }
29448
29449 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
29450 return convertFromScalableVector(
29451 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
29452
29453 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
29454 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
29455 return convertFromScalableVector(
29456 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
29457 }
29458
29459 // Functions like isZIPMask return true when a ISD::VECTOR_SHUFFLE's mask
29460 // represents the same logical operation as performed by a ZIP instruction. In
29461 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
29462 // equivalent to an AArch64 instruction. There's the extra component of
29463 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
29464 // only operated on 64/128bit vector types that have a direct mapping to a
29465 // target register and so an exact mapping is implied.
29466 // However, when using SVE for fixed length vectors, most legal vector types
29467 // are actually sub-vectors of a larger SVE register. When mapping
29468 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
29469 // how the mask's indices translate. Specifically, when the mapping requires
29470 // an exact meaning for a specific vector index (e.g. Index X is the last
29471 // vector element in the register) then such mappings are often only safe when
29472 // the exact SVE register size is known. The main exception to this is when
29473 // indices are logically relative to the first element of either
29474 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
29475 // when converting from fixed-length to scalable vector types (i.e. the start
29476 // of a fixed length vector is always the start of a scalable vector).
29477 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
29478 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
29479 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
29480 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
29481 Op2.isUndef()) {
29482 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
29483 return convertFromScalableVector(DAG, VT, Op);
29484 }
29485
29486 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
29487 WhichResult != 0)
29488 return convertFromScalableVector(
29489 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
29490
29491 if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
29492 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
29493 return convertFromScalableVector(
29494 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
29495 }
29496
29497 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
29498 return convertFromScalableVector(
29499 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
29500
29501 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
29502 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
29503 return convertFromScalableVector(
29504 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
29505 }
29506 }
29507
29508 // Try to widen the shuffle before generating a possibly expensive SVE TBL.
29509 // This may allow the shuffle to be matched as something cheaper like ZIP1.
29510 if (SDValue WideOp = tryWidenMaskForShuffle(Op, DAG))
29511 return WideOp;
29512
29513 // Avoid producing a TBL instruction if we don't know the minimal SVE register
29514 // size, unless NEON is not available, in which case we can assume the minimal
29515 // SVE register size is 128 bits.
29516 if (MinSVESize || !Subtarget->isNeonAvailable())
29517 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
29518 DAG);
29519
29520 return SDValue();
29521}
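
The comment above about exact register sizes can be seen with a small model (an illustrative sketch, not code from this file, using a hypothetical 8-lane container): reversing a 4-element fixed vector that lives in the low lanes of a wider scalable container reverses the whole container, so the data ends up in the wrong lanes unless the container is exactly as wide as the fixed vector.

#include <array>
#include <cassert>
#include <cstddef>
#include <utility>

template <std::size_t N>
std::array<int, N> reverseContainer(std::array<int, N> V) {
  for (std::size_t I = 0; I < N / 2; ++I)
    std::swap(V[I], V[N - 1 - I]);
  return V;
}

int main() {
  // Fixed v4 data {0,1,2,3} held in the low lanes of an 8-lane container.
  std::array<int, 8> Container = {0, 1, 2, 3, /*don't care*/ 0, 0, 0, 0};
  auto Rev = reverseContainer(Container);
  // The reversed fixed vector {3,2,1,0} is NOT in the low 4 lanes.
  assert(!(Rev[0] == 3 && Rev[1] == 2 && Rev[2] == 1 && Rev[3] == 0));
  return 0;
}
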
29522
29523SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
29524 SelectionDAG &DAG) const {
29525 SDLoc DL(Op);
29526 EVT InVT = Op.getValueType();
29527
29528 assert(VT.isScalableVector() && isTypeLegal(VT) &&
29529 InVT.isScalableVector() && isTypeLegal(InVT) &&
29530 "Only expect to cast between legal scalable vector types!");
29531 assert(VT.getVectorElementType() != MVT::i1 &&
29532 InVT.getVectorElementType() != MVT::i1 &&
29533 "For predicate bitcasts, use getSVEPredicateBitCast");
29534
29535 if (InVT == VT)
29536 return Op;
29537
29538 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
29539 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
29540
29541 // Safe bitcasting between unpacked vector types of different element counts
29542 // is currently unsupported because the following is missing the necessary
29543 // work to ensure the result's elements live where they're supposed to within
29544 // an SVE register.
29545 // 01234567
29546 // e.g. nxv2i32 = XX??XX??
29547 // nxv4f16 = X?X?X?X?
29548 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
29549 VT == PackedVT || InVT == PackedInVT) &&
29550 "Unexpected bitcast!");
29551
29552 // Pack input if required.
29553 if (InVT != PackedInVT)
29554 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
29555
29556 if (Subtarget->isLittleEndian() ||
29557 PackedVT.getScalarSizeInBits() == PackedInVT.getScalarSizeInBits())
29558 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
29559 else {
29560 EVT PackedVTAsInt = PackedVT.changeTypeToInteger();
29561 EVT PackedInVTAsInt = PackedInVT.changeTypeToInteger();
29562
29563 // Simulate the effect of casting through memory.
29564 Op = DAG.getNode(ISD::BITCAST, DL, PackedInVTAsInt, Op);
29565 if (PackedInVTAsInt.getScalarSizeInBits() != 8)
29566 Op = DAG.getNode(ISD::BSWAP, DL, PackedInVTAsInt, Op);
29567 Op = DAG.getNode(AArch64ISD::NVCAST, DL, PackedVTAsInt, Op);
29568 if (PackedVTAsInt.getScalarSizeInBits() != 8)
29569 Op = DAG.getNode(ISD::BSWAP, DL, PackedVTAsInt, Op);
29570 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
29571 }
29572
29573 // Unpack result if required.
29574 if (VT != PackedVT)
29575 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
29576
29577 return Op;
29578}
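
To see why the big-endian path above is a "cast through memory", here is a scalar model for one 32-bit container holding two 16-bit elements (an illustrative sketch, not code from this file, assuming element 0 occupies the low bits of the register): a big-endian store/reload bitcast equals BSWAP at the source element size, a plain register reinterpretation, then BSWAP at the destination element size.

#include <cassert>
#include <cstdint>

static uint32_t bswap32(uint32_t X) {
  return (X >> 24) | ((X >> 8) & 0xff00u) | ((X << 8) & 0xff0000u) | (X << 24);
}
static uint16_t bswap16(uint16_t X) { return uint16_t((X >> 8) | (X << 8)); }

int main() {
  uint32_t X = 0x11223344u; // one i32 element

  // Memory round trip: store as big-endian i32, reload as two big-endian i16s.
  uint8_t Mem[4] = {uint8_t(X >> 24), uint8_t(X >> 16), uint8_t(X >> 8),
                    uint8_t(X)};
  uint16_t MemElt0 = uint16_t((Mem[0] << 8) | Mem[1]); // 0x1122
  uint16_t MemElt1 = uint16_t((Mem[2] << 8) | Mem[3]); // 0x3344

  // Register-only sequence: BSWAP(i32), reinterpret halves, BSWAP(i16) each.
  uint32_t Swapped = bswap32(X);
  uint16_t RegElt0 = bswap16(uint16_t(Swapped & 0xffffu));
  uint16_t RegElt1 = bswap16(uint16_t(Swapped >> 16));

  assert(RegElt0 == MemElt0 && RegElt1 == MemElt1);
  return 0;
}
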
29579
29580bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
29581 SDValue N) const {
29582 return ::isAllActivePredicate(DAG, N);
29583}
29584
29585EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
29586 return ::getPromotedVTForPredicate(VT);
29587}
29588
29589bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
29590 SDValue Op, const APInt &OriginalDemandedBits,
29591 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
29592 unsigned Depth) const {
29593
29594 unsigned Opc = Op.getOpcode();
29595 switch (Opc) {
29596 case AArch64ISD::VSHL: {
29597 // Match (VSHL (VLSHR Val X) X)
29598 SDValue ShiftL = Op;
29599 SDValue ShiftR = Op->getOperand(0);
29600 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
29601 return false;
29602
29603 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
29604 return false;
29605
29606 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
29607 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
29608
29609 // Other cases can be handled as well, but this is not
29610 // implemented.
29611 if (ShiftRBits != ShiftLBits)
29612 return false;
29613
29614 unsigned ScalarSize = Op.getScalarValueSizeInBits();
29615 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
29616
29617 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
29618 APInt UnusedBits = ~OriginalDemandedBits;
29619
29620 if ((ZeroBits & UnusedBits) != ZeroBits)
29621 return false;
29622
29623 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
29624 // used - simplify to just Val.
29625 return TLO.CombineTo(Op, ShiftR->getOperand(0));
29626 }
29627 case AArch64ISD::BICi: {
29628 // Fold BICi if all destination bits already known to be zeroed
29629 SDValue Op0 = Op.getOperand(0);
29630 KnownBits KnownOp0 =
29631 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
29632 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
29633 APInt BitsToClear =
29634 (Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
29635 .trunc(KnownOp0.getBitWidth());
29636 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
29637 if (BitsToClear.isSubsetOf(AlreadyZeroedBitsToClear))
29638 return TLO.CombineTo(Op, Op0);
29639
29640 Known = KnownOp0 & KnownBits::makeConstant(~BitsToClear);
29641 return false;
29642 }
29643 case ISD::INTRINSIC_WO_CHAIN: {
29644 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
29645 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
29646 if (!MaxSVEVectorSizeInBits)
29647 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
29648 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
29649 // The SVE count intrinsics don't support the multiplier immediate so we
29650 // don't have to account for that here. The value returned may be slightly
29651 // over the true required bits, as this is based on the "ALL" pattern. The
29652 // other patterns are also exposed by these intrinsics, but they all
29653 // return a value that's strictly less than "ALL".
29654 unsigned RequiredBits = llvm::bit_width(MaxElements);
29655 unsigned BitWidth = Known.Zero.getBitWidth();
29656 if (RequiredBits < BitWidth)
29657 Known.Zero.setHighBits(BitWidth - RequiredBits);
29658 return false;
29659 }
29660 }
29661 }
29662
29663 return TargetLowering::SimplifyDemandedBitsForTargetNode(
29664 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
29665}
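
The VSHL/VLSHR case above rests on a simple bit-level fact, shown here as a standalone sketch (not code from this file): (Val >> C) << C only clears the low C bits, so if none of those bits are demanded by the user of the result, the shift pair can be replaced by Val itself.

#include <cassert>
#include <cstdint>

int main() {
  const unsigned C = 3;
  const uint32_t DemandedBits = ~0u << C; // the user ignores the low C bits
  for (uint32_t Val : {0u, 0x12345678u, 0xffffffffu, 0xdeadbeefu}) {
    uint32_t ShiftPair = (Val >> C) << C;
    assert((ShiftPair & DemandedBits) == (Val & DemandedBits));
  }
  return 0;
}
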
29666
29667bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
29668 return Op.getOpcode() == AArch64ISD::DUP ||
29669 Op.getOpcode() == AArch64ISD::MOVI ||
29670 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
29671 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
29672 TargetLowering::isTargetCanonicalConstantNode(Op);
29673}
29674
29675bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
29676 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
29677 Subtarget->hasComplxNum();
29678}
29679
29680bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
29681 ComplexDeinterleavingOperation Operation, Type *Ty) const {
29682 auto *VTy = dyn_cast<VectorType>(Ty);
29683 if (!VTy)
29684 return false;
29685
29686 // If the vector is scalable, SVE is enabled, implying support for complex
29687 // numbers. Otherwise, we need to ensure complex number support is available
29688 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
29689 return false;
29690
29691 auto *ScalarTy = VTy->getScalarType();
29692 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
29693
29694 // We can only process vectors that have a bit size of 128 or higher (with an
29695 // additional 64 bits for Neon). Additionally, these vectors must have a
29696 // power-of-2 size, as we later split them into the smallest supported size
29697 // and merge them back together after applying the complex operation.
29698 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
29699 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
29700 !llvm::isPowerOf2_32(VTyWidth))
29701 return false;
29702
29703 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
29704 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
29705
29706 if (Operation == ComplexDeinterleavingOperation::CDot)
29707 return ScalarWidth == 32 || ScalarWidth == 64;
29708 return 8 <= ScalarWidth && ScalarWidth <= 64;
29709 }
29710
29711 // CDot is not supported outside of scalable/sve scopes
29712 if (Operation == ComplexDeinterleavingOperation::CDot)
29713 return false;
29714
29715 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
29716 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
29717}
29718
29719Value *AArch64TargetLowering::createComplexDeinterleavingIR(
29720 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
29721 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
29722 Value *Accumulator) const {
29723 VectorType *Ty = cast<VectorType>(InputA->getType());
29724 if (Accumulator == nullptr)
29725 Accumulator = Constant::getNullValue(Ty);
29726 bool IsScalable = Ty->isScalableTy();
29727 bool IsInt = Ty->getElementType()->isIntegerTy();
29728
29729 unsigned TyWidth =
29730 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
29731
29732 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
29733 "Vector type must be either 64 or a power of 2 that is at least 128");
29734
29735 if (TyWidth > 128) {
29736 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
29737 int AccStride = cast<VectorType>(Accumulator->getType())
29738 ->getElementCount()
29739 .getKnownMinValue() /
29740 2;
29741 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
29742 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
29743 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
29744 auto *UpperSplitA =
29745 B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
29746 auto *UpperSplitB =
29747 B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
29748 Value *LowerSplitAcc = nullptr;
29749 Value *UpperSplitAcc = nullptr;
29750 Type *FullTy = Ty;
29751 FullTy = Accumulator->getType();
29752 auto *HalfAccTy = VectorType::getHalfElementsVectorType(
29753 cast<VectorType>(Accumulator->getType()));
29754 LowerSplitAcc =
29755 B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(0));
29756 UpperSplitAcc =
29757 B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(AccStride));
29758 auto *LowerSplitInt = createComplexDeinterleavingIR(
29759 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
29760 auto *UpperSplitInt = createComplexDeinterleavingIR(
29761 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
29762
29763 auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
29764 LowerSplitInt, B.getInt64(0));
29765 return B.CreateInsertVector(FullTy, Result, UpperSplitInt,
29766 B.getInt64(AccStride));
29767 }
29768
29769 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
29770 if (IsScalable) {
29771 if (IsInt)
29772 return B.CreateIntrinsic(
29773 Intrinsic::aarch64_sve_cmla_x, Ty,
29774 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
29775
29776 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
29777 return B.CreateIntrinsic(
29778 Intrinsic::aarch64_sve_fcmla, Ty,
29779 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
29780 }
29781
29782 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
29783 Intrinsic::aarch64_neon_vcmla_rot90,
29784 Intrinsic::aarch64_neon_vcmla_rot180,
29785 Intrinsic::aarch64_neon_vcmla_rot270};
29786
29787
29788 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
29789 {Accumulator, InputA, InputB});
29790 }
29791
29792 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
29793 if (IsScalable) {
29794 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
29795 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
29796 if (IsInt)
29797 return B.CreateIntrinsic(
29798 Intrinsic::aarch64_sve_cadd_x, Ty,
29799 {InputA, InputB, B.getInt32((int)Rotation * 90)});
29800
29801 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
29802 return B.CreateIntrinsic(
29803 Intrinsic::aarch64_sve_fcadd, Ty,
29804 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
29805 }
29806 return nullptr;
29807 }
29808
29809 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
29810 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
29811 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
29812 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
29813 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
29814
29815 if (IntId == Intrinsic::not_intrinsic)
29816 return nullptr;
29817
29818 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
29819 }
29820
29821 if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt &&
29822 IsScalable) {
29823 return B.CreateIntrinsic(
29824 Intrinsic::aarch64_sve_cdot, Accumulator->getType(),
29825 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
29826 }
29827
29828 return nullptr;
29829}
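
The partial complex multiplies emitted above compose according to a simple algebraic identity, shown here as a standalone sketch (not code from this file; the "rotation" labels refer to the algebraic decomposition, not to exact instruction encodings): acc + a*b equals (acc + a.re*b) plus i*(a.im*b), so a full complex multiply-accumulate can be built from two partial operations that each multiply b by one real component of a, the second rotated by 90 degrees.

#include <cassert>
#include <complex>

int main() {
  using C = std::complex<double>;
  C Acc(1.0, -2.0), A(3.0, 4.0), B(-5.0, 6.0);

  C Partial0 = Acc + A.real() * B;                    // "rotation 0" step
  C Partial90 = Partial0 + C(0, 1) * (A.imag() * B);  // "rotation 90" step

  assert(std::abs(Partial90 - (Acc + A * B)) < 1e-12);
  return 0;
}
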
29830
29831bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
29832 unsigned Opc = N->getOpcode();
29833 if (ISD::isExtOpcode(Opc)) {
29834 if (any_of(N->users(),
29835 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
29836 return false;
29837 }
29838 return true;
29839}
29840
29841unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
29842 return Subtarget->getMinimumJumpTableEntries();
29843}
29844
29845MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
29846 CallingConv::ID CC,
29847 EVT VT) const {
29848 bool NonUnitFixedLengthVector =
29849 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
29850 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
29851 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
29852
29853 EVT VT1;
29854 MVT RegisterVT;
29855 unsigned NumIntermediates;
29856 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
29857 RegisterVT);
29858 return RegisterVT;
29859}
29860
29861unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
29862 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
29863 bool NonUnitFixedLengthVector =
29864 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
29865 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
29866 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
29867
29868 EVT VT1;
29869 MVT VT2;
29870 unsigned NumIntermediates;
29871 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
29872 NumIntermediates, VT2);
29873}
29874
29875unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
29876 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
29877 unsigned &NumIntermediates, MVT &RegisterVT) const {
29878 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
29879 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
29880 if (!RegisterVT.isFixedLengthVector() ||
29881 RegisterVT.getFixedSizeInBits() <= 128)
29882 return NumRegs;
29883
29884 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
29885 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
29886 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
29887
29888 // A size mismatch here implies either type promotion or widening and would
29889 // have resulted in scalarisation if larger vectors had not been available.
29890 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
29891 EVT EltTy = VT.getVectorElementType();
29892 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
29893 if (!isTypeLegal(NewVT))
29894 NewVT = EltTy;
29895
29896 IntermediateVT = NewVT;
29897 NumIntermediates = VT.getVectorNumElements();
29898 RegisterVT = getRegisterType(Context, NewVT);
29899 return NumIntermediates;
29900 }
29901
29902 // SVE VLS support does not introduce a new ABI so we should use NEON sized
29903 // types for vector arguments and returns.
29904
29905 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
29906 NumIntermediates *= NumSubRegs;
29907 NumRegs *= NumSubRegs;
29908
29909 switch (RegisterVT.getVectorElementType().SimpleTy) {
29910 default:
29911 llvm_unreachable("unexpected element type for vector");
29912 case MVT::i8:
29913 IntermediateVT = RegisterVT = MVT::v16i8;
29914 break;
29915 case MVT::i16:
29916 IntermediateVT = RegisterVT = MVT::v8i16;
29917 break;
29918 case MVT::i32:
29919 IntermediateVT = RegisterVT = MVT::v4i32;
29920 break;
29921 case MVT::i64:
29922 IntermediateVT = RegisterVT = MVT::v2i64;
29923 break;
29924 case MVT::f16:
29925 IntermediateVT = RegisterVT = MVT::v8f16;
29926 break;
29927 case MVT::f32:
29928 IntermediateVT = RegisterVT = MVT::v4f32;
29929 break;
29930 case MVT::f64:
29931 IntermediateVT = RegisterVT = MVT::v2f64;
29932 break;
29933 case MVT::bf16:
29934 IntermediateVT = RegisterVT = MVT::v8bf16;
29935 break;
29936 }
29937
29938 return NumRegs;
29939}
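
To illustrate the adjustment above with hypothetical numbers (a standalone sketch, not code from this file): if the generic breakdown chose a single 512-bit fixed-length register for an argument, the SVE VLS calling convention re-expresses it as NEON-sized pieces, i.e. four 128-bit registers, keeping the ABI identical to a non-SVE build.

#include <cassert>

int main() {
  unsigned RegisterBits = 512; // width chosen by the generic breakdown
  unsigned NumRegs = 1, NumIntermediates = 1;

  unsigned NumSubRegs = RegisterBits / 128; // split into NEON-sized registers
  NumIntermediates *= NumSubRegs;
  NumRegs *= NumSubRegs;

  assert(NumSubRegs == 4 && NumRegs == 4 && NumIntermediates == 4);
  return 0;
}
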
29940
29941bool AArch64TargetLowering::hasInlineStackProbe(
29942 const MachineFunction &MF) const {
29943 return !Subtarget->isTargetWindows() &&
29944 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
29945}
29946
29948 switch (Opc) {
29952 if (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)
29953 return true;
29954 }
29955
29957}
29958
29959#ifndef NDEBUG
29960void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
29961 switch (N->getOpcode()) {
29962 default:
29963 break;
29964 case AArch64ISD::SADDWT:
29965 case AArch64ISD::SADDWB:
29966 case AArch64ISD::UADDWT:
29967 case AArch64ISD::UADDWB: {
29968 assert(N->getNumValues() == 1 && "Expected one result!");
29969 assert(N->getNumOperands() == 2 && "Expected two operands!");
29970 EVT VT = N->getValueType(0);
29971 EVT Op0VT = N->getOperand(0).getValueType();
29972 EVT Op1VT = N->getOperand(1).getValueType();
29973 assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
29974 VT.isInteger() && Op0VT.isInteger() && Op1VT.isInteger() &&
29975 "Expected integer vectors!");
29976 assert(VT == Op0VT &&
29977 "Expected result and first input to have the same type!");
29978 assert(Op0VT.getSizeInBits() == Op1VT.getSizeInBits() &&
29979 "Expected vectors of equal size!");
29980 assert(Op0VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount() &&
29981 "Expected result vector and first input vector to have half the "
29982 "lanes of the second input vector!");
29983 break;
29984 }
29985 case AArch64ISD::SUNPKLO:
29986 case AArch64ISD::SUNPKHI:
29987 case AArch64ISD::UUNPKLO:
29988 case AArch64ISD::UUNPKHI: {
29989 assert(N->getNumValues() == 1 && "Expected one result!");
29990 assert(N->getNumOperands() == 1 && "Expected one operand!");
29991 EVT VT = N->getValueType(0);
29992 EVT OpVT = N->getOperand(0).getValueType();
29993 assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
29994 VT.isInteger() && "Expected integer vectors!");
29995 assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
29996 "Expected vectors of equal size!");
29998 "Expected result vector with half the lanes of its input!");
29999 break;
30000 }
30001 case AArch64ISD::TRN1:
30002 case AArch64ISD::TRN2:
30003 case AArch64ISD::UZP1:
30004 case AArch64ISD::UZP2:
30005 case AArch64ISD::ZIP1:
30006 case AArch64ISD::ZIP2: {
30007 assert(N->getNumValues() == 1 && "Expected one result!");
30008 assert(N->getNumOperands() == 2 && "Expected two operands!");
30009 EVT VT = N->getValueType(0);
30010 EVT Op0VT = N->getOperand(0).getValueType();
30011 EVT Op1VT = N->getOperand(1).getValueType();
30012 assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
30013 "Expected vectors!");
30014 assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
30015 break;
30016 }
30017 case AArch64ISD::RSHRNB_I: {
30018 assert(N->getNumValues() == 1 && "Expected one result!");
30019 assert(N->getNumOperands() == 2 && "Expected two operands!");
30020 EVT VT = N->getValueType(0);
30021 EVT Op0VT = N->getOperand(0).getValueType();
30022 EVT Op1VT = N->getOperand(1).getValueType();
30023 assert(VT.isVector() && VT.isInteger() &&
30024 "Expected integer vector result type!");
30025 assert(Op0VT.isVector() && Op0VT.isInteger() &&
30026 "Expected first operand to be an integer vector!");
30027 assert(VT.getSizeInBits() == Op0VT.getSizeInBits() &&
30028 "Expected vectors of equal size!");
30030 "Expected input vector with half the lanes of its result!");
30031 assert(Op1VT == MVT::i32 && isa<ConstantSDNode>(N->getOperand(1)) &&
30032 "Expected second operand to be a constant i32!");
30033 break;
30034 }
30035 }
30036}
30037#endif
static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG)
static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG)
Perform the scalar expression combine in the form of: CSEL(c, 1, cc) + b => CSINC(b+c,...
static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
bool shouldUseFormStridedPseudo(MachineInstr &MI)
static std::optional< uint64_t > getConstantLaneNumOfExtractHalfOperand(SDValue &Op)
static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, SelectionDAG &DAG, unsigned &NumSubLoads)
static bool cannotBeIntMin(SDValue CheckedVal, SelectionDAG &DAG)
static SDValue performLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, StoreSDNode *Store)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static Value * createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op, FixedVectorType *DstTy, bool IsLittleEndian)
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static cl::opt< unsigned > MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, cl::desc("Maximum of xors"))
#define LCALLNAME5(A, B)
static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performMULLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static const MVT MVT_CC
Value type used for condition codes.
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performReinterpretCastCombine(SDNode *N)
static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, SelectionDAG &DAG, AArch64FunctionInfo *Info, SDLoc DL, SDValue Chain, bool IsSave)
SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
SDValue tryLowerPartialReductionToDot(SDNode *N, const AArch64Subtarget *Subtarget, SelectionDAG &DAG)
static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N)
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &dl)
static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Simplify Addr given that the top byte of it is ignored by HW during address translation.
static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG)
static bool isAllInactivePredicate(SDValue N)
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDLoc DL, SelectionDAG &DAG)
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth, unsigned NumElts, bool IsLittleEndian, SmallVectorImpl< int > &Mask)
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z, SelectionDAG &DAG)
static SDValue performANDSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static const TargetRegisterClass * getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, SDNode *AndNode, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex, unsigned CC)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
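A minimal illustrative skeleton, not code from this file: the many static performXxxCombine helpers indexed above generally share one shape -- inspect the node, return an empty SDValue() when the pattern does not apply, otherwise build and return the replacement node. The helper name and the placeholder rewrite below are assumptions for illustration only.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Illustrative skeleton only (hypothetical helper, not from this file).
static SDValue performExampleCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue(); // empty result means "no combine performed"
  SDLoc DL(N);
  // Placeholder rewrite: re-emit operand 0 through a same-type bitcast.
  return DAG.getNode(ISD::BITCAST, DL, VT, N->getOperand(0));
}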
@ Generic
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
static bool isConstant(const MachineInstr &MI)
static const LLT S1
static const LLT F32
AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
@ Scaled
static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI, Type *T)
@ OP_VEXT3
@ OP_VTRNR
@ OP_VDUP1
@ OP_VZIPR
@ OP_VUZPR
@ OP_VREV
@ OP_VZIPL
@ OP_VTRNL
@ OP_COPY
@ OP_VEXT1
@ OP_VDUP0
@ OP_VEXT2
@ OP_VUZPL
@ OP_VDUP3
@ OP_VDUP2
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static void replaceAllUsesWith(Value *Old, Value *New, SmallSet< BasicBlock *, 32 > &FreshBBs, bool IsHuge)
Replace all old uses with new ones, and push the updated BBs into FreshBBs.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
return RetTy
Returns the sub type a function will return at a given Idx; should correspond to the result type of an ExtractValue instruction executed with just that one Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
Symbol * Sym
Definition: ELF_riscv.cpp:479
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static Function * getFunction(Constant *C)
Definition: Evaluator.cpp:235
static bool isSigned(unsigned int Opcode)
#define Check(C,...)
#define im(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
lazy value info
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
This file provides utility analysis objects describing memory locations.
This file defines ARC utility functions which are used by various parts of the compiler.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
static bool getVal(MDTuple *MD, const char *Key, uint64_t &Val)
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static LLVM_ATTRIBUTE_ALWAYS_INLINE MVT::SimpleValueType getSimpleVT(const unsigned char *MatcherTable, unsigned &MatcherIndex)
getSimpleVT - Decode a value in MatcherTable, if it's a VBR encoded value, use GetVBR to decode it.
This file defines the SmallSet class.
This file defines less commonly used SmallVector utilities.
This file defines the SmallVector class.
static Split data
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
static const int BlockSize
Definition: TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static constexpr int Concat[]
Value * RHS
Value * LHS
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
void setVarArgsStackOffset(unsigned Offset)
void setTailCallReservedStack(unsigned bytes)
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setHasSwiftAsyncContext(bool HasContext)
void setSMESaveBufferUsed(bool Used=true)
void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym)
void setArgumentStackToRestore(unsigned bytes)
void setHasStreamingModeChanges(bool HasChanges)
bool isLegalAddressingMode(unsigned NumBytes, int64_t Offset, unsigned Scale) const
void UpdateCustomCalleeSavedRegs(MachineFunction &MF) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getMinimumJumpTableEntries() const
const AArch64InstrInfo * getInstrInfo() const override
const char * getSecurityCheckCookieName() const
unsigned getMaximumJumpTableSize() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
std::optional< uint16_t > getPtrAuthBlockAddressDiscriminatorIfEnabled(const Function &ParentFn) const
Compute the integer discriminator for a given BlockAddress constant, if blockaddress signing is enabl...
unsigned classifyGlobalFunctionReference(const GlobalValue *GV, const TargetMachine &TM) const
bool isStreamingSVEAvailable() const
Returns true if the target has access to the streaming-compatible subset of SVE instructions.
Align getPrefLoopAlignment() const
Align getPrefFunctionAlignment() const
unsigned getMaxBytesForLoopAlignment() const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
const Triple & getTargetTriple() const
bool isStreamingCompatible() const
Returns true if the function has a streaming-compatible body.
const char * getChkStkName() const
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
bool useSVEForFixedLengthVectors() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isStreaming() const
Returns true if the function has a streaming body.
bool isXRegisterReserved(size_t i) const
unsigned getMaxSVEVectorSizeInBits() const
bool isCallingConvWin64(CallingConv::ID CC, bool IsVarArg) const
unsigned getMinSVEVectorSizeInBits() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
bool hasCustomCallingConv() const
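A hedged sketch, assuming a hypothetical helper name: the subtarget predicates listed above are the kind of queries lowering code makes before choosing an SVE path for fixed-length vectors.

#include "AArch64Subtarget.h"
#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

// Hypothetical helper (not from this file) gating an SVE fixed-length path
// on the subtarget queries documented above.
static bool preferSVEForFixedVector(const AArch64Subtarget &ST, EVT VT) {
  if (!ST.isSVEorStreamingSVEAvailable())
    return false;
  // Only fixed-length vectors are candidates for the SVE fixed-length path.
  return VT.isFixedLengthVector() && ST.useSVEForFixedLengthVectors();
}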
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override
Return true if the @llvm.experimental.vector.partial.reduce.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
MachineBasicBlock * EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const override
Return the prefered common base offset.
bool shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert a trailing fence without reducing the ordering f...
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
MachineBasicBlock * EmitInitTPIDR2Object(MachineInstr &MI, MachineBasicBlock *BB) const
MachineBasicBlock * EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldExpandCmpUsingSelects(EVT VT) const override
Should we expand [US]CMP nodes using two selects and two compares, or by doing arithmetic on boolean ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns true if it is reasonable to merge stores to MemVT size.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool shouldRemoveRedundantExtend(SDValue Op) const override
Return true (the default) if it is profitable to remove a sext_inreg(x) where the sext is redundant,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
bool optimizeExtendOrTruncateConversion(Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override
Try to optimize extending or truncating conversion instructions (like zext, trunc,...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool hasInlineStackProbe(const MachineFunction &MF) const override
True if stack clash protection is enabled for this function.
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) and add (add x, 1), y. The variant with two add's is IR...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool isOpSuitableForLSE128(const Instruction *I) const
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower an interleaved load into a ldN intrinsic.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isLegalAddScalableImmediate(int64_t) const override
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
MachineBasicBlock * EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return true if the target supports combining a chain like:
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool isOpSuitableForRCPC3(const Instruction *I) const
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower an interleaved store into a stN intrinsic.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
MachineBasicBlock * EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const
MachineBasicBlock * EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldExpandVectorMatch(EVT VT, unsigned SearchSize) const override
Return true if the @llvm.experimental.vector.match intrinsic should be expanded for vector type ‘VT’ ...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
Control the following reassociation of operands: (op (op x, c1), y) -> (op (op x, y),...
void verifyTargetSDNode(const SDNode *N) const override
Check the given SDNode. Aborts if it is invalid.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
MachineBasicBlock * EmitAllocateSMESaveBuffer(MachineInstr &MI, MachineBasicBlock *BB) const
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
MachineBasicBlock * EmitAllocateZABuffer(MachineInstr &MI, MachineBasicBlock *BB) const
bool lowerInterleaveIntrinsicToStore(StoreInst *SI, ArrayRef< Value * > InterleaveValues) const override
Lower an interleave intrinsic to a target specific store intrinsic.
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI, ArrayRef< Value * > DeinterleaveValues) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
Value * getIRStackGuard(IRBuilderBase &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
MachineBasicBlock * EmitGetSMESaveSize(MachineInstr &MI, MachineBasicBlock *BB) const
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, SDValue PStateSM=SDValue()) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
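A simplified sketch, not the in-tree implementation, of the check behind an isLegalAddImmediate-style hook listed above: AArch64 ADD/SUB take a 12-bit unsigned immediate, optionally shifted left by 12, and negative values are handled by switching ADD to SUB. The function name is an assumption.

#include <cstdint>

// Simplified sketch (hypothetical helper, not the in-tree implementation).
static bool isLegalAddImmSketch(int64_t Imm) {
  uint64_t V = static_cast<uint64_t>(Imm);
  if (Imm < 0)
    V = 0 - V; // use the SUB form for negative immediates
  return (V & ~0xfffULL) == 0 || (V & ~0xfff000ULL) == 0;
}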
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:449
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:986
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1520
static void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition: APInt.cpp:1864
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1392
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1007
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1468
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:209
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:329
APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1902
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1166
APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1909
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1618
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:219
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1015
unsigned logBase2() const
Definition: APInt.h:1739
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
bool isMask(unsigned numBits) const
Definition: APInt.h:488
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:334
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:959
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1237
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:389
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
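A small standalone sketch using two of the APInt predicates documented above; the helper name is hypothetical, but this is the kind of query a mul-by-constant combine makes.

#include "llvm/ADT/APInt.h"
using namespace llvm;

// Hypothetical helper: report the shift amount if C is a power of two.
static bool getShiftForPow2(const APInt &C, unsigned &ShiftAmt) {
  if (!C.isPowerOf2())
    return false;
  ShiftAmt = C.logBase2(); // exact, since exactly one bit is set
  return true;
}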
an instruction to allocate memory on the stack
Definition: Instructions.h:63
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:501
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ And
*p = old & v
Definition: Instructions.h:724
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
@ Nand
*p = ~(old & v)
Definition: Instructions.h:726
bool isFloatingPointOperation() const
Definition: Instructions.h:882
BinOp getOperation() const
Definition: Instructions.h:805
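A hypothetical sketch of the kind of dispatch a shouldExpandAtomicRMWInIR-style hook performs on AtomicRMWInst::BinOp; the policy shown is illustrative, not AArch64's actual one.

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Illustrative policy only (not the in-tree AArch64 behaviour).
static bool wantsCmpXchgExpansion(const AtomicRMWInst *RMW) {
  if (RMW->isFloatingPointOperation())
    return true; // FP read-modify-write rarely maps to a single instruction
  return RMW->getOperation() == AtomicRMWInst::Nand; // no native NAND RMW
}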
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:95
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:220
const BlockAddress * getBlockAddress() const
The address of a basic block.
Definition: Constants.h:893
Function * getFunction() const
Definition: Constants.h:923
A "pseudo-class" with methods for operating on BUILD_VECTORs.
ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
std::optional< std::pair< APInt, APInt > > isConstantSequence() const
If this BuildVector is constant and represents the numerical series "<a, a+n, a+2n,...
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power or 2,...
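A simplified sketch, not code from this file, of the usual way combines query an integer splat constant from a BUILD_VECTOR through the interface above.

#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Hypothetical helper wrapping BuildVectorSDNode::isConstantSplat.
static bool getIntegerSplat(const BuildVectorSDNode *BV, APInt &SplatValue) {
  APInt SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  return BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                             HasAnyUndefs);
}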
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
unsigned arg_size() const
Definition: InstrTypes.h:1284
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1494
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:148
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1421
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:197
IntegerType * getIntPtrType(LLVMContext &C, unsigned AddressSpace=0) const
Returns an integer type with size at least as big as that of a pointer in the given address space.
Definition: DataLayout.cpp:851
bool isBigEndian() const
Definition: DataLayout.h:198
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:847
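A minimal sketch with a hypothetical helper name, combining the DataLayout queries above to get a type's allocation size in bytes and its preferred alignment (assuming a non-scalable type).

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
#include <cstdint>
#include <utility>
using namespace llvm;

// Hypothetical helper; assumes Ty is not a scalable vector type.
static std::pair<uint64_t, Align> allocSizeAndAlign(const DataLayout &DL,
                                                    Type *Ty) {
  return {DL.getTypeAllocSize(Ty).getFixedValue(), DL.getPrefTypeAlign(Ty)};
}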
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:314
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
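A tiny standalone illustration of ElementCount: a fixed <4 x i32> vector corresponds to ElementCount::getFixed(4), a scalable <vscale x 4 x i32> to getScalable(4). The helper name is an assumption.

#include "llvm/Support/TypeSize.h"
using namespace llvm;

// Hypothetical helper: build a fixed or scalable element count.
static ElementCount makeEC(unsigned MinLanes, bool Scalable) {
  return Scalable ? ElementCount::getScalable(MinLanes)
                  : ElementCount::getFixed(MinLanes);
}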
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
static FixedVectorType * getInteger(FixedVectorType *VTy)
Definition: DerivedTypes.h:575
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:137
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:707
bool empty() const
Definition: Function.h:859
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:704
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:277
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1048
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:353
arg_iterator arg_end()
Definition: Function.h:877
arg_iterator arg_begin()
Definition: Function.h:868
size_t size() const
Definition: Function.h:858
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:731
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
Definition: GlobalValue.h:264
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:530
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:657
const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition: Globals.cpp:130
Type * getValueType() const
Definition: GlobalValue.h:297
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:113
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2162
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1072
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2511
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1887
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1080
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2106
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:558
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2199
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Definition: IRBuilder.cpp:1163
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2555
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:485
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2079
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2147
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1480
ConstantInt * getInt8(uint8_t C)
Get a constant 8-bit value.
Definition: IRBuilder.h:495
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2093
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:193
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1874
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:510
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:900
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2152
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:516
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1459
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2033
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2533
LLVMContext & getContext() const
Definition: IRBuilder.h:195
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2142
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2449
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2019
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1540
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:588
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1499
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:535
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
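A minimal hedged sketch with a hypothetical helper, not code from this file, using two of the IRBuilder methods listed above: splat a scalar and OR it into a vector.

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical helper: OR a splatted scalar into an existing vector value.
static Value *orWithSplat(IRBuilderBase &B, Value *Vec, Value *Scalar,
                          unsigned NumElts) {
  Value *Splat = B.CreateVectorSplat(NumElts, Scalar, "splat");
  return B.CreateOr(Vec, Splat, "or.splat");
}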
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:68
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:72
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:76
Class to represent integer types.
Definition: DerivedTypes.h:42
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:264
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
Definition: LowLevelType.h:200
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
Type * getPointerOperandType() const
Definition: Instructions.h:258
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static constexpr unsigned NoRegister
Definition: MCRegister.h:52
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getScalableVectorVT(MVT VT, unsigned NumElements)
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:237
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
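A small sketch using the MVT helpers above, under the assumption that the widened type has a simple MVT: double each lane's width while keeping the lane count, e.g. v4i16 -> v4i32. The helper name is hypothetical.

#include "llvm/CodeGenTypes/MachineValueType.h"
#include <cassert>
using namespace llvm;

// Hypothetical helper; assumes an integer fixed-width vector input.
static MVT widenElements(MVT VT) {
  assert(VT.isVector() && VT.isInteger() && "expected an integer vector MVT");
  unsigned EltBits = static_cast<unsigned>(VT.getScalarSizeInBits());
  MVT WideElt = MVT::getIntegerVT(EltBits * 2);
  return MVT::getVectorVT(WideElt, VT.getVectorNumElements());
}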
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
MachineInstr * remove_instr(MachineInstr *I)
Remove the possibly bundled instruction from the instruction list without deleting it.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
int getStackProtectorIndex() const
Return the index for the stack protector object.
int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
void RemoveStackObject(int ObjectIdx)
Remove or mark dead a statically sized stack object.
int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
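A hedged sketch of the MachineFrameInfo calls above, in the way a backend might create and tag a stack slot; MF is an assumed MachineFunction reference, and the size, alignment, and stack ID are illustrative:
  MachineFrameInfo &MFI = MF.getFrameInfo();
  int FI = MFI.CreateStackObject(/*Size=*/16, Align(16), /*isSpillSlot=*/false);
  MFI.setStackID(FI, TargetStackID::ScalableVector); // mark as a scalable-vector slot
  int64_t Bytes = MFI.getObjectSize(FI);             // 16
  (void)Bytes;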
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
getTarget - Return the target machine this machine code is compiled with.
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
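A sketch of the MachineInstrBuilder chain above, in the style of a custom inserter; TII, MBB, MI, DL, DestReg, and SrcReg are assumed to be in scope, and the ADDXri opcode and operands are only an example:
  BuildMI(MBB, MI, DL, TII->get(AArch64::ADDXri), DestReg)
      .addReg(SrcReg)
      .addImm(16)  // imm12
      .addImm(0);  // no shift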
Representation of each machine instruction.
Definition: MachineInstr.h:71
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
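A sketch, assuming MF and a frame index FI already exist, of allocating a MachineMemOperand with the flags listed above; the 64-bit scalar memory type and the alignment are illustrative:
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo::getFixedStack(MF, FI),
      MachineMemOperand::MOLoad, LLT::scalar(64), Align(8));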
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition: MapVector.h:60
This class is used to represent an MGATHER node.
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getIndex() const
const SDValue & getScale() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
ISD::MemIndexType getIndexType() const
How Index is applied to BasePtr when computing addresses.
const SDValue & getInc() const
const SDValue & getScale() const
const SDValue & getMask() const
const SDValue & getIntID() const
const SDValue & getIndex() const
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
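A sketch of inspecting one of the masked memory nodes above during a combine, assuming N is already known to be an ISD::MLOAD node:
  auto *MLd = cast<MaskedLoadSDNode>(N);
  if (MLd->isUnindexed() && MLd->getExtensionType() == ISD::NON_EXTLOAD) {
    SDValue Mask = MLd->getMask();
    SDValue PassThru = MLd->getPassThru();
    EVT MemVT = MLd->getMemoryVT();
    // ... decide whether the load can be rewritten ...
  }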
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition: Module.cpp:712
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
void dump() const
Definition: Pass.cpp:136
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1878
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
void dump() const
Dump this node, for debugging.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Return true if the type of the node is undefined.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
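A sketch of the usual SDValue/SDNode inspection pattern that the accessors above support; Op is an assumed SDValue input to a combine:
  if (Op.getOpcode() == ISD::AND && Op.hasOneUse()) {
    SDValue LHS = Op.getOperand(0);
    SDValue RHS = Op.getOperand(1);
    EVT VT = Op.getValueType();
    // ... match a more specific pattern on LHS/RHS ...
  }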
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasAgnosticZAInterface() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasZAState() const
bool hasSharedZAInterface() const
Class to represent scalable SIMD vectors.
Definition: DerivedTypes.h:610
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:812
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:751
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:499
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:802
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:503
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:761
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:857
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:828
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:497
bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:505
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType)
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:713
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getSplatVector(EVT VT, const SDLoc &DL, SDValue Op)
Definition: SelectionDAG.h:891
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1, SDValue Op2)
Create the DAG equivalent of vector_partial_reduce where Op1 and Op2 are its operands and ReducedTY i...
bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:498
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:701
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
void addCalledGlobal(const SDNode *Node, const GlobalValue *GV, unsigned OpFlags)
Set CalledGlobal to be associated with Node.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:797
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:492
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:874
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
Definition: SelectionDAG.h:510
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:580
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
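A sketch of node construction with the SelectionDAG builders listed above, assuming DAG, DL, VT, and an integer SDValue Op are in scope inside a lowering routine; it expands an integer absolute value purely as an illustration:
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue Neg  = DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
  SDValue Abs  = DAG.getSelectCC(DL, Op, Zero, Neg, Op, ISD::SETLT);
  return Abs;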
This instruction constructs a fixed permutation of two input vectors.
static bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
VectorType * getType() const
Overload to return most specific vector type.
static bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
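A sketch of the shuffle-mask helpers above, assuming SVN is a ShuffleVectorSDNode:
  ArrayRef<int> Mask = SVN->getMask();
  if (ShuffleVectorInst::isReverseMask(Mask, Mask.size())) {
    // The shuffle reverses a single source vector.
  }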
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:175
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:286
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
An instruction for storing to memory.
Definition: Instructions.h:292
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:470
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:571
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:265
StringRef drop_front(size_t N=1) const
Return a StringRef equal to 'this' but with the first N elements dropped.
Definition: StringRef.h:609
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition: StringRef.h:684
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
StringRef save(const char *S)
Definition: StringSaver.h:30
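A sketch of the StringRef slicing used when parsing register names in inline-asm constraints; the name "w11" and the base-10 radix are illustrative:
  StringRef Name = "w11";
  unsigned RegNo = 0;
  if (Name.starts_with("w") && !Name.drop_front().getAsInteger(10, RegNo)) {
    // RegNo == 11; getAsInteger returns true on parse failure.
  }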
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
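A sketch of StringSwitch as it is typically used to classify constraint strings; the cases and result values are examples only, and Constraint is an assumed StringRef:
  unsigned Kind = StringSwitch<unsigned>(Constraint)
                      .Case("r", 0)
                      .Case("w", 1)
                      .Default(~0u);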
Class to represent struct types.
Definition: DerivedTypes.h:218
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:406
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target-specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool shouldExpandBuildVectorWithShuffles(EVT, unsigned DefinedValues) const
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
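A sketch of the kind of configuration a target-lowering constructor performs with the hooks listed above; the register class, opcodes, and actions are illustrative rather than this file's exact settings, and Subtarget is assumed to be in scope:
  addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
  setOperationAction(ISD::CTPOP, MVT::i64, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i32, Legal);
  computeRegisterProperties(Subtarget->getRegisterInfo());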
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
virtual bool useLoadStackGuardNode(const Module &M) const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition: Triple.h:665
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static Type * getHalfTy(LLVMContext &C)
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:261
static Type * getBFloatTy(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
@ HalfTyID
16-bit floating point type
Definition: Type.h:56
@ FloatTyID
32-bit floating point type
Definition: Type.h:58
@ BFloatTyID
16-bit floating point type (7-bit significand)
Definition: Type.h:57
@ DoubleTyID
64-bit floating point type
Definition: Type.h:59
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:310
static IntegerType * getInt16Ty(LLVMContext &C)
bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt8Ty(LLVMContext &C)
static IntegerType * getInt128Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:72
Value * getOperand(unsigned i) const
Definition: User.h:228
unsigned getNumOperands() const
Definition: User.h:250
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:5304
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:531
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:478
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
Definition: DerivedTypes.h:496
Type * getElementType() const
Definition: DerivedTypes.h:460
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:254
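A sketch of building a scalable vector IR type with the factories above, assuming an LLVMContext &Ctx:
  Type *I32 = Type::getInt32Ty(Ctx);
  VectorType *NxV4I32 = VectorType::get(I32, ElementCount::getScalable(4));
  // Equivalently: ScalableVectorType::get(I32, /*MinNumElts=*/4)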
self_iterator getIterator()
Definition: ilist_node.h:132
#define UINT64_MAX
Definition: DataTypes.h:77
#define INT64_MAX
Definition: DataTypes.h:71
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ NVCAST
Natural vector cast.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
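A sketch of the logical-immediate helpers above; the constant is one example of a repeating bit pattern that AArch64 can encode:
  uint64_t Imm = 0x00FF00FF00FF00FFULL;
  if (AArch64_AM::isLogicalImmediate(Imm, /*regSize=*/64)) {
    uint64_t Enc = AArch64_AM::encodeLogicalImmediate(Imm, 64);
    (void)Enc; // usable as the immediate operand of an ANDXri/ORRXri-style instruction
  }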
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
const uint64_t ReservedFPControlBits
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
Key
PAL metadata keys.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
Definition: CallingConv.h:265
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
Definition: CallingConv.h:221
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
Definition: CallingConv.h:224
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition: CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:241
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:238
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1
Preserve X1-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:271
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ PreserveNone
Used for runtime calls that preserve no general registers.
Definition: CallingConv.h:90
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
Definition: CallingConv.h:159
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
Definition: CallingConv.h:255
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
Definition: CallingConv.h:260
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1197
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1193
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:491
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
Definition: ISDOpcodes.h:1417
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
Definition: ISDOpcodes.h:1360
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1450
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:574
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1226
@ ConstantFP
Definition: ISDOpcodes.h:77
@ STRICT_FATAN2
Definition: ISDOpcodes.h:428
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1312
@ STRICT_FCEIL
Definition: ISDOpcodes.h:441
@ STRICT_FTANH
Definition: ISDOpcodes.h:431
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1102
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1092
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ FATAN2
FATAN2 - atan2, inspired by libm.
Definition: ISDOpcodes.h:999
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:451
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:558
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1435
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
Definition: ISDOpcodes.h:1439
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
Definition: ISDOpcodes.h:1096
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1449
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:492
@ STRICT_FLOG2
Definition: ISDOpcodes.h:436
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1338
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1339
@ INIT_TRAMPOLINE
INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic.
Definition: ISDOpcodes.h:1270
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:997
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:418
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1494
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:936
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ STRICT_FASIN
Definition: ISDOpcodes.h:425
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:685
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:465
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:635
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:107
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1432
@ STRICT_FATAN
Definition: ISDOpcodes.h:427
@ WRITE_REGISTER
Definition: ISDOpcodes.h:125
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1292
@ TRUNCATE_SSAT_U
Definition: ISDOpcodes.h:834
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1436
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1059
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:788
@ STRICT_LROUND
Definition: ISDOpcodes.h:446
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1148
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1123
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1127
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the same...
Definition: ISDOpcodes.h:601
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition: ISDOpcodes.h:661
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:515
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ STRICT_FPOWI
Definition: ISDOpcodes.h:420
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1308
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1451
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:642
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1222
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:445
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1444
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:931
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1087
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ ATOMIC_LOAD_CLR
Definition: ISDOpcodes.h:1337
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ PtrAuthGlobalAddress
A ptrauth constant.
Definition: ISDOpcodes.h:90
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1336
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1044
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:450
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:439
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:124
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1282
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:907
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:440
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:772
@ STRICT_FSINH
Definition: ISDOpcodes.h:429
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
Definition: ISDOpcodes.h:1407
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1319
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1286
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1112
@ STRICT_LRINT
Definition: ISDOpcodes.h:448
@ ConstantPool
Definition: ISDOpcodes.h:82
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:849
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition: ISDOpcodes.h:606
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ STRICT_FROUND
Definition: ISDOpcodes.h:443
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:766
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:464
@ MGATHER
Masked gather and scatter - load and store operations for a vector of random addresses with additiona...
Definition: ISDOpcodes.h:1372
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1452
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:442
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:444
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:100
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1334
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:458
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:480
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:457
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1335
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1253
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:164
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:485
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1279
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ STRICT_FCOSH
Definition: ISDOpcodes.h:430
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:680
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:407
@ STRICT_FLOG10
Definition: ISDOpcodes.h:435
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:539
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_LLRINT
Definition: ISDOpcodes.h:449
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition: ISDOpcodes.h:627
@ STRICT_FEXP2
Definition: ISDOpcodes.h:433
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1333
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1004
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition: ISDOpcodes.h:669
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:112
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ STRICT_LLROUND
Definition: ISDOpcodes.h:447
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:882
@ EXPERIMENTAL_VECTOR_HISTOGRAM
Definition: ISDOpcodes.h:1481
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:438
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:906
@ VECREDUCE_FMINIMUM
Definition: ISDOpcodes.h:1440
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1217
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1141
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:794
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:437
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the sa...
Definition: ISDOpcodes.h:595
@ TRUNCATE_SSAT_S
TRUNCATE_[SU]SAT_[SU] - Truncate for saturated operand [SU] located in middle, prefix for SAT means i...
Definition: ISDOpcodes.h:832
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition: ISDOpcodes.h:692
@ ADJUST_TRAMPOLINE
ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
Definition: ISDOpcodes.h:1276
@ TRUNCATE_USAT_U
Definition: ISDOpcodes.h:836
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:320
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ STRICT_FACOS
Definition: ISDOpcodes.h:426
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:530
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1681
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
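A minimal sketch of the two condition-code helpers above; the type argument matters because inversion differs for floating-point comparisons:
  ISD::CondCode CC = ISD::SETLT;
  ISD::CondCode Inv = ISD::getSetCCInverse(CC, MVT::i32);  // !(X < Y) -> SETGE
  ISD::CondCode Swap = ISD::getSetCCSwappedOperands(CC);   // (Y op X) -> SETGT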
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
Definition: ISDOpcodes.h:1572
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1559
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1610
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1590
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1561
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:731
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
CastInst_match< OpTy, UIToFPInst > m_UIToFP(const OpTy &Op)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
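A minimal sketch combining the matchers above to spot a widening multiply, assuming V is an existing llvm::Value* taken from the IR being visited:
  using namespace llvm::PatternMatch;
  Value *X, *Y;
  if (match(V, m_c_Mul(m_SExt(m_Value(X)), m_SExt(m_Value(Y))))) {
    // X and Y are the narrow operands of mul (sext X), (sext Y), in either order.
  }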
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
@ Define
Register definition.
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition: ObjCARCUtil.h:43
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition: ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool isPackedVectorType(EVT SomeVT)
Definition: VECustomDAG.cpp:22
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition: STLExtras.h:854
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1759
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
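A minimal sketch of the range-based wrappers indexed here (find above and all_of), on a hypothetical container:
  llvm::SmallVector<int, 4> Ops = {2, 4, 6, 8};
  bool AllEven = llvm::all_of(Ops, [](int V) { return V % 2 == 0; }); // true
  auto It = llvm::find(Ops, 6);                                       // iterator to 6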
void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:294
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:256
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:360
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResult)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> or <1,...
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
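A minimal sketch; the expectation that 8 elements maps to the VL8 pattern reflects my reading of the helper, not a guarantee:
  if (std::optional<unsigned> Pat = getSVEPredPatternFromNumElements(8)) {
    // *Pat is expected to correspond to the VL8 predicate pattern here.
  }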
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:297
bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1547
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:347
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Preserve_None(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:286
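A minimal sketch of the bit-manipulation predicates above on one constant:
  uint64_t Imm = 0x000000000000FF00ULL;
  bool Pow2 = llvm::isPowerOf2_64(Imm);       // false: more than one bit set
  bool Shifted = llvm::isShiftedMask_64(Imm); // true: one contiguous run of ones
  int TZ = llvm::countr_zero(Imm);            // 8 trailing zero bits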
unsigned M1(unsigned Val)
Definition: VE.h:376
bool isReleaseOrStronger(AtomicOrdering AO)
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:341
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:292
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:274
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
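A minimal sketch, assuming EltSize and BlockSize are both expressed in bits as the parameter names suggest:
  llvm::SmallVector<int, 8> Mask = {7, 6, 5, 4, 3, 2, 1, 0};
  bool IsRev64 = isREVMask(Mask, /*EltSize=*/8, /*NumElts=*/8, /*BlockSize=*/64);
  // Reversing bytes within a 64-bit block corresponds to REV64.8B.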
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
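A minimal sketch using the zip1 example mask quoted in the description above:
  unsigned WhichResult;
  llvm::SmallVector<int, 8> Mask = {0, 8, 1, 9, 2, 10, 3, 11};
  bool IsZip = isZIPMask(Mask, /*NumElts=*/8, WhichResult);
  // On success WhichResult should be 0, i.e. zip1 rather than zip2.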
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:261
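A minimal sketch of the signed/unsigned width checks, with immediates typical of AArch64 addressing modes:
  bool FitsU12 = llvm::isUIntN(12, 4095); // true: 4095 fits in 12 unsigned bits
  bool FitsS9  = llvm::isIntN(9, -256);   // true: -256 fits in 9 signed bits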
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
gep_type_iterator gep_type_begin(const User *GEP)
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2099
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for a VL1 to VL256 predicate pattern, or zero for all other patterns.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
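A minimal sketch of the two alignment helpers indexed here (alignTo above and commonAlignment):
  uint64_t Padded = llvm::alignTo(/*Size=*/10, llvm::Align(8));          // 16
  llvm::Align A = llvm::commonAlignment(llvm::Align(16), /*Offset=*/8);  // Align(8)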
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2087
static const MachineMemOperand::Flags MOStridedAccess
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Default
The result values are uniform if and only if all operands are uniform.
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
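A minimal sketch; the comment states the expected mask under my reading of the parameters:
  // {0, 1, 2, 3, -1, -1, -1, -1}: four sequential indices followed by four undefs.
  llvm::SmallVector<int, 16> Mask =
      llvm::createSequentialMask(/*Start=*/0, /*NumInts=*/4, /*NumUndefs=*/4);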
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition: MathExtras.h:220
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
@ Enable
Enable colors.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
Helper structure to be able to read SetCC information.
static unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:315
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition: ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
uint64_t getScalarStoreSize() const
Definition: ValueTypes.h:397
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:279
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:295
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:345
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:458
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:113
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:354
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:425
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:465
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:289
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:376
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition: ValueTypes.h:439
bool isScalableVT() const
Return true if the type is a scalable type.
Definition: ValueTypes.h:187
bool isFixedLengthVector() const
Definition: ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:287
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:212
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:251
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:102
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:320
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:202
Describes a register that needs to be forwarded from the prologue to a musttail call.
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:293
static KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
Definition: KnownBits.cpp:428
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:153
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:43
static KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
Definition: KnownBits.cpp:370
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:288
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:303
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition: KnownBits.h:127
static KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
Definition: KnownBits.cpp:285
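A minimal sketch of the KnownBits helpers above; with both inputs fully known the shift result is fully known too:
  KnownBits LHS = KnownBits::makeConstant(APInt(32, 0xF0));
  KnownBits Amt = KnownBits::makeConstant(APInt(32, 4));
  KnownBits Res = KnownBits::lshr(LHS, Amt);  // all 32 bits known, value 0x0F
  unsigned Width = Res.getBitWidth();         // 32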
Structure used to represent pair of argument number after call lowering and register used to transfer...
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
Definition: PredicateInfo.h:74
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64