LLVM 19.0.0git
AArch64ISelLowering.cpp
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in the future once both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// XOR, OR and CMP all use ALU ports, and the data dependency becomes the
143// bottleneck after this transform on high-end CPUs. This maximum leaf-node
144// limit guards that the cmp+ccmp transform remains profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148// By turning this on, we will not fall back to DAG ISel when encountering
149// scalable vector types, even though SVE is not yet supported for some
150// instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157/// Value type used for condition codes.
158static const MVT MVT_CC = MVT::i32;
159
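// Per AAPCS64, the first eight general-purpose integer arguments are passed in
// X0-X7 and the first eight floating-point/SIMD arguments in Q0-Q7; these
// arrays mirror that assignment.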
160static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
161 AArch64::X3, AArch64::X4, AArch64::X5,
162 AArch64::X6, AArch64::X7};
163static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
164 AArch64::Q3, AArch64::Q4, AArch64::Q5,
165 AArch64::Q6, AArch64::Q7};
166
168
170
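/// Map a scalar element type to the scalable vector type that packs a full
/// 128-bit SVE register granule, e.g. i16 -> nxv8i16 and f32 -> nxv4f32.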
171static inline EVT getPackedSVEVectorVT(EVT VT) {
172 switch (VT.getSimpleVT().SimpleTy) {
173 default:
174 llvm_unreachable("unexpected element type for vector");
175 case MVT::i8:
176 return MVT::nxv16i8;
177 case MVT::i16:
178 return MVT::nxv8i16;
179 case MVT::i32:
180 return MVT::nxv4i32;
181 case MVT::i64:
182 return MVT::nxv2i64;
183 case MVT::f16:
184 return MVT::nxv8f16;
185 case MVT::f32:
186 return MVT::nxv4f32;
187 case MVT::f64:
188 return MVT::nxv2f64;
189 case MVT::bf16:
190 return MVT::nxv8bf16;
191 }
192}
193
194// NOTE: Currently there's only a need to return integer vector types. If this
195// changes then just add an extra "type" parameter.
196static inline MVT getPackedSVEVectorVT(ElementCount EC) {
197 switch (EC.getKnownMinValue()) {
198 default:
199 llvm_unreachable("unexpected element count for vector");
200 case 16:
201 return MVT::nxv16i8;
202 case 8:
203 return MVT::nxv8i16;
204 case 4:
205 return MVT::nxv4i32;
206 case 2:
207 return MVT::nxv2i64;
208 }
209}
210
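/// Promote a scalable predicate type (nxvNi1) to the integer vector type with
/// the same element count, e.g. nxv4i1 -> nxv4i32 and nxv16i1 -> nxv16i8.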
211static inline EVT getPromotedVTForPredicate(EVT VT) {
212 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
213 "Expected scalable predicate vector type!");
214 switch (VT.getVectorMinNumElements()) {
215 default:
216 llvm_unreachable("unexpected element count for vector");
217 case 2:
218 return MVT::nxv2i64;
219 case 4:
220 return MVT::nxv4i32;
221 case 8:
222 return MVT::nxv8i16;
223 case 16:
224 return MVT::nxv16i8;
225 }
226}
227
228/// Returns true if VT's elements occupy the lowest bit positions of its
229/// associated register class without any intervening space.
230///
231/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
232/// same register class, but only nxv8f16 can be treated as a packed vector.
233static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
234 assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
235 "Expected legal vector type!");
236 return VT.isFixedLengthVector() ||
237 VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock;
238}
239
240// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
241// predicate and end with a passthru value matching the result type.
242static bool isMergePassthruOpcode(unsigned Opc) {
243 switch (Opc) {
244 default:
245 return false;
275 return true;
276 }
277}
278
279// Returns true if inactive lanes are known to be zeroed by construction.
280static bool isZeroingInactiveLanes(SDValue Op) {
281 switch (Op.getOpcode()) {
282 default:
283 return false;
284 // We guarantee i1 splat_vectors to zero the other lanes
288 return true;
289 case ISD::INTRINSIC_WO_CHAIN:
290 switch (Op.getConstantOperandVal(0)) {
291 default:
292 return false;
293 case Intrinsic::aarch64_sve_ptrue:
294 case Intrinsic::aarch64_sve_pnext:
295 case Intrinsic::aarch64_sve_cmpeq:
296 case Intrinsic::aarch64_sve_cmpne:
297 case Intrinsic::aarch64_sve_cmpge:
298 case Intrinsic::aarch64_sve_cmpgt:
299 case Intrinsic::aarch64_sve_cmphs:
300 case Intrinsic::aarch64_sve_cmphi:
301 case Intrinsic::aarch64_sve_cmpeq_wide:
302 case Intrinsic::aarch64_sve_cmpne_wide:
303 case Intrinsic::aarch64_sve_cmpge_wide:
304 case Intrinsic::aarch64_sve_cmpgt_wide:
305 case Intrinsic::aarch64_sve_cmplt_wide:
306 case Intrinsic::aarch64_sve_cmple_wide:
307 case Intrinsic::aarch64_sve_cmphs_wide:
308 case Intrinsic::aarch64_sve_cmphi_wide:
309 case Intrinsic::aarch64_sve_cmplo_wide:
310 case Intrinsic::aarch64_sve_cmpls_wide:
311 case Intrinsic::aarch64_sve_fcmpeq:
312 case Intrinsic::aarch64_sve_fcmpne:
313 case Intrinsic::aarch64_sve_fcmpge:
314 case Intrinsic::aarch64_sve_fcmpgt:
315 case Intrinsic::aarch64_sve_fcmpuo:
316 case Intrinsic::aarch64_sve_facgt:
317 case Intrinsic::aarch64_sve_facge:
318 case Intrinsic::aarch64_sve_whilege:
319 case Intrinsic::aarch64_sve_whilegt:
320 case Intrinsic::aarch64_sve_whilehi:
321 case Intrinsic::aarch64_sve_whilehs:
322 case Intrinsic::aarch64_sve_whilele:
323 case Intrinsic::aarch64_sve_whilelo:
324 case Intrinsic::aarch64_sve_whilels:
325 case Intrinsic::aarch64_sve_whilelt:
326 case Intrinsic::aarch64_sve_match:
327 case Intrinsic::aarch64_sve_nmatch:
328 case Intrinsic::aarch64_sve_whilege_x2:
329 case Intrinsic::aarch64_sve_whilegt_x2:
330 case Intrinsic::aarch64_sve_whilehi_x2:
331 case Intrinsic::aarch64_sve_whilehs_x2:
332 case Intrinsic::aarch64_sve_whilele_x2:
333 case Intrinsic::aarch64_sve_whilelo_x2:
334 case Intrinsic::aarch64_sve_whilels_x2:
335 case Intrinsic::aarch64_sve_whilelt_x2:
336 return true;
337 }
338 }
339}
340
341static std::tuple<SDValue, SDValue>
342extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG) {
343 SDLoc DL(Disc);
344 SDValue AddrDisc;
345 SDValue ConstDisc;
346
347 // If this is a blend, remember the constant and address discriminators.
348 // Otherwise, it's either a constant discriminator, or a non-blended
349 // address discriminator.
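  // e.g. ptrauth.blend(%addr, 1234) gives AddrDisc = %addr and ConstDisc = 1234,
  // while a plain constant 1234 leaves AddrDisc empty.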
350 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
351 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
352 AddrDisc = Disc->getOperand(1);
353 ConstDisc = Disc->getOperand(2);
354 } else {
355 ConstDisc = Disc;
356 }
357
358 // If the constant discriminator (either the blend RHS, or the entire
359 // discriminator value) isn't a 16-bit constant, bail out, and let the
360 // discriminator be computed separately.
361 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
362 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
363 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
364
365 // If there's no address discriminator, use NoRegister, which we'll later
366 // replace with XZR, or directly use a Z variant of the inst. when available.
367 if (!AddrDisc)
368 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
369
370 return std::make_tuple(
371 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
372 AddrDisc);
373}
374
375AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
376 const AArch64Subtarget &STI)
377 : TargetLowering(TM), Subtarget(&STI) {
378 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
379 // we have to make something up. Arbitrarily, choose ZeroOrOne.
381 // When comparing vectors the result sets the different elements in the
382 // vector to all-one or all-zero.
384
385 // Set up the register classes.
386 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
387 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
388
389 if (Subtarget->hasLS64()) {
390 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
391 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
393 }
394
395 if (Subtarget->hasFPARMv8()) {
396 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
397 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
398 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
399 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
400 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
401 }
402
403 if (Subtarget->hasNEON()) {
404 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
405 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
406
407 addDRType(MVT::v2f32);
408 addDRType(MVT::v8i8);
409 addDRType(MVT::v4i16);
410 addDRType(MVT::v2i32);
411 addDRType(MVT::v1i64);
412 addDRType(MVT::v1f64);
413 addDRType(MVT::v4f16);
414 addDRType(MVT::v4bf16);
415
416 addQRType(MVT::v4f32);
417 addQRType(MVT::v2f64);
418 addQRType(MVT::v16i8);
419 addQRType(MVT::v8i16);
420 addQRType(MVT::v4i32);
421 addQRType(MVT::v2i64);
422 addQRType(MVT::v8f16);
423 addQRType(MVT::v8bf16);
424 }
425
426 if (Subtarget->isSVEorStreamingSVEAvailable()) {
427 // Add legal sve predicate types
428 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
429 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
430 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
431 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
432 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
433
434 // Add legal sve data types
435 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
436 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
437 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
438 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
439
440 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
441 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
442 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
443 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
444 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
445 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
446
447 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
448 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
449 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
450
451 if (Subtarget->useSVEForFixedLengthVectors()) {
454 addRegisterClass(VT, &AArch64::ZPRRegClass);
455
458 addRegisterClass(VT, &AArch64::ZPRRegClass);
459 }
460 }
461
462 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
463 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
464 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
465 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
466
467 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
468 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
469 }
470
471 // Compute derived properties from the register classes
473
474 // Provide all sorts of operation actions
513
515
519
523
525
526 // Custom lowering hooks are needed for XOR
527 // to fold it into CSINC/CSINV.
530
531 // Virtually no operation on f128 is legal, but LLVM can't expand them when
532 // there's a valid register class, so we need custom operations in most cases.
557 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
558 // aren't handled.
559
560 // Lowering for many of the conversions is actually specified by the non-f128
561 // type. The LowerXXX function will be trivial when f128 isn't involved.
586 if (Subtarget->hasFPARMv8()) {
589 }
592 if (Subtarget->hasFPARMv8()) {
595 }
598
603
604 // Variable arguments.
609
610 // Variable-sized objects.
613
614 // Lowering Funnel Shifts to EXTR
619
621
622 // Constant pool entries
624
625 // BlockAddress
627
628 // AArch64 lacks both left-rotate and popcount instructions.
634 }
635
636 // AArch64 doesn't have i32 MULH{S|U}.
639
640 // AArch64 doesn't have {U|S}MUL_LOHI.
645
646 if (Subtarget->hasCSSC()) {
650
652
656
659
664
669 } else {
673
676
679 }
680
686 }
693
694 // Custom lower Add/Sub/Mul with overflow.
707
716
725 if (Subtarget->hasFullFP16()) {
728 } else {
731 }
732
733 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
741 setOperationAction(Op, MVT::f16, Promote);
742 setOperationAction(Op, MVT::v4f16, Expand);
743 setOperationAction(Op, MVT::v8f16, Expand);
744 setOperationAction(Op, MVT::bf16, Promote);
745 setOperationAction(Op, MVT::v4bf16, Expand);
746 setOperationAction(Op, MVT::v8bf16, Expand);
747 }
748
749 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
750 for (auto Op : {
754 ISD::FADD,
755 ISD::FSUB,
756 ISD::FMUL,
757 ISD::FDIV,
758 ISD::FMA,
788 })
789 setOperationAction(Op, ScalarVT, Promote);
790
791 for (auto Op : {ISD::FNEG, ISD::FABS})
792 setOperationAction(Op, ScalarVT, Legal);
793
794 // Round-to-integer needs custom lowering for fp16, as Promote doesn't work
795 // because the result type is integer.
799 setOperationAction(Op, ScalarVT, Custom);
800
801 // promote v4f16 to v4f32 when that is known to be safe.
802 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
803 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
804 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
805 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
806 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
807 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
808 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
809 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
810 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
811 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
812 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
813 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
814
824
825 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
847 };
848
849 if (!Subtarget->hasFullFP16()) {
850 LegalizeNarrowFP(MVT::f16);
851 }
852 LegalizeNarrowFP(MVT::bf16);
855
856 // AArch64 has implementations of a lot of rounding-like FP operations.
857 for (auto Op :
868 for (MVT Ty : {MVT::f32, MVT::f64})
870 if (Subtarget->hasFullFP16())
871 setOperationAction(Op, MVT::f16, Legal);
872 }
873
874 // Basic strict FP operations are legal
877 for (MVT Ty : {MVT::f32, MVT::f64})
879 if (Subtarget->hasFullFP16())
880 setOperationAction(Op, MVT::f16, Legal);
881 }
882
883 // Strict conversion to a larger type is legal
884 for (auto VT : {MVT::f32, MVT::f64})
886
888
894
896 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
899 } else {
902 }
905
906 // Generate outline atomics library calls only if LSE was not specified for
907 // the subtarget.
908 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
934#define LCALLNAMES(A, B, N) \
935 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
936 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
937 setLibcallName(A##N##_REL, #B #N "_rel"); \
938 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
939#define LCALLNAME4(A, B) \
940 LCALLNAMES(A, B, 1) \
941 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
942#define LCALLNAME5(A, B) \
943 LCALLNAMES(A, B, 1) \
944 LCALLNAMES(A, B, 2) \
945 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
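  // e.g. LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp) registers the
  // libcall names __aarch64_swp1_relax through __aarch64_swp8_acq_rel
  // (operand sizes 1, 2, 4 and 8 bytes, each in the four memory orderings).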
946 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
947 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
948 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
949 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
950 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
951 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
952#undef LCALLNAMES
953#undef LCALLNAME4
954#undef LCALLNAME5
955 }
956
957 if (Subtarget->hasLSE128()) {
958 // Custom lowering because i128 is not legal. Must be replaced by 2x64
959 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
963 }
964
965 // 128-bit loads and stores can be done without expanding
968
969 // Aligned 128-bit loads and stores are single-copy atomic according to the
970 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
971 if (Subtarget->hasLSE2()) {
974 }
975
976 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
977 // custom lowering, as there are no un-paired non-temporal stores and
978 // legalization will break up 256 bit inputs.
980 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
981 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
982 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
987
988 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
989 // custom lowering, as there are no un-paired non-temporal loads and
990 // legalization will break up 256 bit inputs.
991 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
992 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
993 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
994 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
995 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
996 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
997 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
998 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
999
1000 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1002
1003 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1004 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1005 // Issue __sincos_stret if available.
1008 } else {
1011 }
1012
1013 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
1014 // MSVCRT doesn't have powi; fall back to pow
1015 setLibcallName(RTLIB::POWI_F32, nullptr);
1016 setLibcallName(RTLIB::POWI_F64, nullptr);
1017 }
1018
1019 // Make floating-point constants legal for the large code model, so they don't
1020 // become loads from the constant pool.
1021 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1024 }
1025
1026 // AArch64 does not have floating-point extending loads, i1 sign-extending
1027 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
1028 for (MVT VT : MVT::fp_valuetypes()) {
1029 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1030 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1031 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1032 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1033 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1034 }
1035 for (MVT VT : MVT::integer_valuetypes())
1036 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1037
1038 for (MVT WideVT : MVT::fp_valuetypes()) {
1039 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1040 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1041 setTruncStoreAction(WideVT, NarrowVT, Expand);
1042 }
1043 }
1044 }
1045
1046 if (Subtarget->hasFPARMv8()) {
1050 }
1051
1052 // Indexed loads and stores are supported.
1053 for (unsigned im = (unsigned)ISD::PRE_INC;
1055 setIndexedLoadAction(im, MVT::i8, Legal);
1056 setIndexedLoadAction(im, MVT::i16, Legal);
1057 setIndexedLoadAction(im, MVT::i32, Legal);
1058 setIndexedLoadAction(im, MVT::i64, Legal);
1059 setIndexedLoadAction(im, MVT::f64, Legal);
1060 setIndexedLoadAction(im, MVT::f32, Legal);
1061 setIndexedLoadAction(im, MVT::f16, Legal);
1062 setIndexedLoadAction(im, MVT::bf16, Legal);
1063 setIndexedStoreAction(im, MVT::i8, Legal);
1064 setIndexedStoreAction(im, MVT::i16, Legal);
1065 setIndexedStoreAction(im, MVT::i32, Legal);
1066 setIndexedStoreAction(im, MVT::i64, Legal);
1067 setIndexedStoreAction(im, MVT::f64, Legal);
1068 setIndexedStoreAction(im, MVT::f32, Legal);
1069 setIndexedStoreAction(im, MVT::f16, Legal);
1070 setIndexedStoreAction(im, MVT::bf16, Legal);
1071 }
1072
1073 // Trap.
1074 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1077
1078 // We combine OR nodes for bitfield operations.
1080 // Try to create BICs for vector ANDs.
1082
1083 // Vector add and sub nodes may conceal a high-half opportunity.
1084 // Also, try to fold ADD into CSINC/CSINV..
1087
1090
1091 // Try and combine setcc with csel
1093
1095
1102
1104
1106
1108
1112
1114
1116
1118
1120
1124
1126
1127 // In case of strict alignment, avoid an excessive number of byte wide stores.
1130 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1131
1135 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1136
1139
1142 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1143
1145
1147
1148 EnableExtLdPromotion = true;
1149
1150 // Set required alignment.
1152 // Set preferred alignments.
1153
1154 // Don't align loops on Windows. The SEH unwind info generation needs to
1155 // know the exact length of functions before the alignments have been
1156 // expanded.
1157 if (!Subtarget->isTargetWindows())
1161
1162 // Only change the limit for entries in a jump table if specified by
1163 // the subtarget, but not at the command line.
1164 unsigned MaxJT = STI.getMaximumJumpTableSize();
1165 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1167
1169
1171
1173
1174 if (Subtarget->isNeonAvailable()) {
1175 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1176 // silliness like this:
1177 // clang-format off
1178 for (auto Op :
1196 setOperationAction(Op, MVT::v1f64, Expand);
1197 // clang-format on
1198 for (auto Op :
1203 setOperationAction(Op, MVT::v1i64, Expand);
1204
1205 // AArch64 doesn't have direct vector->f32 conversion instructions for
1206 // elements smaller than i32, so promote the input to i32 first.
1207 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1208 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1209
1210 // Similarly, there is no direct i32 -> f64 vector conversion instruction,
1211 // nor a direct i32 -> f16 vector conversion. Set these to Custom so the
1212 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1215 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1217
1218 if (Subtarget->hasFullFP16()) {
1221
1230 } else {
1231 // When AArch64 doesn't have full fp16 support, promote the input
1232 // to i32 first.
1233 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1234 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1235 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1236 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1237 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1238 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1239 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1240 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1241 }
1242
1243 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1244 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1251 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1256 }
1257
1258 // Custom handling for some quad-vector types to detect MULL.
1259 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1260 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1261 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1262 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1263 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1264 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1265
1266 // Saturates
1267 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1268 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1273 }
1274
1275 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1276 MVT::v4i32}) {
1283 }
1284
1285 // Vector reductions
1286 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1287 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1288 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1293
1295 }
1296 }
1297 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1298 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1307 }
1312
1314 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1315 // Likewise, narrowing and extending vector loads/stores aren't handled
1316 // directly.
1319
1320 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1323 } else {
1326 }
1329
1332
1333 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1334 setTruncStoreAction(VT, InnerVT, Expand);
1335 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1336 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1337 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1338 }
1339 }
1340
1341 // AArch64 has implementations of a lot of rounding-like FP operations.
1342 for (auto Op :
1347 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1349 if (Subtarget->hasFullFP16())
1350 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1352 }
1353
1354 // LRINT and LLRINT.
1355 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1356 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1358 if (Subtarget->hasFullFP16())
1359 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1361 }
1362
1363 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1364
1369
1373
1374 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1375 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1376 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1377 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1378 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1379 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1380
1381 // ADDP custom lowering
1382 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1384 // FADDP custom lowering
1385 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1387 } else /* !isNeonAvailable */ {
1389 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1391
1392 if (VT.is128BitVector() || VT.is64BitVector()) {
1396 Subtarget->isLittleEndian() ? Legal : Expand);
1397 }
1398 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1399 setTruncStoreAction(VT, InnerVT, Expand);
1400 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1401 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1402 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1403 }
1404 }
1405 }
1406
1407 if (Subtarget->hasSME()) {
1409 }
1410
1411 // FIXME: Move lowering for more nodes here if those are common between
1412 // SVE and SME.
1413 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1414 for (auto VT :
1415 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1420 }
1421 }
1422
1423 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1424 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1465
1471
1480
1485
1486 if (!Subtarget->isLittleEndian())
1488
1489 if (Subtarget->hasSVE2() ||
1490 (Subtarget->hasSME() && Subtarget->isStreaming()))
1491 // For SLI/SRI.
1493 }
1494
1495 // Illegal unpacked integer vector types.
1496 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1499 }
1500
1501 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1502 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1503 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1505
1506 for (auto VT :
1507 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1508 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1510
1511 for (auto VT :
1512 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1520
1524
1525 // There are no legal MVT::nxv16f## based types.
1526 if (VT != MVT::nxv16i1) {
1529 }
1530 }
1531
1532 // NEON doesn't support masked loads/stores, but SME and SVE do.
1533 for (auto VT :
1534 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1535 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1536 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1539 }
1540
1541 // First, exclude all scalable-vector extending loads and truncating stores,
1542 // covering both integer and floating-point scalable vectors.
1544 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1545 setTruncStoreAction(VT, InnerVT, Expand);
1546 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1547 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1548 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1549 }
1550 }
1551
1552 // Then, selectively enable those which we directly support.
1553 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1554 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1555 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1556 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1557 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1558 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1559 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1560 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1561 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1562 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1563 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1564 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1565 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1566 }
1567
1568 // SVE supports truncating stores of 64 and 128-bit vectors
1569 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1570 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1571 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1572 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1573 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1574
1575 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1576 MVT::nxv4f32, MVT::nxv2f64}) {
1615
1630
1642
1643 if (!Subtarget->isLittleEndian())
1645 }
1646
1647 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1653
1654 if (!Subtarget->isLittleEndian())
1656 }
1657
1660
1661 // NEON doesn't support integer divides, but SVE does
1662 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1663 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1666 }
1667
1668 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1669 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1670 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1671
1672 // NOTE: Currently this has to happen after computeRegisterProperties rather
1673 // than the preferred option of combining it with the addRegisterClass call.
1674 if (Subtarget->useSVEForFixedLengthVectors()) {
1677 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1678 addTypeForFixedLengthSVE(VT);
1679 }
1682 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1683 addTypeForFixedLengthSVE(VT);
1684 }
1685
1686 // 64bit results can mean a bigger than NEON input.
1687 for (auto VT : {MVT::v8i8, MVT::v4i16})
1690
1691 // 128bit results imply a bigger than NEON input.
1692 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1694 for (auto VT : {MVT::v8f16, MVT::v4f32})
1696
1697 // These operations are not supported on NEON but SVE can do them.
1699 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1700 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1701 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1702 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1703 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1704 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1705 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1706 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1707 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1708 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1709 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1710 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1711 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1712 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1713 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1718
1719 // Int operations with no NEON support.
1720 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1721 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1729 }
1730
1731 // Use SVE for vectors with more than 2 elements.
1732 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1734 }
1735
1736 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1737 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1738 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1739 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1740
1742
1743 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1745 }
1746
1747 // Handle operations that are only available in non-streaming SVE mode.
1748 if (Subtarget->isSVEAvailable()) {
1749 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1750 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1751 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1752 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1753 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1754 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1755 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1758 }
1759
1760 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1761 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1762 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1764
1765 // Histcnt is SVE2 only
1766 if (Subtarget->hasSVE2())
1768 Custom);
1769 }
1770
1771
1772 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1773 // Only required for llvm.aarch64.mops.memset.tag
1775 }
1776
1778
1779 if (Subtarget->hasSVE()) {
1784 }
1785
1786 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1787
1788 IsStrictFPEnabled = true;
1790
1791 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1792 // it, but it's just a wrapper around ldexp.
1793 if (Subtarget->isTargetWindows()) {
1795 if (isOperationExpand(Op, MVT::f32))
1796 setOperationAction(Op, MVT::f32, Promote);
1797 }
1798
1799 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1800 // isn't legal.
1802 if (isOperationExpand(Op, MVT::f16))
1803 setOperationAction(Op, MVT::f16, Promote);
1804
1805 if (Subtarget->isWindowsArm64EC()) {
1806 // FIXME: are there intrinsics we need to exclude from this?
1807 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1808 auto code = static_cast<RTLIB::Libcall>(i);
1809 auto libcallName = getLibcallName(code);
1810 if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1811 setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1812 }
1813 }
1814 }
1815}
1816
1817void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1818 assert(VT.isVector() && "VT should be a vector type");
1819
1820 if (VT.isFloatingPoint()) {
1822 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1823 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1824 }
1825
1826 // Mark vector float intrinsics as expand.
1827 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1838 }
1839
1840 // But we do support custom-lowering for FCOPYSIGN.
1841 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1842 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1843 VT == MVT::v8f16) &&
1844 Subtarget->hasFullFP16()))
1846
1859
1863 for (MVT InnerVT : MVT::all_valuetypes())
1864 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1865
1866 // CNT supports only B element sizes, then use UADDLP to widen.
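  // e.g. a v4i32 CTPOP is lowered via CNT on the underlying bytes followed by
  // UADDLP steps that widen the per-byte counts back up to i32 lanes.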
1867 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1869
1875
1876 for (unsigned Opcode :
1879 setOperationAction(Opcode, VT, Custom);
1880
1881 if (!VT.isFloatingPoint())
1883
1884 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1885 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1886 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1887 setOperationAction(Opcode, VT, Legal);
1888
1889 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1890 // NEON types.
1891 if (VT.isFloatingPoint() &&
1892 VT.getVectorElementType() != MVT::bf16 &&
1893 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1894 for (unsigned Opcode :
1900 setOperationAction(Opcode, VT, Legal);
1901
1902 // Strict fp extend and trunc are legal
1903 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1905 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1907
1908 // FIXME: We could potentially make use of the vector comparison instructions
1910 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1910 // complications:
1911 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1912 // so we would need to expand when the condition code doesn't match the
1913 // kind of comparison.
1914 // * Some kinds of comparison require more than one FCMXY instruction so
1915 // would need to be expanded instead.
1916 // * The lowering of the non-strict versions involves target-specific ISD
1917 // nodes so we would likely need to add strict versions of all of them and
1918 // handle them appropriately.
1921
1922 if (Subtarget->isLittleEndian()) {
1923 for (unsigned im = (unsigned)ISD::PRE_INC;
1927 }
1928 }
1929
1930 if (Subtarget->hasD128()) {
1933 }
1934}
1935
1936bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
1937 EVT OpVT) const {
1938 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1939 if (!Subtarget->hasSVE())
1940 return true;
1941
1942 // We can only support legal predicate result types. We can use the SVE
1943 // whilelo instruction for generating fixed-width predicates too.
1944 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1945 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1946 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1947 return true;
1948
1949 // The whilelo instruction only works with i32 or i64 scalar inputs.
1950 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1951 return true;
1952
1953 return false;
1954}
1955
1957 if (!Subtarget->isSVEorStreamingSVEAvailable())
1958 return true;
1959
1960 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
1961 // also support fixed-width predicates.
1962 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
1963 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
1964 VT != MVT::v4i1 && VT != MVT::v2i1;
1965}
1966
1967void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1968 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1969
1970 // By default everything must be expanded.
1971 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1973
1974 if (VT.isFloatingPoint()) {
1984 }
1985
1986 TargetLoweringBase::LegalizeAction Default =
1987 VT == MVT::v1f64 ? Expand : Custom;
1988
1989 // Mark integer truncating stores/extending loads as having custom lowering
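  // e.g. for VT == v8i32 this loop covers v8i8 and v8i16, doubling the element
  // width until it reaches VT itself.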
1990 if (VT.isInteger()) {
1991 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1992 while (InnerVT != VT) {
1993 setTruncStoreAction(VT, InnerVT, Default);
1994 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
1995 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
1996 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
1997 InnerVT = InnerVT.changeVectorElementType(
1998 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1999 }
2000 }
2001
2002 // Mark floating-point truncating stores/extending loads as having custom
2003 // lowering
2004 if (VT.isFloatingPoint()) {
2005 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2006 while (InnerVT != VT) {
2007 setTruncStoreAction(VT, InnerVT, Custom);
2008 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2009 InnerVT = InnerVT.changeVectorElementType(
2010 MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
2011 }
2012 }
2013
2014 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2015 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2016
2017 // Lower fixed length vector operations to scalable equivalents.
2022 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2059 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2060 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2062 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2081 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2107}
2108
2109void AArch64TargetLowering::addDRType(MVT VT) {
2110 addRegisterClass(VT, &AArch64::FPR64RegClass);
2111 if (Subtarget->isNeonAvailable())
2112 addTypeForNEON(VT);
2113}
2114
2115void AArch64TargetLowering::addQRType(MVT VT) {
2116 addRegisterClass(VT, &AArch64::FPR128RegClass);
2117 if (Subtarget->isNeonAvailable())
2118 addTypeForNEON(VT);
2119}
2120
2122 LLVMContext &C, EVT VT) const {
2123 if (!VT.isVector())
2124 return MVT::i32;
2125 if (VT.isScalableVector())
2126 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2127 return VT.changeVectorElementTypeToInteger();
2128}
2129
2130// isIntImmediate - This method tests to see if the node is a constant
2131// operand. If so, Imm will receive the value.
2132static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2133 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2134 Imm = C->getZExtValue();
2135 return true;
2136 }
2137 return false;
2138}
2139
2140// isOpcWithIntImmediate - This method tests to see if the node is a specific
2141// opcode and that it has an immediate integer right operand.
2142// If so, Imm will receive the value.
2143static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2144 uint64_t &Imm) {
2145 return N->getOpcode() == Opc &&
2146 isIntImmediate(N->getOperand(1).getNode(), Imm);
2147}
2148
2149static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2150 const APInt &Demanded,
2152 unsigned NewOpc) {
2153 uint64_t OldImm = Imm, NewImm, Enc;
2154 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2155
2156 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2157 // bimm64.
2158 if (Imm == 0 || Imm == Mask ||
2160 return false;
2161
2162 unsigned EltSize = Size;
2163 uint64_t DemandedBits = Demanded.getZExtValue();
2164
2165 // Clear bits that are not demanded.
2166 Imm &= DemandedBits;
2167
2168 while (true) {
2169 // The goal here is to set the non-demanded bits in a way that minimizes
2170 // the number of switching between 0 and 1. In order to achieve this goal,
2171 // we set the non-demanded bits to the value of the preceding demanded bits.
2172 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2173 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2174 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2175 // The final result is 0b11000011.
2176 uint64_t NonDemandedBits = ~DemandedBits;
2177 uint64_t InvertedImm = ~Imm & DemandedBits;
2178 uint64_t RotatedImm =
2179 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2180 NonDemandedBits;
2181 uint64_t Sum = RotatedImm + NonDemandedBits;
2182 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2183 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2184 NewImm = (Imm | Ones) & Mask;
2185
2186 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2187 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2188 // we halve the element size and continue the search.
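  // (e.g. 0x0000FF00, a single contiguous run of ones, is a shifted mask.)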
2189 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2190 break;
2191
2192 // We cannot shrink the element size any further if it is 2-bits.
2193 if (EltSize == 2)
2194 return false;
2195
2196 EltSize /= 2;
2197 Mask >>= EltSize;
2198 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2199
2200 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2201 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2202 return false;
2203
2204 // Merge the upper and lower halves of Imm and DemandedBits.
2205 Imm |= Hi;
2206 DemandedBits |= DemandedBitsHi;
2207 }
2208
2209 ++NumOptimizedImms;
2210
2211 // Replicate the element across the register width.
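  // e.g. with Size == 32, an 8-bit element 0xC3 is replicated to 0xC3C3C3C3.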
2212 while (EltSize < Size) {
2213 NewImm |= NewImm << EltSize;
2214 EltSize *= 2;
2215 }
2216
2217 (void)OldImm;
2218 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2219 "demanded bits should never be altered");
2220 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2221
2222 // Create the new constant immediate node.
2223 EVT VT = Op.getValueType();
2224 SDLoc DL(Op);
2225 SDValue New;
2226
2227 // If the new constant immediate is all-zeros or all-ones, let the target
2228 // independent DAG combine optimize this node.
2229 if (NewImm == 0 || NewImm == OrigMask) {
2230 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2231 TLO.DAG.getConstant(NewImm, DL, VT));
2232 // Otherwise, create a machine node so that target independent DAG combine
2233 // doesn't undo this optimization.
2234 } else {
2236 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2237 New = SDValue(
2238 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2239 }
2240
2241 return TLO.CombineTo(Op, New);
2242}
2243
2245 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2246 TargetLoweringOpt &TLO) const {
2247 // Delay this optimization to as late as possible.
2248 if (!TLO.LegalOps)
2249 return false;
2250
2252 return false;
2253
2254 EVT VT = Op.getValueType();
2255 if (VT.isVector())
2256 return false;
2257
2258 unsigned Size = VT.getSizeInBits();
2259 assert((Size == 32 || Size == 64) &&
2260 "i32 or i64 is expected after legalization.");
2261
2262 // Exit early if we demand all bits.
2263 if (DemandedBits.popcount() == Size)
2264 return false;
2265
2266 unsigned NewOpc;
2267 switch (Op.getOpcode()) {
2268 default:
2269 return false;
2270 case ISD::AND:
2271 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2272 break;
2273 case ISD::OR:
2274 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2275 break;
2276 case ISD::XOR:
2277 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2278 break;
2279 }
2280 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2281 if (!C)
2282 return false;
2283 uint64_t Imm = C->getZExtValue();
2284 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2285}
2286
2287/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2288/// Mask are known to be either zero or one and return them in Known.
2290 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2291 const SelectionDAG &DAG, unsigned Depth) const {
2292 switch (Op.getOpcode()) {
2293 default:
2294 break;
2295 case AArch64ISD::DUP: {
2296 SDValue SrcOp = Op.getOperand(0);
2297 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2298 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2299 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2300 "Expected DUP implicit truncation");
2301 Known = Known.trunc(Op.getScalarValueSizeInBits());
2302 }
2303 break;
2304 }
2305 case AArch64ISD::CSEL: {
2306 KnownBits Known2;
2307 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2308 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2309 Known = Known.intersectWith(Known2);
2310 break;
2311 }
2312 case AArch64ISD::BICi: {
2313 // Compute the bit cleared value.
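    // BICi(Src, Imm, Shift) behaves like Src & ~(Imm << Shift); e.g. Imm = 0xFF
    // and Shift = 8 clear bits 15..8 of the known value.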
2314 uint64_t Mask =
2315 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
2316 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2317 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2318 break;
2319 }
2320 case AArch64ISD::VLSHR: {
2321 KnownBits Known2;
2322 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2323 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2324 Known = KnownBits::lshr(Known, Known2);
2325 break;
2326 }
2327 case AArch64ISD::VASHR: {
2328 KnownBits Known2;
2329 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2330 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2331 Known = KnownBits::ashr(Known, Known2);
2332 break;
2333 }
2334 case AArch64ISD::VSHL: {
2335 KnownBits Known2;
2336 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2337 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2338 Known = KnownBits::shl(Known, Known2);
2339 break;
2340 }
2341 case AArch64ISD::MOVI: {
2343 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2344 break;
2345 }
2347 case AArch64ISD::ADDlow: {
2348 if (!Subtarget->isTargetILP32())
2349 break;
2350 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2351 Known.Zero = APInt::getHighBitsSet(64, 32);
2352 break;
2353 }
2355 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2356 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2357 break;
2358 }
2360 Intrinsic::ID IntID =
2361 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2362 switch (IntID) {
2363 default: return;
2364 case Intrinsic::aarch64_ldaxr:
2365 case Intrinsic::aarch64_ldxr: {
2366 unsigned BitWidth = Known.getBitWidth();
2367 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2368 unsigned MemBits = VT.getScalarSizeInBits();
2369 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2370 return;
2371 }
2372 }
2373 break;
2374 }
2376 case ISD::INTRINSIC_VOID: {
2377 unsigned IntNo = Op.getConstantOperandVal(0);
2378 switch (IntNo) {
2379 default:
2380 break;
2381 case Intrinsic::aarch64_neon_uaddlv: {
2382 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2383 unsigned BitWidth = Known.getBitWidth();
2384 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2385 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
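        // A horizontal add of 8 (resp. 16) unsigned bytes is at most
        // 8 * 255 = 2040 (resp. 16 * 255 = 4080), which fits in 11 (resp. 12) bits.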
2386 assert(BitWidth >= Bound && "Unexpected width!");
2387 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - Bound);
2388 Known.Zero |= Mask;
2389 }
2390 break;
2391 }
2392 case Intrinsic::aarch64_neon_umaxv:
2393 case Intrinsic::aarch64_neon_uminv: {
2394 // Figure out the datatype of the vector operand. The UMINV instruction
2395 // will zero extend the result, so we can mark as known zero all the
2396 // bits larger than the element datatype. 32-bit or larger doesn't need
2397 // this as those are legal types and will be handled by isel directly.
2398 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2399 unsigned BitWidth = Known.getBitWidth();
2400 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2401 assert(BitWidth >= 8 && "Unexpected width!");
2402 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
2403 Known.Zero |= Mask;
2404 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2405 assert(BitWidth >= 16 && "Unexpected width!");
2406 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
2407 Known.Zero |= Mask;
2408 }
2409 break;
2410 } break;
2411 }
2412 }
2413 }
2414}
2415
2417 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2418 unsigned Depth) const {
2419 EVT VT = Op.getValueType();
2420 unsigned VTBits = VT.getScalarSizeInBits();
2421 unsigned Opcode = Op.getOpcode();
2422 switch (Opcode) {
2423 case AArch64ISD::CMEQ:
2424 case AArch64ISD::CMGE:
2425 case AArch64ISD::CMGT:
2426 case AArch64ISD::CMHI:
2427 case AArch64ISD::CMHS:
2428 case AArch64ISD::FCMEQ:
2429 case AArch64ISD::FCMGE:
2430 case AArch64ISD::FCMGT:
2431 case AArch64ISD::CMEQz:
2432 case AArch64ISD::CMGEz:
2433 case AArch64ISD::CMGTz:
2434 case AArch64ISD::CMLEz:
2435 case AArch64ISD::CMLTz:
2436 case AArch64ISD::FCMEQz:
2437 case AArch64ISD::FCMGEz:
2438 case AArch64ISD::FCMGTz:
2439 case AArch64ISD::FCMLEz:
2440 case AArch64ISD::FCMLTz:
2441 // Compares return either 0 or all-ones
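    // so every result bit is a copy of the sign bit and the whole width counts
    // as sign bits.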
2442 return VTBits;
2443 }
2444
2445 return 1;
2446}
2447
2449 EVT) const {
2450 return MVT::i64;
2451}
2452
2454 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2455 unsigned *Fast) const {
2456 if (Subtarget->requiresStrictAlign())
2457 return false;
2458
2459 if (Fast) {
2460 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2461 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2462 // See comments in performSTORECombine() for more details about
2463 // these conditions.
2464
2465 // Code that uses clang vector extensions can mark that it
2466 // wants unaligned accesses to be treated as fast by
2467 // underspecifying alignment to be 1 or 2.
2468 Alignment <= 2 ||
2469
2470 // Disregard v2i64. Memcpy lowering produces those and splitting
2471 // them regresses performance on micro-benchmarks and olden/bh.
2472 VT == MVT::v2i64;
2473 }
2474 return true;
2475}
2476
2477// Same as above but handling LLTs instead.
2479 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2480 unsigned *Fast) const {
2481 if (Subtarget->requiresStrictAlign())
2482 return false;
2483
2484 if (Fast) {
2485 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2486 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2487 Ty.getSizeInBytes() != 16 ||
2488 // See comments in performSTORECombine() for more details about
2489 // these conditions.
2490
2491 // Code that uses clang vector extensions can mark that it
2492 // wants unaligned accesses to be treated as fast by
2493 // underspecifying alignment to be 1 or 2.
2494 Alignment <= 2 ||
2495
2496 // Disregard v2i64. Memcpy lowering produces those and splitting
2497 // them regresses performance on micro-benchmarks and olden/bh.
2498 Ty == LLT::fixed_vector(2, 64);
2499 }
2500 return true;
2501}
2502
2503FastISel *
2505 const TargetLibraryInfo *libInfo) const {
2506 return AArch64::createFastISel(funcInfo, libInfo);
2507}
2508
2509const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2510#define MAKE_CASE(V) \
2511 case V: \
2512 return #V;
2513 switch ((AArch64ISD::NodeType)Opcode) {
2515 break;
2839 }
2840#undef MAKE_CASE
2841 return nullptr;
2842}
2843
2846 MachineBasicBlock *MBB) const {
2847 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2848 // phi node:
2849
2850 // OrigBB:
2851 // [... previous instrs leading to comparison ...]
2852 // b.ne TrueBB
2853 // b EndBB
2854 // TrueBB:
2855 // ; Fallthrough
2856 // EndBB:
2857 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2858
2859 MachineFunction *MF = MBB->getParent();
2860 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2861 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2862 DebugLoc DL = MI.getDebugLoc();
2864
2865 Register DestReg = MI.getOperand(0).getReg();
2866 Register IfTrueReg = MI.getOperand(1).getReg();
2867 Register IfFalseReg = MI.getOperand(2).getReg();
2868 unsigned CondCode = MI.getOperand(3).getImm();
2869 bool NZCVKilled = MI.getOperand(4).isKill();
2870
2871 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2872 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2873 MF->insert(It, TrueBB);
2874 MF->insert(It, EndBB);
2875
2876 // Transfer rest of current basic-block to EndBB
2877 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2878 MBB->end());
2880
2881 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2882 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2883 MBB->addSuccessor(TrueBB);
2884 MBB->addSuccessor(EndBB);
2885
2886 // TrueBB falls through to the end.
2887 TrueBB->addSuccessor(EndBB);
2888
2889 if (!NZCVKilled) {
2890 TrueBB->addLiveIn(AArch64::NZCV);
2891 EndBB->addLiveIn(AArch64::NZCV);
2892 }
2893
2894 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2895 .addReg(IfTrueReg)
2896 .addMBB(TrueBB)
2897 .addReg(IfFalseReg)
2898 .addMBB(MBB);
2899
2900 MI.eraseFromParent();
2901 return EndBB;
2902}
2903
2905 MachineInstr &MI, MachineBasicBlock *BB) const {
2906  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2907             BB->getParent()->getFunction().getPersonalityFn())) &&
2908 "SEH does not use catchret!");
2909 return BB;
2910}
2911
2914 MachineBasicBlock *MBB) const {
2915 MachineFunction &MF = *MBB->getParent();
2916 MachineBasicBlock::iterator MBBI = MI.getIterator();
2918 const AArch64InstrInfo &TII =
2919 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2920 Register TargetReg = MI.getOperand(0).getReg();
2921  MachineBasicBlock::iterator NextInst =
2922      TII.probedStackAlloc(MBBI, TargetReg, false);
2923
2924 MI.eraseFromParent();
2925 return NextInst->getParent();
2926}
2927
2929AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2931 MachineBasicBlock *BB) const {
2932 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2933 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2934
2935 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2936 MIB.add(MI.getOperand(1)); // slice index register
2937 MIB.add(MI.getOperand(2)); // slice index offset
2938 MIB.add(MI.getOperand(3)); // pg
2939 MIB.add(MI.getOperand(4)); // base
2940 MIB.add(MI.getOperand(5)); // offset
2941
2942 MI.eraseFromParent(); // The pseudo is gone now.
2943 return BB;
2944}
2945
2948 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2949  MachineInstrBuilder MIB =
2950      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2951
2952 MIB.addReg(AArch64::ZA, RegState::Define);
2953 MIB.add(MI.getOperand(0)); // Vector select register
2954 MIB.add(MI.getOperand(1)); // Vector select offset
2955 MIB.add(MI.getOperand(2)); // Base
2956 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2957
2958 MI.eraseFromParent(); // The pseudo is gone now.
2959 return BB;
2960}
2961
2964 unsigned Opcode,
2965 bool Op0IsDef) const {
2966 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2967  MachineInstrBuilder MIB;
2968
2969 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
2970 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
2971 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2972 MIB.add(MI.getOperand(I));
2973
2974 MI.eraseFromParent(); // The pseudo is gone now.
2975 return BB;
2976}
2977
2979AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2981 MachineBasicBlock *BB) const {
2982 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2983 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2984 unsigned StartIdx = 0;
2985
2986 bool HasTile = BaseReg != AArch64::ZA;
2987 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
2988 if (HasZPROut) {
2989 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
2990 ++StartIdx;
2991 }
2992 if (HasTile) {
2993 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
2994 RegState::Define); // Output ZA Tile
2995  MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input ZA Tile
2996 StartIdx++;
2997 } else {
2998  // Avoid instructions with a tile-slice operand of the form za.<sz>[Reg, Imm].
2999 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3000 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3001 ++StartIdx;
3002 }
3003 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3004 }
3005 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3006 MIB.add(MI.getOperand(I));
3007
3008 MI.eraseFromParent(); // The pseudo is gone now.
3009 return BB;
3010}
3011
3014 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3015  MachineInstrBuilder MIB =
3016      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3017 MIB.add(MI.getOperand(0)); // Mask
3018
3019 unsigned Mask = MI.getOperand(0).getImm();
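  // Each set bit I in the mask adds an implicit def of the 64-bit tile
  // ZAD0 + I below; e.g. a mask of 0b00100001 implicitly defines ZAD0 and ZAD5.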
3020 for (unsigned I = 0; I < 8; I++) {
3021 if (Mask & (1 << I))
3022 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3023 }
3024
3025 MI.eraseFromParent(); // The pseudo is gone now.
3026 return BB;
3027}
3028
3031 MachineBasicBlock *BB) const {
3032 MachineFunction *MF = BB->getParent();
3033 MachineFrameInfo &MFI = MF->getFrameInfo();
3034  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3035  TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3036 if (TPIDR2.Uses > 0) {
3037 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3038 // Store the buffer pointer to the TPIDR2 stack object.
3039 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
3040 .addReg(MI.getOperand(0).getReg())
3041 .addFrameIndex(TPIDR2.FrameIndex)
3042 .addImm(0);
3043 // Set the reserved bytes (10-15) to zero
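    // (The STRHHui store at halfword offset 5 covers bytes 10-11, and the
    // STRWui store at word offset 3 covers bytes 12-15 of the object.)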
3044 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
3045 .addReg(AArch64::WZR)
3046 .addFrameIndex(TPIDR2.FrameIndex)
3047 .addImm(5);
3048 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
3049 .addReg(AArch64::WZR)
3050 .addFrameIndex(TPIDR2.FrameIndex)
3051 .addImm(3);
3052 } else
3053 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3054
3055 BB->remove_instr(&MI);
3056 return BB;
3057}
3058
3061 MachineBasicBlock *BB) const {
3062 MachineFunction *MF = BB->getParent();
3063 MachineFrameInfo &MFI = MF->getFrameInfo();
3064  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3065  // TODO: This function grows the stack with a subtraction, which doesn't work
3066  // on Windows. Some refactoring to share the functionality in
3067  // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3068  // supports SME.
3070 "Lazy ZA save is not yet supported on Windows");
3071
3072 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3073
3074 if (TPIDR2.Uses > 0) {
3075 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3076    MachineRegisterInfo &MRI = MF->getRegInfo();
3077
3078    // The MSUBXrrr below can't take SP as an operand directly, so copy SP
3079    // into a virtual register first.
3080 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3081 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3082 .addReg(AArch64::SP);
3083
3084 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3085 auto Size = MI.getOperand(1).getReg();
3086 auto Dest = MI.getOperand(0).getReg();
3087 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3088 .addReg(Size)
3089 .addReg(Size)
3090 .addReg(SP);
3091 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3092 AArch64::SP)
3093 .addReg(Dest);
3094
3095 // We have just allocated a variable sized object, tell this to PEI.
3096 MFI.CreateVariableSizedObject(Align(16), nullptr);
3097 }
3098
3099 BB->remove_instr(&MI);
3100 return BB;
3101}
3102
3104 MachineInstr &MI, MachineBasicBlock *BB) const {
3105
3106 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3107 if (SMEOrigInstr != -1) {
3108 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3109 uint64_t SMEMatrixType =
3110 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3111 switch (SMEMatrixType) {
3113 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3115 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3117 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3119 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3121 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3123 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3124 }
3125 }
3126
3127 switch (MI.getOpcode()) {
3128 default:
3129#ifndef NDEBUG
3130 MI.dump();
3131#endif
3132 llvm_unreachable("Unexpected instruction for custom inserter!");
3133 case AArch64::InitTPIDR2Obj:
3134 return EmitInitTPIDR2Object(MI, BB);
3135 case AArch64::AllocateZABuffer:
3136 return EmitAllocateZABuffer(MI, BB);
3137 case AArch64::F128CSEL:
3138 return EmitF128CSEL(MI, BB);
3139 case TargetOpcode::STATEPOINT:
3140    // STATEPOINT is a pseudo instruction which has no implicit defs/uses,
3141    // whereas the BL call instruction it is eventually lowered to has an
3142    // implicit def of LR. This def is early-clobber as it is set at the
3143    // moment of the call, before any use is read.
3144    // Add this implicit dead def here as a workaround.
3145 MI.addOperand(*MI.getMF(),
3147 AArch64::LR, /*isDef*/ true,
3148 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3149 /*isUndef*/ false, /*isEarlyClobber*/ true));
3150 [[fallthrough]];
3151 case TargetOpcode::STACKMAP:
3152 case TargetOpcode::PATCHPOINT:
3153 return emitPatchPoint(MI, BB);
3154
3155 case TargetOpcode::PATCHABLE_EVENT_CALL:
3156 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3157 return BB;
3158
3159 case AArch64::CATCHRET:
3160 return EmitLoweredCatchRet(MI, BB);
3161
3162 case AArch64::PROBED_STACKALLOC_DYN:
3163 return EmitDynamicProbedAlloc(MI, BB);
3164
3165 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3166 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3167 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3168 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3169 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3170 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3171 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3172 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3173 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3174 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3175 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3176 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3177 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3178 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3179 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3180 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3181 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3182 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3183 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3184 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3185 case AArch64::LDR_ZA_PSEUDO:
3186 return EmitFill(MI, BB);
3187 case AArch64::LDR_TX_PSEUDO:
3188 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3189 case AArch64::STR_TX_PSEUDO:
3190 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3191 case AArch64::ZERO_M_PSEUDO:
3192 return EmitZero(MI, BB);
3193 case AArch64::ZERO_T_PSEUDO:
3194 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3195 }
3196}
3197
3198//===----------------------------------------------------------------------===//
3199// AArch64 Lowering private implementation.
3200//===----------------------------------------------------------------------===//
3201
3202//===----------------------------------------------------------------------===//
3203// Lowering Code
3204//===----------------------------------------------------------------------===//
3205
3206// Forward declarations of SVE fixed length lowering helpers
3211 SelectionDAG &DAG);
3214 EVT VT);
3215
3216/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3217static bool isZerosVector(const SDNode *N) {
3218 // Look through a bit convert.
3219 while (N->getOpcode() == ISD::BITCAST)
3220 N = N->getOperand(0).getNode();
3221
3222  if (ISD::isConstantSplatVectorAllZeros(N))
3223    return true;
3224
3225 if (N->getOpcode() != AArch64ISD::DUP)
3226 return false;
3227
3228 auto Opnd0 = N->getOperand(0);
3229 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3230}
3231
3232/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3233/// CC
3235 switch (CC) {
3236 default:
3237 llvm_unreachable("Unknown condition code!");
3238 case ISD::SETNE:
3239 return AArch64CC::NE;
3240 case ISD::SETEQ:
3241 return AArch64CC::EQ;
3242 case ISD::SETGT:
3243 return AArch64CC::GT;
3244 case ISD::SETGE:
3245 return AArch64CC::GE;
3246 case ISD::SETLT:
3247 return AArch64CC::LT;
3248 case ISD::SETLE:
3249 return AArch64CC::LE;
3250 case ISD::SETUGT:
3251 return AArch64CC::HI;
3252 case ISD::SETUGE:
3253 return AArch64CC::HS;
3254 case ISD::SETULT:
3255 return AArch64CC::LO;
3256 case ISD::SETULE:
3257 return AArch64CC::LS;
3258 }
3259}
3260
3261/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
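/// Some FP conditions have no single AArch64 equivalent and are returned as a
/// pair of condition codes to be OR'ed together; e.g. SETONE ("ordered and not
/// equal") maps to MI (less than) together with GT (greater than).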
3263 AArch64CC::CondCode &CondCode,
3264 AArch64CC::CondCode &CondCode2) {
3265 CondCode2 = AArch64CC::AL;
3266 switch (CC) {
3267 default:
3268 llvm_unreachable("Unknown FP condition!");
3269 case ISD::SETEQ:
3270 case ISD::SETOEQ:
3271 CondCode = AArch64CC::EQ;
3272 break;
3273 case ISD::SETGT:
3274 case ISD::SETOGT:
3275 CondCode = AArch64CC::GT;
3276 break;
3277 case ISD::SETGE:
3278 case ISD::SETOGE:
3279 CondCode = AArch64CC::GE;
3280 break;
3281 case ISD::SETOLT:
3282 CondCode = AArch64CC::MI;
3283 break;
3284 case ISD::SETOLE:
3285 CondCode = AArch64CC::LS;
3286 break;
3287 case ISD::SETONE:
3288 CondCode = AArch64CC::MI;
3289 CondCode2 = AArch64CC::GT;
3290 break;
3291 case ISD::SETO:
3292 CondCode = AArch64CC::VC;
3293 break;
3294 case ISD::SETUO:
3295 CondCode = AArch64CC::VS;
3296 break;
3297 case ISD::SETUEQ:
3298 CondCode = AArch64CC::EQ;
3299 CondCode2 = AArch64CC::VS;
3300 break;
3301 case ISD::SETUGT:
3302 CondCode = AArch64CC::HI;
3303 break;
3304 case ISD::SETUGE:
3305 CondCode = AArch64CC::PL;
3306 break;
3307 case ISD::SETLT:
3308 case ISD::SETULT:
3309 CondCode = AArch64CC::LT;
3310 break;
3311 case ISD::SETLE:
3312 case ISD::SETULE:
3313 CondCode = AArch64CC::LE;
3314 break;
3315 case ISD::SETNE:
3316 case ISD::SETUNE:
3317 CondCode = AArch64CC::NE;
3318 break;
3319 }
3320}
3321
3322/// Convert a DAG fp condition code to an AArch64 CC.
3323/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3324/// should be AND'ed instead of OR'ed.
3326 AArch64CC::CondCode &CondCode,
3327 AArch64CC::CondCode &CondCode2) {
3328 CondCode2 = AArch64CC::AL;
3329 switch (CC) {
3330 default:
3331 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3332 assert(CondCode2 == AArch64CC::AL);
3333 break;
3334 case ISD::SETONE:
3335 // (a one b)
3336 // == ((a olt b) || (a ogt b))
3337 // == ((a ord b) && (a une b))
3338 CondCode = AArch64CC::VC;
3339 CondCode2 = AArch64CC::NE;
3340 break;
3341 case ISD::SETUEQ:
3342 // (a ueq b)
3343 // == ((a uno b) || (a oeq b))
3344 // == ((a ule b) && (a uge b))
3345 CondCode = AArch64CC::PL;
3346 CondCode2 = AArch64CC::LE;
3347 break;
3348 }
3349}
3350
3351/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3352/// CC usable with the vector instructions. Fewer operations are available
3353/// without a real NZCV register, so we have to use less efficient combinations
3354/// to get the same effect.
3356 AArch64CC::CondCode &CondCode,
3357 AArch64CC::CondCode &CondCode2,
3358 bool &Invert) {
3359 Invert = false;
3360 switch (CC) {
3361 default:
3362 // Mostly the scalar mappings work fine.
3363 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3364 break;
3365 case ISD::SETUO:
3366 Invert = true;
3367 [[fallthrough]];
3368 case ISD::SETO:
3369 CondCode = AArch64CC::MI;
3370 CondCode2 = AArch64CC::GE;
3371 break;
3372 case ISD::SETUEQ:
3373 case ISD::SETULT:
3374 case ISD::SETULE:
3375 case ISD::SETUGT:
3376 case ISD::SETUGE:
3377 // All of the compare-mask comparisons are ordered, but we can switch
3378 // between the two by a double inversion. E.g. ULE == !OGT.
3379 Invert = true;
3380 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3381 CondCode, CondCode2);
3382 break;
3383 }
3384}
3385
3387 // Matches AArch64DAGToDAGISel::SelectArithImmed().
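  // A legal immediate is any value in the range 0-4095, or any such value
  // shifted left by 12 bits; e.g. 0xABC and 0xABC000 are legal, 0x1001 is not.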
3388 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3389 LLVM_DEBUG(dbgs() << "Is imm " << C
3390 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3391 return IsLegal;
3392}
3393
3394 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3395// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3396// can be set differently by this operation. It comes down to whether
3397// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3398// everything is fine. If not then the optimization is wrong. Thus general
3399// comparisons are only valid if op2 != 0.
3400//
3401// So, finally, the only LLVM-native comparisons that don't mention C and V
3402// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
3403// the absence of information about op2.
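// For example, (seteq X, (sub 0, Y)) can safely become "cmn X, Y": EQ/NE read
// only the Z flag, and Z depends only on the result value, which is the same
// for X - (-Y) and X + Y modulo 2^N even when negating Y overflows.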
3405 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3406 (CC == ISD::SETEQ || CC == ISD::SETNE);
3407}
3408
3410 SelectionDAG &DAG, SDValue Chain,
3411 bool IsSignaling) {
3412 EVT VT = LHS.getValueType();
3413 assert(VT != MVT::f128);
3414
3415 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3416
3417 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3418 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3419 {Chain, LHS});
3420 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3421 {LHS.getValue(1), RHS});
3422 Chain = RHS.getValue(1);
3423 VT = MVT::f32;
3424 }
3425  unsigned Opcode =
3426      IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3427 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
3428}
3429
3431 const SDLoc &dl, SelectionDAG &DAG) {
3432 EVT VT = LHS.getValueType();
3433 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3434
3435 if (VT.isFloatingPoint()) {
3436 assert(VT != MVT::f128);
3437 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3438 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3439 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3440 VT = MVT::f32;
3441 }
3442 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
3443 }
3444
3445 // The CMP instruction is just an alias for SUBS, and representing it as
3446 // SUBS means that it's possible to get CSE with subtract operations.
3447 // A later phase can perform the optimization of setting the destination
3448 // register to WZR/XZR if it ends up being unused.
3449 unsigned Opcode = AArch64ISD::SUBS;
3450
3451 if (isCMN(RHS, CC)) {
3452    // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3453 Opcode = AArch64ISD::ADDS;
3454 RHS = RHS.getOperand(1);
3455 } else if (isCMN(LHS, CC)) {
3456    // As we are looking for EQ/NE compares, the operands can be commuted; can
3457    // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3458 Opcode = AArch64ISD::ADDS;
3459 LHS = LHS.getOperand(1);
3460 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3461 if (LHS.getOpcode() == ISD::AND) {
3462 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3463 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3464 // of the signed comparisons.
3465 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3466 DAG.getVTList(VT, MVT_CC),
3467 LHS.getOperand(0),
3468 LHS.getOperand(1));
3469 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3470 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3471 return ANDSNode.getValue(1);
3472 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3473 // Use result of ANDS
3474 return LHS.getValue(1);
3475 }
3476 }
3477
3478 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3479 .getValue(1);
3480}
3481
3482/// \defgroup AArch64CCMP CMP;CCMP matching
3483///
3484/// These functions deal with the formation of CMP;CCMP;... sequences.
3485/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3486/// a comparison. They set the NZCV flags to a predefined value if their
3487 /// predicate is false. This allows us to express arbitrary conjunctions, for
3488/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3489/// expressed as:
3490/// cmp A
3491/// ccmp B, inv(CB), CA
3492/// check for CB flags
3493///
3494/// This naturally lets us implement chains of AND operations with SETCC
3495/// operands. And we can even implement some other situations by transforming
3496/// them:
3497/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3498 ///   negating the flags used in a CCMP/FCCMP operation.
3499/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3500/// by negating the flags we test for afterwards. i.e.
3501/// NEG (CMP CCMP CCCMP ...) can be implemented.
3502/// - Note that we can only ever negate all previously processed results.
3503/// What we can not implement by flipping the flags to test is a negation
3504/// of two sub-trees (because the negation affects all sub-trees emitted so
3505/// far, so the 2nd sub-tree we emit would also affect the first).
3506/// With those tools we can implement some OR operations:
3507/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3508/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3509/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3510/// elimination rules from earlier to implement the whole thing as a
3511/// CCMP/FCCMP chain.
3512///
3513 /// As a complete example:
3514 ///   or (or (setCA (cmp A)) (setCB (cmp B)))
3515 ///      (and (setCC (cmp C)) (setCD (cmp D)))
3516 /// can be reassociated to:
3517 ///   or (and (setCC (cmp C)) (setCD (cmp D)))
3518 ///      (or (setCA (cmp A)) (setCB (cmp B)))
3519 /// can be transformed to:
3520 ///   not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3521 ///        (and (not (setCA (cmp A))) (not (setCB (cmp B))))))
3522/// which can be implemented as:
3523/// cmp C
3524/// ccmp D, inv(CD), CC
3525/// ccmp A, CA, inv(CD)
3526/// ccmp B, CB, inv(CA)
3527/// check for CB flags
3528///
3529/// A counterexample is "or (and A B) (and C D)" which translates to
3530 ///   not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
3531 /// can only implement one of the inner (not) operations, but not both!
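///
/// As a concrete illustration (register names are illustrative only),
/// "(and (setlt a, 0) (setgt b, 1))" on i32 values can be emitted as a
/// sequence along the lines of:
///   cmp  w0, #0
///   ccmp w1, #1, #4, lt
///   cset w8, gt
/// where the #4 (Z set) flags make the final GT test fail whenever the first
/// comparison does not hold, so the second compare only matters when a < 0.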
3532/// @{
3533
3534/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3536 ISD::CondCode CC, SDValue CCOp,
3537 AArch64CC::CondCode Predicate,
3538 AArch64CC::CondCode OutCC,
3539 const SDLoc &DL, SelectionDAG &DAG) {
3540 unsigned Opcode = 0;
3541 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3542
3543 if (LHS.getValueType().isFloatingPoint()) {
3544 assert(LHS.getValueType() != MVT::f128);
3545 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3546 LHS.getValueType() == MVT::bf16) {
3547 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3548 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3549 }
3550 Opcode = AArch64ISD::FCCMP;
3551 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3552 APInt Imm = Const->getAPIntValue();
3553 if (Imm.isNegative() && Imm.sgt(-32)) {
3554 Opcode = AArch64ISD::CCMN;
3555 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3556 }
3557 } else if (RHS.getOpcode() == ISD::SUB) {
3558 SDValue SubOp0 = RHS.getOperand(0);
3559 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3560 // See emitComparison() on why we can only do this for SETEQ and SETNE.
3561 Opcode = AArch64ISD::CCMN;
3562 RHS = RHS.getOperand(1);
3563 }
3564 }
3565 if (Opcode == 0)
3566 Opcode = AArch64ISD::CCMP;
3567
3568 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3569  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3570  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3571 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3572 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3573}
3574
3575/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3576/// expressed as a conjunction. See \ref AArch64CCMP.
3577/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3578/// changing the conditions on the SETCC tests.
3579/// (this means we can call emitConjunctionRec() with
3580/// Negate==true on this sub-tree)
3581/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3582/// cannot do the negation naturally. We are required to
3583/// emit the subtree first in this case.
3584 /// \param WillNegate Is true if we are called when the result of this
3585/// subexpression must be negated. This happens when the
3586/// outer expression is an OR. We can use this fact to know
3587/// that we have a double negation (or (or ...) ...) that
3588/// can be implemented for free.
3589static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3590 bool &MustBeFirst, bool WillNegate,
3591 unsigned Depth = 0) {
3592 if (!Val.hasOneUse())
3593 return false;
3594 unsigned Opcode = Val->getOpcode();
3595 if (Opcode == ISD::SETCC) {
3596 if (Val->getOperand(0).getValueType() == MVT::f128)
3597 return false;
3598 CanNegate = true;
3599 MustBeFirst = false;
3600 return true;
3601 }
3602 // Protect against exponential runtime and stack overflow.
3603 if (Depth > 6)
3604 return false;
3605 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3606 bool IsOR = Opcode == ISD::OR;
3607 SDValue O0 = Val->getOperand(0);
3608 SDValue O1 = Val->getOperand(1);
3609 bool CanNegateL;
3610 bool MustBeFirstL;
3611 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3612 return false;
3613 bool CanNegateR;
3614 bool MustBeFirstR;
3615 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3616 return false;
3617
3618 if (MustBeFirstL && MustBeFirstR)
3619 return false;
3620
3621 if (IsOR) {
3622 // For an OR expression we need to be able to naturally negate at least
3623 // one side or we cannot do the transformation at all.
3624 if (!CanNegateL && !CanNegateR)
3625 return false;
3626      // If the result of the OR will be negated and we can naturally negate
3627      // the leaves, then this sub-tree as a whole negates naturally.
3628 CanNegate = WillNegate && CanNegateL && CanNegateR;
3629 // If we cannot naturally negate the whole sub-tree, then this must be
3630 // emitted first.
3631 MustBeFirst = !CanNegate;
3632 } else {
3633 assert(Opcode == ISD::AND && "Must be OR or AND");
3634 // We cannot naturally negate an AND operation.
3635 CanNegate = false;
3636 MustBeFirst = MustBeFirstL || MustBeFirstR;
3637 }
3638 return true;
3639 }
3640 return false;
3641}
3642
3643/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3644 /// of CCMP/FCCMP ops. See @ref AArch64CCMP.
3645 /// Tries to transform the given i1-producing node @p Val to a series of compare
3646 /// and conditional compare operations. @returns an NZCV-flags-producing node
3647 /// and sets @p OutCC to the flags that should be tested, or returns SDValue() if
3648 /// the transformation was not possible.
3649 /// \p Negate is true if we want this sub-tree to be negated just by changing
3650/// SETCC conditions.
3652 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3653 AArch64CC::CondCode Predicate) {
3654 // We're at a tree leaf, produce a conditional comparison operation.
3655 unsigned Opcode = Val->getOpcode();
3656 if (Opcode == ISD::SETCC) {
3657 SDValue LHS = Val->getOperand(0);
3658 SDValue RHS = Val->getOperand(1);
3659 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3660 bool isInteger = LHS.getValueType().isInteger();
3661 if (Negate)
3662 CC = getSetCCInverse(CC, LHS.getValueType());
3663 SDLoc DL(Val);
3664 // Determine OutCC and handle FP special case.
3665 if (isInteger) {
3666 OutCC = changeIntCCToAArch64CC(CC);
3667 } else {
3668 assert(LHS.getValueType().isFloatingPoint());
3669 AArch64CC::CondCode ExtraCC;
3670 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3671 // Some floating point conditions can't be tested with a single condition
3672 // code. Construct an additional comparison in this case.
3673 if (ExtraCC != AArch64CC::AL) {
3674 SDValue ExtraCmp;
3675 if (!CCOp.getNode())
3676 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3677 else
3678 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3679 ExtraCC, DL, DAG);
3680 CCOp = ExtraCmp;
3681 Predicate = ExtraCC;
3682 }
3683 }
3684
3685 // Produce a normal comparison if we are first in the chain
3686 if (!CCOp)
3687 return emitComparison(LHS, RHS, CC, DL, DAG);
3688 // Otherwise produce a ccmp.
3689 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3690 DAG);
3691 }
3692 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3693
3694 bool IsOR = Opcode == ISD::OR;
3695
3696 SDValue LHS = Val->getOperand(0);
3697 bool CanNegateL;
3698 bool MustBeFirstL;
3699 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3700 assert(ValidL && "Valid conjunction/disjunction tree");
3701 (void)ValidL;
3702
3703 SDValue RHS = Val->getOperand(1);
3704 bool CanNegateR;
3705 bool MustBeFirstR;
3706 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3707 assert(ValidR && "Valid conjunction/disjunction tree");
3708 (void)ValidR;
3709
3710 // Swap sub-tree that must come first to the right side.
3711 if (MustBeFirstL) {
3712 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3713 std::swap(LHS, RHS);
3714 std::swap(CanNegateL, CanNegateR);
3715 std::swap(MustBeFirstL, MustBeFirstR);
3716 }
3717
3718 bool NegateR;
3719 bool NegateAfterR;
3720 bool NegateL;
3721 bool NegateAfterAll;
3722 if (Opcode == ISD::OR) {
3723 // Swap the sub-tree that we can negate naturally to the left.
3724 if (!CanNegateL) {
3725 assert(CanNegateR && "at least one side must be negatable");
3726 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3727 assert(!Negate);
3728 std::swap(LHS, RHS);
3729 NegateR = false;
3730 NegateAfterR = true;
3731 } else {
3732 // Negate the left sub-tree if possible, otherwise negate the result.
3733 NegateR = CanNegateR;
3734 NegateAfterR = !CanNegateR;
3735 }
3736 NegateL = true;
3737 NegateAfterAll = !Negate;
3738 } else {
3739 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3740 assert(!Negate && "Valid conjunction/disjunction tree");
3741
3742 NegateL = false;
3743 NegateR = false;
3744 NegateAfterR = false;
3745 NegateAfterAll = false;
3746 }
3747
3748 // Emit sub-trees.
3749 AArch64CC::CondCode RHSCC;
3750 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3751 if (NegateAfterR)
3752 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3753 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3754 if (NegateAfterAll)
3755 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3756 return CmpL;
3757}
3758
3759 /// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
3760/// In some cases this is even possible with OR operations in the expression.
3761/// See \ref AArch64CCMP.
3762/// \see emitConjunctionRec().
3764 AArch64CC::CondCode &OutCC) {
3765 bool DummyCanNegate;
3766 bool DummyMustBeFirst;
3767 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3768 return SDValue();
3769
3770 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3771}
3772
3773/// @}
3774
3775/// Returns how profitable it is to fold a comparison's operand's shift and/or
3776/// extension operations.
3778 auto isSupportedExtend = [&](SDValue V) {
3779 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3780 return true;
3781
3782 if (V.getOpcode() == ISD::AND)
3783 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3784 uint64_t Mask = MaskCst->getZExtValue();
3785 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3786 }
3787
3788 return false;
3789 };
3790
3791 if (!Op.hasOneUse())
3792 return 0;
3793
3794 if (isSupportedExtend(Op))
3795 return 1;
3796
3797 unsigned Opc = Op.getOpcode();
3798 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3799 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3800 uint64_t Shift = ShiftCst->getZExtValue();
3801 if (isSupportedExtend(Op.getOperand(0)))
3802 return (Shift <= 4) ? 2 : 1;
3803 EVT VT = Op.getValueType();
3804 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3805 return 1;
3806 }
3807
3808 return 0;
3809}
3810
3812 SDValue &AArch64cc, SelectionDAG &DAG,
3813 const SDLoc &dl) {
3814 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3815 EVT VT = RHS.getValueType();
3816 uint64_t C = RHSC->getZExtValue();
3817 if (!isLegalArithImmed(C)) {
3818 // Constant does not fit, try adjusting it by one?
3819 switch (CC) {
3820 default:
3821 break;
3822 case ISD::SETLT:
3823 case ISD::SETGE:
3824 if ((VT == MVT::i32 && C != 0x80000000 &&
3825 isLegalArithImmed((uint32_t)(C - 1))) ||
3826 (VT == MVT::i64 && C != 0x80000000ULL &&
3827 isLegalArithImmed(C - 1ULL))) {
3828        CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3829        C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3830 RHS = DAG.getConstant(C, dl, VT);
3831 }
3832 break;
3833 case ISD::SETULT:
3834 case ISD::SETUGE:
3835 if ((VT == MVT::i32 && C != 0 &&
3836 isLegalArithImmed((uint32_t)(C - 1))) ||
3837 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3838        CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3839        C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3840 RHS = DAG.getConstant(C, dl, VT);
3841 }
3842 break;
3843 case ISD::SETLE:
3844 case ISD::SETGT:
3845 if ((VT == MVT::i32 && C != INT32_MAX &&
3846 isLegalArithImmed((uint32_t)(C + 1))) ||
3847 (VT == MVT::i64 && C != INT64_MAX &&
3848 isLegalArithImmed(C + 1ULL))) {
3849        CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3850        C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3851 RHS = DAG.getConstant(C, dl, VT);
3852 }
3853 break;
3854 case ISD::SETULE:
3855 case ISD::SETUGT:
3856 if ((VT == MVT::i32 && C != UINT32_MAX &&
3857 isLegalArithImmed((uint32_t)(C + 1))) ||
3858 (VT == MVT::i64 && C != UINT64_MAX &&
3859 isLegalArithImmed(C + 1ULL))) {
3860        CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3861        C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3862 RHS = DAG.getConstant(C, dl, VT);
3863 }
3864 break;
3865 }
3866 }
3867 }
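  // For example, (setlt x, 0x1001) uses an immediate that cannot be encoded,
  // but it is equivalent to (setle x, 0x1000), and 0x1000 is a legal shifted
  // 12-bit immediate, so the comparison is rewritten that way above.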
3868
3869 // Comparisons are canonicalized so that the RHS operand is simpler than the
3870 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3871 // can fold some shift+extend operations on the RHS operand, so swap the
3872 // operands if that can be done.
3873 //
3874 // For example:
3875 // lsl w13, w11, #1
3876 // cmp w13, w12
3877 // can be turned into:
3878 // cmp w12, w11, lsl #1
3879 if (!isa<ConstantSDNode>(RHS) || !isLegalArithImmed(RHS->getAsZExtVal())) {
3880 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3881
3882    if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
3883      std::swap(LHS, RHS);
3884      CC = ISD::getSetCCSwappedOperands(CC);
3885    }
3886 }
3887
3888 SDValue Cmp;
3889 AArch64CC::CondCode AArch64CC;
3890 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3891 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3892
3893 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3894 // For the i8 operand, the largest immediate is 255, so this can be easily
3895 // encoded in the compare instruction. For the i16 operand, however, the
3896 // largest immediate cannot be encoded in the compare.
3897 // Therefore, use a sign extending load and cmn to avoid materializing the
3898 // -1 constant. For example,
3899 // movz w1, #65535
3900 // ldrh w0, [x0, #0]
3901 // cmp w0, w1
3902 // >
3903 // ldrsh w0, [x0, #0]
3904 // cmn w0, #1
3905    // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3906 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3907 // ensure both the LHS and RHS are truly zero extended and to make sure the
3908 // transformation is profitable.
3909 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3910 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3911 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3912 LHS.getNode()->hasNUsesOfValue(1, 0)) {
3913 int16_t ValueofRHS = RHS->getAsZExtVal();
3914 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3915 SDValue SExt =
3916 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3917 DAG.getValueType(MVT::i16));
3918 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3919 RHS.getValueType()),
3920 CC, dl, DAG);
3921 AArch64CC = changeIntCCToAArch64CC(CC);
3922 }
3923 }
3924
3925 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3926 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3927 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3928 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3929 }
3930 }
3931 }
3932
3933 if (!Cmp) {
3934 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3935 AArch64CC = changeIntCCToAArch64CC(CC);
3936 }
3937 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3938 return Cmp;
3939}
3940
3941static std::pair<SDValue, SDValue>
3943 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3944 "Unsupported value type");
3945 SDValue Value, Overflow;
3946 SDLoc DL(Op);
3947 SDValue LHS = Op.getOperand(0);
3948 SDValue RHS = Op.getOperand(1);
3949 unsigned Opc = 0;
3950 switch (Op.getOpcode()) {
3951 default:
3952 llvm_unreachable("Unknown overflow instruction!");
3953 case ISD::SADDO:
3954 Opc = AArch64ISD::ADDS;
3955 CC = AArch64CC::VS;
3956 break;
3957 case ISD::UADDO:
3958 Opc = AArch64ISD::ADDS;
3959 CC = AArch64CC::HS;
3960 break;
3961 case ISD::SSUBO:
3962 Opc = AArch64ISD::SUBS;
3963 CC = AArch64CC::VS;
3964 break;
3965 case ISD::USUBO:
3966 Opc = AArch64ISD::SUBS;
3967 CC = AArch64CC::LO;
3968 break;
3969  // Multiply needs a little bit of extra work.
3970 case ISD::SMULO:
3971 case ISD::UMULO: {
3972 CC = AArch64CC::NE;
3973 bool IsSigned = Op.getOpcode() == ISD::SMULO;
3974 if (Op.getValueType() == MVT::i32) {
3975 // Extend to 64-bits, then perform a 64-bit multiply.
3976 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3977 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3978 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3979 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3980 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3981
3982 // Check that the result fits into a 32-bit integer.
3983 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3984 if (IsSigned) {
3985 // cmp xreg, wreg, sxtw
3986 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3987 Overflow =
3988 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3989 } else {
3990 // tst xreg, #0xffffffff00000000
3991 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3992 Overflow =
3993 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3994 }
3995 break;
3996 }
3997 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3998 // For the 64 bit multiply
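    // The overflow check compares the high 64 bits of the product
    // (MULHS/MULHU) with the value they must have when no overflow occurs:
    // the sign bits of the low half for signed multiplies, or zero for
    // unsigned multiplies.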
3999 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4000 if (IsSigned) {
4001 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4002 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4003 DAG.getConstant(63, DL, MVT::i64));
4004 // It is important that LowerBits is last, otherwise the arithmetic
4005 // shift will not be folded into the compare (SUBS).
4006 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
4007 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4008 .getValue(1);
4009 } else {
4010 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4011 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
4012 Overflow =
4013 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4014 DAG.getConstant(0, DL, MVT::i64),
4015 UpperBits).getValue(1);
4016 }
4017 break;
4018 }
4019 } // switch (...)
4020
4021 if (Opc) {
4022 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
4023
4024 // Emit the AArch64 operation with overflow check.
4025 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4026 Overflow = Value.getValue(1);
4027 }
4028 return std::make_pair(Value, Overflow);
4029}
4030
4031SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4032 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4033 !Subtarget->isNeonAvailable()))
4034 return LowerToScalableOp(Op, DAG);
4035
4036 SDValue Sel = Op.getOperand(0);
4037 SDValue Other = Op.getOperand(1);
4038 SDLoc dl(Sel);
4039
4040 // If the operand is an overflow checking operation, invert the condition
4041 // code and kill the Not operation. I.e., transform:
4042  //   (xor overflow_op_bool, 1)
4043 // -->
4044 // (csel 1, 0, invert(cc), overflow_op_bool)
4045 // ... which later gets transformed to just a cset instruction with an
4046 // inverted condition code, rather than a cset + eor sequence.
4047  if (isOverflowIntrOpRes(Sel)) {
4048    // Only lower legal XALUO ops.
4049    if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
4050      return SDValue();
4051
4052 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4053 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4054    AArch64CC::CondCode CC;
4055    SDValue Value, Overflow;
4056 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4057 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4058 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
4059 CCVal, Overflow);
4060 }
4061 // If neither operand is a SELECT_CC, give up.
4062 if (Sel.getOpcode() != ISD::SELECT_CC)
4063 std::swap(Sel, Other);
4064 if (Sel.getOpcode() != ISD::SELECT_CC)
4065 return Op;
4066
4067 // The folding we want to perform is:
4068 // (xor x, (select_cc a, b, cc, 0, -1) )
4069 // -->
4070 // (csel x, (xor x, -1), cc ...)
4071 //
4072 // The latter will get matched to a CSINV instruction.
4073
4074 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4075 SDValue LHS = Sel.getOperand(0);
4076 SDValue RHS = Sel.getOperand(1);
4077 SDValue TVal = Sel.getOperand(2);
4078 SDValue FVal = Sel.getOperand(3);
4079
4080 // FIXME: This could be generalized to non-integer comparisons.
4081 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4082 return Op;
4083
4084 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4085 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4086
4087 // The values aren't constants, this isn't the pattern we're looking for.
4088 if (!CFVal || !CTVal)
4089 return Op;
4090
4091 // We can commute the SELECT_CC by inverting the condition. This
4092 // might be needed to make this fit into a CSINV pattern.
4093 if (CTVal->isAllOnes() && CFVal->isZero()) {
4094 std::swap(TVal, FVal);
4095 std::swap(CTVal, CFVal);
4096 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4097 }
4098
4099 // If the constants line up, perform the transform!
4100 if (CTVal->isZero() && CFVal->isAllOnes()) {
4101 SDValue CCVal;
4102 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
4103
4104 FVal = Other;
4105 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
4106 DAG.getConstant(-1ULL, dl, Other.getValueType()));
4107
4108 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
4109 CCVal, Cmp);
4110 }
4111
4112 return Op;
4113}
4114
4115// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4116// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4117// sets 'C' bit to 0.
4119 SDLoc DL(Value);
4120 EVT VT = Value.getValueType();
4121 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4122 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4123 SDValue Cmp =
4124 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
4125 return Cmp.getValue(1);
4126}
4127
4128// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4129// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4131 bool Invert) {
4132 assert(Glue.getResNo() == 1);
4133 SDLoc DL(Glue);
4134 SDValue Zero = DAG.getConstant(0, DL, VT);
4135 SDValue One = DAG.getConstant(1, DL, VT);
4136 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4137 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
4138 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4139}
4140
4141// Value is 1 if 'V' bit of NZCV is 1, else 0
4143 assert(Glue.getResNo() == 1);
4144 SDLoc DL(Glue);
4145 SDValue Zero = DAG.getConstant(0, DL, VT);
4146 SDValue One = DAG.getConstant(1, DL, VT);
4147 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
4148 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4149}
4150
4151// This lowering is inefficient, but it will get cleaned up by
4152// `foldOverflowCheck`
4154 unsigned Opcode, bool IsSigned) {
4155 EVT VT0 = Op.getValue(0).getValueType();
4156 EVT VT1 = Op.getValue(1).getValueType();
4157
4158 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4159 return SDValue();
4160
4161 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4162 SDValue OpLHS = Op.getOperand(0);
4163 SDValue OpRHS = Op.getOperand(1);
4164 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4165
4166 SDLoc DL(Op);
4167 SDVTList VTs = DAG.getVTList(VT0, VT1);
4168
4169 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
4170 OpRHS, OpCarryIn);
4171
4172 SDValue OutFlag =
4173 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4174 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4175
4176 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
4177}
4178
4180 // Let legalize expand this if it isn't a legal type yet.
4181 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4182 return SDValue();
4183
4184 SDLoc dl(Op);
4185  AArch64CC::CondCode CC;
4186  // The actual operation that sets the overflow or carry flag.
4187 SDValue Value, Overflow;
4188 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4189
4190 // We use 0 and 1 as false and true values.
4191 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4192 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4193
4194 // We use an inverted condition, because the conditional select is inverted
4195 // too. This will allow it to be selected to a single instruction:
4196 // CSINC Wd, WZR, WZR, invert(cond).
4197 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4198 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
4199 CCVal, Overflow);
4200
4201 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4202 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4203}
4204
4205// Prefetch operands are:
4206// 1: Address to prefetch
4207// 2: bool isWrite
4208// 3: int locality (0 = no locality ... 3 = extreme locality)
4209// 4: bool isDataCache
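// For example, a read prefetch of data with locality 3 encodes as PrfOp 0
// (PLDL1KEEP), locality 0 sets the stream bit giving PrfOp 1 (PLDL1STRM), and
// a write prefetch of data with locality 2 gives PrfOp 18 (PSTL2KEEP).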
4211 SDLoc DL(Op);
4212 unsigned IsWrite = Op.getConstantOperandVal(2);
4213 unsigned Locality = Op.getConstantOperandVal(3);
4214 unsigned IsData = Op.getConstantOperandVal(4);
4215
4216 bool IsStream = !Locality;
4217 // When the locality number is set
4218 if (Locality) {
4219 // The front-end should have filtered out the out-of-range values
4220 assert(Locality <= 3 && "Prefetch locality out-of-range");
4221 // The locality degree is the opposite of the cache speed.
4222 // Put the number the other way around.
4223 // The encoding starts at 0 for level 1
4224 Locality = 3 - Locality;
4225 }
4226
4227  // Build the mask value encoding the expected behavior.
4228 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4229 (!IsData << 3) | // IsDataCache bit
4230 (Locality << 1) | // Cache level bits
4231 (unsigned)IsStream; // Stream bit
4232 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4233 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4234 Op.getOperand(1));
4235}
4236
4237SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4238 SelectionDAG &DAG) const {
4239 EVT VT = Op.getValueType();
4240 if (VT.isScalableVector())
4241 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4242
4243 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4244 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4245
4246 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4247 return SDValue();
4248}
4249
4250SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4251 SelectionDAG &DAG) const {
4252 EVT VT = Op.getValueType();
4253 if (VT.isScalableVector())
4254 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4255
4256 bool IsStrict = Op->isStrictFPOpcode();
4257 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4258 EVT SrcVT = SrcVal.getValueType();
4259 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4260
4261 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4262 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4263
4264 // Expand cases where the result type is BF16 but we don't have hardware
4265 // instructions to lower it.
4266 if (VT.getScalarType() == MVT::bf16 &&
4267 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4268 Subtarget->hasBF16())) {
4269 SDLoc dl(Op);
4270 SDValue Narrow = SrcVal;
4271 SDValue NaN;
4272 EVT I32 = SrcVT.changeElementType(MVT::i32);
4273 EVT F32 = SrcVT.changeElementType(MVT::f32);
4274 if (SrcVT.getScalarType() == MVT::f32) {
4275 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4276 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4277 if (!NeverSNaN) {
4278 // Set the quiet bit.
4279 NaN = DAG.getNode(ISD::OR, dl, I32, Narrow,
4280 DAG.getConstant(0x400000, dl, I32));
4281 }
4282 } else if (SrcVT.getScalarType() == MVT::f64) {
4283 Narrow = DAG.getNode(AArch64ISD::FCVTXN, dl, F32, Narrow);
4284 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4285 } else {
4286 return SDValue();
4287 }
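    // When rounding (i.e. not truncating), implement round-to-nearest-even on
    // the f32 bits: the bias added below is 0x7FFF plus bit 16 (the LSB that
    // will be kept), so discarded bits above 0x8000 always carry into the kept
    // part, bits below never do, and an exact tie of 0x8000 rounds to even.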
4288 if (!Trunc) {
4289 SDValue One = DAG.getConstant(1, dl, I32);
4290 SDValue Lsb = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4291 DAG.getShiftAmountConstant(16, I32, dl));
4292 Lsb = DAG.getNode(ISD::AND, dl, I32, Lsb, One);
4293 SDValue RoundingBias =
4294 DAG.getNode(ISD::ADD, dl, I32, DAG.getConstant(0x7fff, dl, I32), Lsb);
4295 Narrow = DAG.getNode(ISD::ADD, dl, I32, Narrow, RoundingBias);
4296 }
4297
4298    // Don't round if we had a NaN; we don't want to turn 0x7fffffff into
4299    // 0x80000000.
4300 if (NaN) {
4301 SDValue IsNaN = DAG.getSetCC(
4302 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4303 SrcVal, SrcVal, ISD::SETUO);
4304 Narrow = DAG.getSelect(dl, I32, IsNaN, NaN, Narrow);
4305 }
4306
4307 // Now that we have rounded, shift the bits into position.
4308 Narrow = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4309 DAG.getShiftAmountConstant(16, I32, dl));
4310 if (VT.isVector()) {
4311 EVT I16 = I32.changeVectorElementType(MVT::i16);
4312 Narrow = DAG.getNode(ISD::TRUNCATE, dl, I16, Narrow);
4313 return DAG.getNode(ISD::BITCAST, dl, VT, Narrow);
4314 }
4315 Narrow = DAG.getNode(ISD::BITCAST, dl, F32, Narrow);
4316 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
4317 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, dl)
4318 : Result;
4319 }
4320
4321 if (SrcVT != MVT::f128) {
4322 // Expand cases where the input is a vector bigger than NEON.
4324 return SDValue();
4325
4326 // It's legal except when f128 is involved
4327 return Op;
4328 }
4329
4330 return SDValue();
4331}
4332
4333SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4334 SelectionDAG &DAG) const {
4335 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4336 // Any additional optimization in this function should be recorded
4337 // in the cost tables.
4338 bool IsStrict = Op->isStrictFPOpcode();
4339 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4340 EVT VT = Op.getValueType();
4341
4342 if (VT.isScalableVector()) {
4343 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4344                          ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4345                          : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4346    return LowerToPredicatedOp(Op, DAG, Opcode);
4347 }
4348
4349 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4350 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4351 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4352
4353 unsigned NumElts = InVT.getVectorNumElements();
4354
4355 // f16 conversions are promoted to f32 when full fp16 is not supported.
4356 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4357 InVT.getVectorElementType() == MVT::bf16) {
4358 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4359 SDLoc dl(Op);
4360 if (IsStrict) {
4361 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4362 {Op.getOperand(0), Op.getOperand(1)});
4363 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4364 {Ext.getValue(1), Ext.getValue(0)});
4365 }
4366 return DAG.getNode(
4367 Op.getOpcode(), dl, Op.getValueType(),
4368 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
4369 }
4370
4371 uint64_t VTSize = VT.getFixedSizeInBits();
4372 uint64_t InVTSize = InVT.getFixedSizeInBits();
4373 if (VTSize < InVTSize) {
4374 SDLoc dl(Op);
4375 if (IsStrict) {
4377 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4378 {Op.getOperand(0), Op.getOperand(1)});
4379 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4380 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
4381 }
4382 SDValue Cv =
4383 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
4384 Op.getOperand(0));
4385 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4386 }
4387
4388 if (VTSize > InVTSize) {
4389 SDLoc dl(Op);
4390 MVT ExtVT =
4393 if (IsStrict) {
4394 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4395 {Op.getOperand(0), Op.getOperand(1)});
4396 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4397 {Ext.getValue(1), Ext.getValue(0)});
4398 }
4399 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
4400 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
4401 }
4402
4403 // Use a scalar operation for conversions between single-element vectors of
4404 // the same size.
4405 if (NumElts == 1) {
4406 SDLoc dl(Op);
4407 SDValue Extract = DAG.getNode(
4409 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4410 EVT ScalarVT = VT.getScalarType();
4411 if (IsStrict)
4412 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4413 {Op.getOperand(0), Extract});
4414 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4415 }
4416
4417 // Type changing conversions are illegal.
4418 return Op;
4419}
4420
4421SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4422 SelectionDAG &DAG) const {
4423 bool IsStrict = Op->isStrictFPOpcode();
4424 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4425
4426 if (SrcVal.getValueType().isVector())
4427 return LowerVectorFP_TO_INT(Op, DAG);
4428
4429 // f16 conversions are promoted to f32 when full fp16 is not supported.
4430 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4431 SrcVal.getValueType() == MVT::bf16) {
4432 SDLoc dl(Op);
4433 if (IsStrict) {
4434 SDValue Ext =
4435 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4436 {Op.getOperand(0), SrcVal});
4437 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4438 {Ext.getValue(1), Ext.getValue(0)});
4439 }
4440 return DAG.getNode(
4441 Op.getOpcode(), dl, Op.getValueType(),
4442 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4443 }
4444
4445 if (SrcVal.getValueType() != MVT::f128) {
4446 // It's legal except when f128 is involved
4447 return Op;
4448 }
4449
4450 return SDValue();
4451}
4452
4453SDValue
4454AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4455 SelectionDAG &DAG) const {
4456 // AArch64 FP-to-int conversions saturate to the destination element size, so
4457 // we can lower common saturating conversions to simple instructions.
4458 SDValue SrcVal = Op.getOperand(0);
4459 EVT SrcVT = SrcVal.getValueType();
4460 EVT DstVT = Op.getValueType();
4461 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4462
4463 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4464 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4465 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4466 assert(SatWidth <= DstElementWidth &&
4467 "Saturation width cannot exceed result width");
4468
4469 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4470 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4471 // types, so this is hard to reach.
4472 if (DstVT.isScalableVector())
4473 return SDValue();
4474
4475 EVT SrcElementVT = SrcVT.getVectorElementType();
4476
4477 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4478 if ((SrcElementVT == MVT::f16 &&
4479 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4480 SrcElementVT == MVT::bf16) {
4481 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4482 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
4483 SrcVT = F32VT;
4484 SrcElementVT = MVT::f32;
4485 SrcElementWidth = 32;
4486 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4487 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4488 return SDValue();
4489
4490 SDLoc DL(Op);
4491  // Expand to f64 if we are saturating to i64, to help keep the lanes the
4492  // same width and produce a fcvtzu.
4493 if (SatWidth == 64 && SrcElementWidth < 64) {
4494 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
4495 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
4496 SrcVT = F64VT;
4497 SrcElementVT = MVT::f64;
4498 SrcElementWidth = 64;
4499 }
4500 // Cases that we can emit directly.
4501 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
4502 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4503 DAG.getValueType(DstVT.getScalarType()));
4504
4505 // Otherwise we emit a cvt that saturates at the larger native bitwidth, then
4506 // saturate the result down to the requested width. This is only valid if the
4507 // native cvt is wider than the saturation width. For f64 sources, as we lack
4508 // vector i64 MIN/MAX, it can be simpler to scalarize (at least until sqxtn is selected).
4509 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4510 return SDValue();
4511
4512 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4513 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4514 DAG.getValueType(IntVT.getScalarType()));
4515 SDValue Sat;
4516 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4517 SDValue MinC = DAG.getConstant(
4518 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4519 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4520 SDValue MaxC = DAG.getConstant(
4521 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4522 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4523 } else {
4524 SDValue MinC = DAG.getConstant(
4525 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4526 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4527 }
4528
4529 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4530}
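// Illustrative sketch (not from the original source): for a saturating
// conversion such as fptosi.sat from v4f32 to v4i8 (SatWidth = 8), the code
// above emits the native-width convert and then clamps and narrows, roughly:
//   %cvt = FP_TO_SINT_SAT v4f32 %x to v4i32   ; typically selected as fcvtzs
//   %min = SMIN %cvt, splat(127)
//   %sat = SMAX %min, splat(-128)
//   %res = TRUNCATE %sat to v4i8
// The unsigned variant instead uses a single UMIN with splat(255).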
4531
4532SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4533 SelectionDAG &DAG) const {
4534 // AArch64 FP-to-int conversions saturate to the destination register size, so
4535 // we can lower common saturating conversions to simple instructions.
4536 SDValue SrcVal = Op.getOperand(0);
4537 EVT SrcVT = SrcVal.getValueType();
4538
4539 if (SrcVT.isVector())
4540 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4541
4542 EVT DstVT = Op.getValueType();
4543 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4544 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4545 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4546 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4547
4548 // Promote f16 to f32 when full fp16 is not supported (bf16 is always promoted), then saturate the result.
4549 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4550 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4551 SrcVT = MVT::f32;
4552 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4553 SrcVT != MVT::bf16)
4554 return SDValue();
4555
4556 SDLoc DL(Op);
4557 // Cases that we can emit directly.
4558 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4559 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4560 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4561 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4562 DAG.getValueType(DstVT));
4563
4564 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4565 // result. This is only valid if the legal cvt is larger than the saturate
4566 // width.
4567 if (DstWidth < SatWidth)
4568 return SDValue();
4569
4570 SDValue NativeCvt =
4571 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4572 SDValue Sat;
4573 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4574 SDValue MinC = DAG.getConstant(
4575 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4576 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4577 SDValue MaxC = DAG.getConstant(
4578 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4579 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4580 } else {
4581 SDValue MinC = DAG.getConstant(
4582 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4583 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4584 }
4585
4586 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4587}
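// Illustrative sketch (not from the original source): a scalar
// fptosi.sat.i32.f32 maps directly onto a single fcvtzs, since the AArch64
// convert instructions already saturate to the destination width. A narrower
// saturation width such as i8 (with the result promoted to i32 by type
// legalization) takes the path above: fcvtzs to i32, then clamp to
// [-128, 127] with SMIN/SMAX.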
4588
4589SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
4590 SelectionDAG &DAG) const {
4591 EVT VT = Op.getValueType();
4592 SDValue Src = Op.getOperand(0);
4593 SDLoc DL(Op);
4594
4595 assert(VT.isVector() && "Expected vector type");
4596
4597 EVT CastVT =
4598 VT.changeVectorElementType(Src.getValueType().getVectorElementType());
4599
4600 // Round the floating-point value into a floating-point register with the
4601 // current rounding mode.
4602 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
4603
4604 // Truncate the rounded floating point to an integer.
4605 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
4606 DAG.getValueType(VT.getVectorElementType()));
4607}
4608
4609SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4610 SelectionDAG &DAG) const {
4611 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4612 // Any additional optimization in this function should be recorded
4613 // in the cost tables.
4614 bool IsStrict = Op->isStrictFPOpcode();
4615 EVT VT = Op.getValueType();
4616 SDLoc dl(Op);
4617 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
4618 EVT InVT = In.getValueType();
4619 unsigned Opc = Op.getOpcode();
4620 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4621
4622 if (VT.isScalableVector()) {
4623 if (InVT.getVectorElementType() == MVT::i1) {
4624 // An SVE predicate (i1 elements) can't be converted directly; sign/zero-extend it to an integer vector first.
4625 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4626 EVT CastVT = getPromotedVTForPredicate(InVT);
4627 In = DAG.getNode(CastOpc, dl, CastVT, In);
4628 return DAG.getNode(Opc, dl, VT, In);
4629 }
4630
4631 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4632 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
4633 return LowerToPredicatedOp(Op, DAG, Opcode);
4634 }
4635
4636 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4637 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4638 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4639
4640 // Promote bf16 conversions to f32.
4641 if (VT.getVectorElementType() == MVT::bf16) {
4642 EVT F32 = VT.changeElementType(MVT::f32);
4643 if (IsStrict) {
4644 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {F32, MVT::Other},
4645 {Op.getOperand(0), In});
4646 return DAG.getNode(
4647 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4648 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4649 }
4650 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4651 DAG.getNode(Op.getOpcode(), dl, F32, In),
4652 DAG.getIntPtrConstant(0, dl));
4653 }
4654
4655 uint64_t VTSize = VT.getFixedSizeInBits();
4656 uint64_t InVTSize = InVT.getFixedSizeInBits();
4657 if (VTSize < InVTSize) {
4658 MVT CastVT =
4659 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
4660 InVT.getVectorNumElements());
4661 if (IsStrict) {
4662 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4663 {Op.getOperand(0), In});
4664 return DAG.getNode(
4665 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4666 {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4667 }
4668 In = DAG.getNode(Opc, dl, CastVT, In);
4669 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
4670 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
4671 }
4672
4673 if (VTSize > InVTSize) {
4674 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4675 EVT CastVT = VT.changeVectorElementTypeToInteger();
4676 In = DAG.getNode(CastOpc, dl, CastVT, In);
4677 if (IsStrict)
4678 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4679 return DAG.getNode(Opc, dl, VT, In);
4680 }
4681
4682 // Use a scalar operation for conversions between single-element vectors of
4683 // the same size.
4684 if (VT.getVectorNumElements() == 1) {
4685 SDValue Extract = DAG.getNode(
4686 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4687 In, DAG.getConstant(0, dl, MVT::i64));
4688 EVT ScalarVT = VT.getScalarType();
4689 if (IsStrict)
4690 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4691 {Op.getOperand(0), Extract});
4692 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4693 }
4694
4695 return Op;
4696}
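// Illustrative sketch (not from the original source): for
//   v2f64 sitofp v2i32 %x
// VTSize (128) > InVTSize (64), so the input is sign-extended to v2i64 and a
// single scvtf (v2i64 -> v2f64) is emitted. In the opposite direction,
// v2f32 sitofp v2i64 converts at v2f64 first and is then narrowed with
// fp_round.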
4697
4698SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4699 SelectionDAG &DAG) const {
4700 if (Op.getValueType().isVector())
4701 return LowerVectorINT_TO_FP(Op, DAG);
4702
4703 bool IsStrict = Op->isStrictFPOpcode();
4704 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4705
4706 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
4707 Op->getOpcode() == ISD::SINT_TO_FP;
4708
4709 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
4710 SDLoc dl(Op);
4711 if (IsStrict) {
4712 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
4713 {Op.getOperand(0), SrcVal});
4714 return DAG.getNode(
4715 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4716 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4717 }
4718 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4719 DAG.getNode(Op.getOpcode(), dl, PromoteVT, SrcVal),
4720 DAG.getIntPtrConstant(0, dl));
4721 };
4722
4723 if (Op.getValueType() == MVT::bf16) {
4724 unsigned MaxWidth = IsSigned
4725 ? DAG.ComputeMaxSignificantBits(SrcVal)
4726 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
4727 // Promote to f32 when the significant bits fit in f32's 24-bit mantissa (e.g. any i16), so the intermediate conversion is exact.
4728 if (MaxWidth <= 24) {
4729 return IntToFpViaPromotion(MVT::f32);
4730 }
4731
4732 // Promote to f64 when the significant bits fit in f64's 53-bit mantissa (e.g. any i32), so the intermediate conversion is exact.
4733 if (MaxWidth <= 53) {
4734 return IntToFpViaPromotion(MVT::f64);
4735 }
4736
4737 // We need to be careful about i64 -> bf16.
4738 // Consider the 32-bit value 22216703.
4739 // This number cannot be represented exactly as an f32, so an itofp will
4740 // round it to 22216704.0, and an fptrunc to bf16 then turns that into
4741 // 22282240.0, whereas the correctly rounded bf16 result is 22151168.0.
4742 // We need to use sticky rounding to get this correct.
4743 if (SrcVal.getValueType() == MVT::i64) {
4744 SDLoc DL(Op);
4745 // This algorithm is equivalent to the following:
4746 // uint64_t SrcHi = SrcVal & ~0xfffull;
4747 // uint64_t SrcLo = SrcVal & 0xfffull;
4748 // uint64_t Highest = SrcVal >> 53;
4749 // bool HasHighest = Highest != 0;
4750 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
4751 // double Rounded = static_cast<double>(ToRound);
4752 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
4753 // uint64_t HasLo = SrcLo != 0;
4754 // bool NeedsAdjustment = HasHighest & HasLo;
4755 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
4756 // double Adjusted = std::bit_cast<double>(AdjustedBits);
4757 // return static_cast<__bf16>(Adjusted);
4758 //
4759 // Essentially, what happens is that SrcVal either fits perfectly in a
4760 // double-precision value or it is too big. If it is sufficiently small,
4761 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
4762 // ensure that u64 -> double has no rounding error by only using the 52
4763 // MSB of the input. The low order bits will get merged into a sticky bit
4764 // which will avoid issues incurred by double rounding.
4765
4766 // Signed conversion is more or less like so:
4767 // copysign((__bf16)abs(SrcVal), SrcVal)
4768 SDValue SignBit;
4769 if (IsSigned) {
4770 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4771 DAG.getConstant(1ull << 63, DL, MVT::i64));
4772 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
4773 }
4774 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4775 DAG.getConstant(~0xfffull, DL, MVT::i64));
4776 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4777 DAG.getConstant(0xfffull, DL, MVT::i64));
4778 SDValue Highest =
4779 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
4780 DAG.getShiftAmountConstant(53, MVT::i64, DL));
4781 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
4782 SDValue ToRound =
4783 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
4784 SDValue Rounded =
4785 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
4786 {Op.getOperand(0), ToRound})
4787 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
4788
4789 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
4790 if (SignBit) {
4791 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
4792 }
4793
4794 SDValue HasHighest = DAG.getSetCC(
4795 DL,
4796 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4797 Highest, Zero64, ISD::SETNE);
4798
4799 SDValue HasLo = DAG.getSetCC(
4800 DL,
4801 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4802 SrcLo, Zero64, ISD::SETNE);
4803
4804 SDValue NeedsAdjustment =
4805 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
4806 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
4807
4808 SDValue AdjustedBits =
4809 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
4810 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
4811 return IsStrict
4812 ? DAG.getNode(ISD::STRICT_FP_ROUND, DL,
4813 {Op.getValueType(), MVT::Other},
4814 {Rounded.getValue(1), Adjusted,
4815 DAG.getIntPtrConstant(0, DL)})
4816 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
4817 DAG.getIntPtrConstant(0, DL, true));
4818 }
4819 }
4820
4821 // f16 conversions are promoted to f32 when full fp16 is not supported.
4822 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4823 return IntToFpViaPromotion(MVT::f32);
4824 }
4825
4826 // i128 conversions are libcalls.
4827 if (SrcVal.getValueType() == MVT::i128)
4828 return SDValue();
4829
4830 // Other conversions are legal, unless it's to the completely software-based
4831 // fp128.
4832 if (Op.getValueType() != MVT::f128)
4833 return Op;
4834 return SDValue();
4835}
4836
4837SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4838 SelectionDAG &DAG) const {
4839 // For iOS, we want to call an alternative entry point: __sincos_stret,
4840 // which returns the values in two S / D registers.
4841 SDLoc dl(Op);
4842 SDValue Arg = Op.getOperand(0);
4843 EVT ArgVT = Arg.getValueType();
4844 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
4845
4846 ArgListTy Args;
4847 ArgListEntry Entry;
4848
4849 Entry.Node = Arg;
4850 Entry.Ty = ArgTy;
4851 Entry.IsSExt = false;
4852 Entry.IsZExt = false;
4853 Args.push_back(Entry);
4854
4855 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4856 : RTLIB::SINCOS_STRET_F32;
4857 const char *LibcallName = getLibcallName(LC);
4858 SDValue Callee =
4859 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
4860
4861 StructType *RetTy = StructType::get(ArgTy, ArgTy);
4862 TargetLowering::CallLoweringInfo CLI(DAG);
4863 CLI.setDebugLoc(dl)
4864 .setChain(DAG.getEntryNode())
4865 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
4866
4867 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4868 return CallResult.first;
4869}
4870
4871static MVT getSVEContainerType(EVT ContentTy);
4872
4873SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4874 SelectionDAG &DAG) const {
4875 EVT OpVT = Op.getValueType();
4876 EVT ArgVT = Op.getOperand(0).getValueType();
4877
4879 return LowerFixedLengthBitcastToSVE(Op, DAG);
4880
4881 if (OpVT.isScalableVector()) {
4882 // Bitcasting between unpacked vector types of different element counts is
4883 // not a NOP because the live elements are laid out differently.
4884 // 01234567
4885 // e.g. nxv2i32 = XX??XX??
4886 // nxv4f16 = X?X?X?X?
4887 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4888 return SDValue();
4889
4890 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
4891 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4892 "Expected int->fp bitcast!");
4893 SDValue ExtResult =
4894 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
4895 Op.getOperand(0));
4896 return getSVESafeBitCast(OpVT, ExtResult, DAG);
4897 }
4898 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
4899 }
4900
4901 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4902 return SDValue();
4903
4904 // Bitcasts between f16 and bf16 are legal.
4905 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4906 return Op;
4907
4908 assert(ArgVT == MVT::i16);
4909 SDLoc DL(Op);
4910
4911 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4912 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
4913 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
4914}
4915
4916static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4917 if (OrigVT.getSizeInBits() >= 64)
4918 return OrigVT;
4919
4920 assert(OrigVT.isSimple() && "Expecting a simple value type");
4921
4922 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4923 switch (OrigSimpleTy) {
4924 default: llvm_unreachable("Unexpected Vector Type");
4925 case MVT::v2i8:
4926 case MVT::v2i16:
4927 return MVT::v2i32;
4928 case MVT::v4i8:
4929 return MVT::v4i16;
4930 }
4931}
4932
4933static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
4934 const EVT &OrigTy,
4935 const EVT &ExtTy,
4936 unsigned ExtOpcode) {
4937 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
4938 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
4939 // 64-bits we need to insert a new extension so that it will be 64-bits.
4940 assert(ExtTy.is128BitVector() && "Unexpected extension size");
4941 if (OrigTy.getSizeInBits() >= 64)
4942 return N;
4943
4944 // Must extend size to at least 64 bits to be used as an operand for VMULL.
4945 EVT NewVT = getExtensionTo64Bits(OrigTy);
4946
4947 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
4948}
4949
4950// Returns lane if Op extracts from a two-element vector and lane is constant
4951// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
4952static std::optional<uint64_t>
4954 SDNode *OpNode = Op.getNode();
4955 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4956 return std::nullopt;
4957
4958 EVT VT = OpNode->getOperand(0).getValueType();
4959 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
4960 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4961 return std::nullopt;
4962
4963 return C->getZExtValue();
4964}
4965
4966static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
4967 bool isSigned) {
4968 EVT VT = N.getValueType();
4969
4970 if (N.getOpcode() != ISD::BUILD_VECTOR)
4971 return false;
4972
4973 for (const SDValue &Elt : N->op_values()) {
4974 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
4975 unsigned EltSize = VT.getScalarSizeInBits();
4976 unsigned HalfSize = EltSize / 2;
4977 if (isSigned) {
4978 if (!isIntN(HalfSize, C->getSExtValue()))
4979 return false;
4980 } else {
4981 if (!isUIntN(HalfSize, C->getZExtValue()))
4982 return false;
4983 }
4984 continue;
4985 }
4986 return false;
4987 }
4988
4989 return true;
4990}
4991
4992static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
4993 EVT VT = N.getValueType();
4994 assert(VT.is128BitVector() && "Unexpected vector MULL size");
4995
4996 unsigned NumElts = VT.getVectorNumElements();
4997 unsigned OrigEltSize = VT.getScalarSizeInBits();
4998 unsigned EltSize = OrigEltSize / 2;
4999 MVT TruncVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
5000
5001 APInt HiBits = APInt::getHighBitsSet(OrigEltSize, EltSize);
5002 if (DAG.MaskedValueIsZero(N, HiBits))
5003 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, N);
5004
5005 if (ISD::isExtOpcode(N.getOpcode()))
5006 return addRequiredExtensionForVectorMULL(N.getOperand(0), DAG,
5007 N.getOperand(0).getValueType(), VT,
5008 N.getOpcode());
5009
5010 assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
5011 SDLoc dl(N);
5012 SmallVector<SDValue, 16> Ops;
5013 for (unsigned i = 0; i != NumElts; ++i) {
5014 const APInt &CInt = N.getConstantOperandAPInt(i);
5015 // Element types smaller than 32 bits are not legal, so use i32 elements.
5016 // The values are implicitly truncated so sext vs. zext doesn't matter.
5017 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
5018 }
5019 return DAG.getBuildVector(TruncVT, dl, Ops);
5020}
5021
5022static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
5023 return N.getOpcode() == ISD::SIGN_EXTEND ||
5024 N.getOpcode() == ISD::ANY_EXTEND ||
5025 isExtendedBUILD_VECTOR(N, DAG, true);
5026}
5027
5028static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
5029 return N.getOpcode() == ISD::ZERO_EXTEND ||
5030 N.getOpcode() == ISD::ANY_EXTEND ||
5031 isExtendedBUILD_VECTOR(N, DAG, false);
5032}
5033
5034static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
5035 unsigned Opcode = N.getOpcode();
5036 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5037 SDValue N0 = N.getOperand(0);
5038 SDValue N1 = N.getOperand(1);
5039 return N0->hasOneUse() && N1->hasOneUse() &&
5040 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5041 }
5042 return false;
5043}
5044
5045static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
5046 unsigned Opcode = N.getOpcode();
5047 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5048 SDValue N0 = N.getOperand(0);
5049 SDValue N1 = N.getOperand(1);
5050 return N0->hasOneUse() && N1->hasOneUse() &&
5051 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5052 }
5053 return false;
5054}
5055
5056SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5057 SelectionDAG &DAG) const {
5058 // The rounding mode is in bits 23:22 of the FPCR.
5059 // The AArch64 rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
5060 // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
5061 // so that the shift + and get folded into a bitfield extract.
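// Worked out (illustrative): for FPCR.RMode = 3 (round toward zero) the
// expression is (((3 << 22) + (1 << 22)) >> 22) & 3 = 0, and similarly
// 0 (nearest) -> 1, 1 (+inf) -> 2, 2 (-inf) -> 3, matching FLT_ROUNDS.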
5062 SDLoc dl(Op);
5063
5064 SDValue Chain = Op.getOperand(0);
5065 SDValue FPCR_64 = DAG.getNode(
5066 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
5067 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
5068 Chain = FPCR_64.getValue(1);
5069 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
5070 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
5071 DAG.getConstant(1U << 22, dl, MVT::i32));
5072 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
5073 DAG.getConstant(22, dl, MVT::i32));
5074 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
5075 DAG.getConstant(3, dl, MVT::i32));
5076 return DAG.getMergeValues({AND, Chain}, dl);
5077}
5078
5079SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5080 SelectionDAG &DAG) const {
5081 SDLoc DL(Op);
5082 SDValue Chain = Op->getOperand(0);
5083 SDValue RMValue = Op->getOperand(1);
5084
5085 // The rounding mode is in bits 23:22 of the FPCR.
5086 // The mapping from the llvm.set.rounding argument to the rounding-mode field
5087 // in FPCR is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5088 // (((arg - 1) & 3) << 22).
5089 //
5090 // The argument of llvm.set.rounding must be within the range [0, 3], so
5091 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5092 // code that generates llvm.set.rounding to ensure this condition.
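// Worked out (illustrative): arg 0 (toward zero) -> (0 - 1) & 3 = 3 = RZ,
// arg 1 (nearest) -> 0 = RN, arg 2 (toward +inf) -> 1 = RP, and
// arg 3 (toward -inf) -> 2 = RM, which is then shifted into FPCR[23:22].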
5093
5094 // Calculate new value of FPCR[23:22].
5095 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5096 DAG.getConstant(1, DL, MVT::i32));
5097 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5098 DAG.getConstant(0x3, DL, MVT::i32));
5099 RMValue =
5100 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5101 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5102 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5103
5104 // Get current value of FPCR.
5105 SDValue Ops[] = {
5106 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5107 SDValue FPCR =
5108 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5109 Chain = FPCR.getValue(1);
5110 FPCR = FPCR.getValue(0);
5111
5112 // Put the new rounding mode into FPCR[23:22].
5113 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5114 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5115 DAG.getConstant(RMMask, DL, MVT::i64));
5116 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5117 SDValue Ops2[] = {
5118 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5119 FPCR};
5120 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5121}
5122
5123SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5124 SelectionDAG &DAG) const {
5125 SDLoc DL(Op);
5126 SDValue Chain = Op->getOperand(0);
5127
5128 // Get current value of FPCR.
5129 SDValue Ops[] = {
5130 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5131 SDValue FPCR =
5132 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5133 Chain = FPCR.getValue(1);
5134 FPCR = FPCR.getValue(0);
5135
5136 // Truncate FPCR to 32 bits.
5137 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5138
5139 return DAG.getMergeValues({Result, Chain}, DL);
5140}
5141
5142SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5143 SelectionDAG &DAG) const {
5144 SDLoc DL(Op);
5145 SDValue Chain = Op->getOperand(0);
5146 SDValue Mode = Op->getOperand(1);
5147
5148 // Extend the specified value to 64 bits.
5149 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5150
5151 // Set new value of FPCR.
5152 SDValue Ops2[] = {
5153 Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
5154 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5155}
5156
5157SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5158 SelectionDAG &DAG) const {
5159 SDLoc DL(Op);
5160 SDValue Chain = Op->getOperand(0);
5161
5162 // Get current value of FPCR.
5163 SDValue Ops[] = {
5164 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5165 SDValue FPCR =
5166 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5167 Chain = FPCR.getValue(1);
5168 FPCR = FPCR.getValue(0);
5169
5170 // Clear bits that are not reserved.
5171 SDValue FPSCRMasked = DAG.getNode(
5172 ISD::AND, DL, MVT::i64, FPCR,
5174
5175 // Set new value of FPCR.
5176 SDValue Ops2[] = {Chain,
5177 DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5178 FPSCRMasked};
5179 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5180}
5181
5182static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5183 SDLoc DL, bool &IsMLA) {
5184 bool IsN0SExt = isSignExtended(N0, DAG);
5185 bool IsN1SExt = isSignExtended(N1, DAG);
5186 if (IsN0SExt && IsN1SExt)
5187 return AArch64ISD::SMULL;
5188
5189 bool IsN0ZExt = isZeroExtended(N0, DAG);
5190 bool IsN1ZExt = isZeroExtended(N1, DAG);
5191
5192 if (IsN0ZExt && IsN1ZExt)
5193 return AArch64ISD::UMULL;
5194
5195 // Select SMULL if we can replace zext with sext.
5196 if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
5197 !isExtendedBUILD_VECTOR(N0, DAG, false) &&
5198 !isExtendedBUILD_VECTOR(N1, DAG, false)) {
5199 SDValue ZextOperand;
5200 if (IsN0ZExt)
5201 ZextOperand = N0.getOperand(0);
5202 else
5203 ZextOperand = N1.getOperand(0);
5204 if (DAG.SignBitIsZero(ZextOperand)) {
5205 SDValue NewSext =
5206 DAG.getSExtOrTrunc(ZextOperand, DL, N0.getValueType());
5207 if (IsN0ZExt)
5208 N0 = NewSext;
5209 else
5210 N1 = NewSext;
5211 return AArch64ISD::SMULL;
5212 }
5213 }
5214
5215 // Select UMULL if we can replace the other operand with an extend.
5216 if (IsN0ZExt || IsN1ZExt) {
5217 EVT VT = N0.getValueType();
5218 APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(),
5219 VT.getScalarSizeInBits() / 2);
5220 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5221 return AArch64ISD::UMULL;
5222 }
5223
5224 if (!IsN1SExt && !IsN1ZExt)
5225 return 0;
5226
5227 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5228 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5229 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5230 IsMLA = true;
5231 return AArch64ISD::SMULL;
5232 }
5233 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5234 IsMLA = true;
5235 return AArch64ISD::UMULL;
5236 }
5237 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5238 std::swap(N0, N1);
5239 IsMLA = true;
5240 return AArch64ISD::UMULL;
5241 }
5242 return 0;
5243}
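// Illustrative sketch (not from the original source): for
//   v2i64 mul (sext v2i32 %a), (sext v2i32 %b)
// both operands are sign-extended, so selectUmullSmull returns
// AArch64ISD::SMULL and LowerMUL below emits a single smull (v2i32 -> v2i64)
// instead of expanding the illegal 64-bit element multiply. A sext/zext mix
// can still use SMULL when the zero-extended operand is known non-negative,
// or UMULL when the other operand's high half bits are known to be zero.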
5244
5245SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5246 EVT VT = Op.getValueType();
5247
5248 bool OverrideNEON = !Subtarget->isNeonAvailable();
5249 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5250 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5251
5252 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5253 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5254 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5255 "unexpected type for custom-lowering ISD::MUL");
5256 SDValue N0 = Op.getOperand(0);
5257 SDValue N1 = Op.getOperand(1);
5258 bool isMLA = false;
5259 EVT OVT = VT;
5260 if (VT.is64BitVector()) {
5261 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5262 isNullConstant(N0.getOperand(1)) &&
5263 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5264 isNullConstant(N1.getOperand(1))) {
5265 N0 = N0.getOperand(0);
5266 N1 = N1.getOperand(0);
5267 VT = N0.getValueType();
5268 } else {
5269 if (VT == MVT::v1i64) {
5270 if (Subtarget->hasSVE())
5271 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5272 // Fall through to expand this. It is not legal.
5273 return SDValue();
5274 } else
5275 // Other vector multiplications are legal.
5276 return Op;
5277 }
5278 }
5279
5280 SDLoc DL(Op);
5281 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5282
5283 if (!NewOpc) {
5284 if (VT.getVectorElementType() == MVT::i64) {
5285 // If SVE is available then i64 vector multiplications can also be made
5286 // legal.
5287 if (Subtarget->hasSVE())
5288 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5289 // Fall through to expand this. It is not legal.
5290 return SDValue();
5291 } else
5292 // Other vector multiplications are legal.
5293 return Op;
5294 }
5295
5296 // Legalize to a S/UMULL instruction
5297 SDValue Op0;
5298 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5299 if (!isMLA) {
5300 Op0 = skipExtensionForVectorMULL(N0, DAG);
5301 assert(Op0.getValueType().is64BitVector() &&
5302 Op1.getValueType().is64BitVector() &&
5303 "unexpected types for extended operands to VMULL");
5304 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5305 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5306 DAG.getConstant(0, DL, MVT::i64));
5307 }
5308 // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
5309 // isel lowering to take advantage of no-stall back-to-back s/umul + s/umla.
5310 // This is beneficial on CPUs with accumulate forwarding such as Cortex-A53/A57.
5311 SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5312 SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5313 EVT Op1VT = Op1.getValueType();
5314 return DAG.getNode(
5315 ISD::EXTRACT_SUBVECTOR, DL, OVT,
5316 DAG.getNode(N0.getOpcode(), DL, VT,
5317 DAG.getNode(NewOpc, DL, VT,
5318 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5319 DAG.getNode(NewOpc, DL, VT,
5320 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5321 DAG.getConstant(0, DL, MVT::i64));
5322}
5323
5324static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5325 int Pattern) {
5326 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5327 return DAG.getConstant(1, DL, MVT::nxv1i1);
5328 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5329 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5330}
5331
5332static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG,
5333 bool IsSigned, bool IsEqual) {
5334 if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
5335 !isa<ConstantSDNode>(Op.getOperand(2)))
5336 return SDValue();
5337
5338 SDLoc dl(Op);
5339 APInt X = Op.getConstantOperandAPInt(1);
5340 APInt Y = Op.getConstantOperandAPInt(2);
5341 bool Overflow;
5342 APInt NumActiveElems =
5343 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5344
5345 if (Overflow)
5346 return SDValue();
5347
5348 if (IsEqual) {
5349 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5350 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5351 : NumActiveElems.uadd_ov(One, Overflow);
5352 if (Overflow)
5353 return SDValue();
5354 }
5355
5356 std::optional<unsigned> PredPattern =
5358 unsigned MinSVEVectorSize = std::max(
5360 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5361 if (PredPattern != std::nullopt &&
5362 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5363 return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
5364
5365 return SDValue();
5366}
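// Illustrative example (not from the original source), assuming the
// architectural minimum SVE register size of 128 bits: for
//   llvm.aarch64.sve.whilelo(i64 0, i64 4) returning <vscale x 4 x i1>
// NumActiveElems is 4 and the element granule is 128 / 4 = 32 bits, so
// 4 <= 128 / 32 holds and the whilelo is replaced by a ptrue with the VL4
// pattern.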
5367
5368// Returns a safe bitcast between two scalable vector predicates, where
5369// any newly created lanes from a widening bitcast are defined as zero.
5370static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5371 SDLoc DL(Op);
5372 EVT InVT = Op.getValueType();
5373
5374 assert(InVT.getVectorElementType() == MVT::i1 &&
5375 VT.getVectorElementType() == MVT::i1 &&
5376 "Expected a predicate-to-predicate bitcast");
5377 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5378 InVT.isScalableVector() &&
5379 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5380 "Only expect to cast between legal scalable predicate types!");
5381
5382 // Return the operand if the cast isn't changing type,
5383 // e.g. <n x 16 x i1> -> <n x 16 x i1>
5384 if (InVT == VT)
5385 return Op;
5386
5387 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5388
5389 // We only have to zero the lanes if new lanes are being defined, e.g. when
5390 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5391 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5392 // we can return here.
5393 if (InVT.bitsGT(VT))
5394 return Reinterpret;
5395
5396 // Check if the other lanes are already known to be zeroed by
5397 // construction.
5398 if (isZeroingInactiveLanes(Op))
5399 return Reinterpret;
5400
5401 // Zero the newly introduced lanes.
5402 SDValue Mask = DAG.getConstant(1, DL, InVT);
5403 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5404 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5405}
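// Illustrative example (not from the original source): reinterpreting
// nxv16i1 as nxv2i1 only narrows the view, so the REINTERPRET_CAST alone is
// enough; going the other way, nxv2i1 -> nxv16i1 introduces new lanes, so
// the result is ANDed with a reinterpreted all-true nxv2i1 mask to force the
// new lanes to zero (unless the input is already known to zero them).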
5406
5407SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5408 SDValue Chain, SDLoc DL,
5409 EVT VT) const {
5410 SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
5411 getPointerTy(DAG.getDataLayout()));
5412 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5413 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5414 TargetLowering::CallLoweringInfo CLI(DAG);
5415 ArgListTy Args;
5416 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5418 RetTy, Callee, std::move(Args));
5419 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5420 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5421 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5422 Mask);
5423}
5424
5425// Lower an SME LDR/STR ZA intrinsic
5426// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5427// folded into the instruction
5428// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5429// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5430// and tile slice registers
5431// ldr(%tileslice, %ptr, %vecnum)
5432// ->
5433// %svl = rdsvl
5434// %ptr2 = %ptr + %svl * %vecnum
5435// %tileslice2 = %tileslice + %vecnum
5436// ldr [%tileslice2, 0], [%ptr2, 0]
5437 // Case 3: If the vecnum is an immediate out of range, then the same is done as
5438 // case 2, but the base and slice registers are advanced by the largest
5439 // multiple of 16 not exceeding the vecnum, and the remainder (0-15) is folded
5440 // into the instruction. This means that successive loads and stores that are
5441 // offset from each other can share the same base and slice register updates.
5442 // ldr(%tileslice, %ptr, 22)
5443 // ldr(%tileslice, %ptr, 23)
5444 // ->
5445 // %svl = rdsvl
5446 // %ptr2 = %ptr + %svl * 16
5447 // %tileslice2 = %tileslice + 16
5448 // ldr [%tileslice2, 6], [%ptr2, 6]
5449 // ldr [%tileslice2, 7], [%ptr2, 7]
5450// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5451// operand and the immediate can be folded into the instruction, like case 2.
5452// ldr(%tileslice, %ptr, %vecnum + 7)
5453// ldr(%tileslice, %ptr, %vecnum + 8)
5454// ->
5455// %svl = rdsvl
5456// %ptr2 = %ptr + %svl * %vecnum
5457// %tileslice2 = %tileslice + %vecnum
5458// ldr [%tileslice2, 7], [%ptr2, 7]
5459// ldr [%tileslice2, 8], [%ptr2, 8]
5460// Case 5: The vecnum being an add of an immediate out of range is also handled,
5461// in which case the same remainder logic as case 3 is used.
5462static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5463 SDLoc DL(N);
5464
5465 SDValue TileSlice = N->getOperand(2);
5466 SDValue Base = N->getOperand(3);
5467 SDValue VecNum = N->getOperand(4);
5468 int32_t ConstAddend = 0;
5469 SDValue VarAddend = VecNum;
5470
5471 // If the vnum is an add of an immediate, we can fold it into the instruction
5472 if (VecNum.getOpcode() == ISD::ADD &&
5473 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5474 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5475 VarAddend = VecNum.getOperand(0);
5476 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5477 ConstAddend = ImmNode->getSExtValue();
5478 VarAddend = SDValue();
5479 }
5480
5481 int32_t ImmAddend = ConstAddend % 16;
5482 if (int32_t C = (ConstAddend - ImmAddend)) {
5483 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5484 VarAddend = VarAddend
5485 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5486 : CVal;
5487 }
5488
5489 if (VarAddend) {
5490 // Get the vector length that will be multiplied by vnum
5491 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5492 DAG.getConstant(1, DL, MVT::i32));
5493
5494 // Multiply SVL and vnum then add it to the base
5495 SDValue Mul = DAG.getNode(
5496 ISD::MUL, DL, MVT::i64,
5497 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5498 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5499 // Just add vnum to the tileslice
5500 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5501 }
5502
5504 DL, MVT::Other,
5505 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5506 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5507}
5508
5509SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5510 SelectionDAG &DAG) const {
5511 unsigned IntNo = Op.getConstantOperandVal(1);
5512 SDLoc DL(Op);
5513 switch (IntNo) {
5514 default:
5515 return SDValue(); // Don't custom lower most intrinsics.
5516 case Intrinsic::aarch64_prefetch: {
5517 SDValue Chain = Op.getOperand(0);
5518 SDValue Addr = Op.getOperand(2);
5519
5520 unsigned IsWrite = Op.getConstantOperandVal(3);
5521 unsigned Locality = Op.getConstantOperandVal(4);
5522 unsigned IsStream = Op.getConstantOperandVal(5);
5523 unsigned IsData = Op.getConstantOperandVal(6);
5524 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5525 (!IsData << 3) | // IsDataCache bit
5526 (Locality << 1) | // Cache level bits
5527 (unsigned)IsStream; // Stream bit
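// Worked example (illustrative): a data write prefetch targeting L1 with the
// keep policy (IsWrite=1, Locality=0, IsStream=0, IsData=1) encodes as
// (1 << 4) | (0 << 3) | (0 << 1) | 0 = 16, i.e. PSTL1KEEP.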
5528
5529 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5530 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5531 }
5532 case Intrinsic::aarch64_sme_str:
5533 case Intrinsic::aarch64_sme_ldr: {
5534 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5535 }
5536 case Intrinsic::aarch64_sme_za_enable:
5537 return DAG.getNode(
5538 AArch64ISD::SMSTART, DL, MVT::Other,
5539 Op->getOperand(0), // Chain
5540 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5541 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5542 case Intrinsic::aarch64_sme_za_disable:
5543 return DAG.getNode(
5544 AArch64ISD::SMSTOP, DL, MVT::Other,
5545 Op->getOperand(0), // Chain
5546 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5547 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5548 }
5549}
5550
5551SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5552 SelectionDAG &DAG) const {
5553 unsigned IntNo = Op.getConstantOperandVal(1);
5554 SDLoc DL(Op);
5555 switch (IntNo) {
5556 default:
5557 return SDValue(); // Don't custom lower most intrinsics.
5558 case Intrinsic::aarch64_mops_memset_tag: {
5559 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
5560 SDValue Chain = Node->getChain();
5561 SDValue Dst = Op.getOperand(2);
5562 SDValue Val = Op.getOperand(3);
5563 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5564 SDValue Size = Op.getOperand(4);
5565 auto Alignment = Node->getMemOperand()->getAlign();
5566 bool IsVol = Node->isVolatile();
5567 auto DstPtrInfo = Node->getPointerInfo();
5568
5569 const auto &SDI =
5570 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5571 SDValue MS =
5572 SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
5573 Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
5574
5575 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
5576 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
5577 // LowerOperationWrapper will complain that the number of results has
5578 // changed.
5579 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
5580 }
5581 }
5582}
5583
5584SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5585 SelectionDAG &DAG) const {
5586 unsigned IntNo = Op.getConstantOperandVal(0);
5587 SDLoc dl(Op);
5588 switch (IntNo) {
5589 default: return SDValue(); // Don't custom lower most intrinsics.
5590 case Intrinsic::thread_pointer: {
5591 EVT PtrVT = getPointerTy(DAG.getDataLayout());
5592 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
5593 }
5594 case Intrinsic::aarch64_neon_abs: {
5595 EVT Ty = Op.getValueType();
5596 if (Ty == MVT::i64) {
5597 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
5598 Op.getOperand(1));
5599 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
5600 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
5601 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
5602 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
5603 } else {
5604 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
5605 }
5606 }
5607 case Intrinsic::aarch64_neon_pmull64: {
5608 SDValue LHS = Op.getOperand(1);
5609 SDValue RHS = Op.getOperand(2);
5610
5611 std::optional<uint64_t> LHSLane =
5613 std::optional<uint64_t> RHSLane =
5615
5616 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
5617 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
5618
5619 // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2
5620 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
5621 // which ISel recognizes better; for example, this generates an ldr into d*
5622 // registers rather than a GPR load followed by a fmov.
5623 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
5624 std::optional<uint64_t> OtherLane,
5625 const SDLoc &dl,
5626 SelectionDAG &DAG) -> SDValue {
5627 // If the operand is a higher half itself, rewrite it to
5628 // extract_high_v2i64; this way aarch64_neon_pmull64 can
5629 // reuse the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
5630 if (NLane && *NLane == 1)
5631 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5632 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
5633
5634 // Operand N is not a higher half but the other operand is.
5635 if (OtherLane && *OtherLane == 1) {
5636 // If this operand is a lower half, rewrite it to
5637 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
5638 // align lanes of two operands. A roundtrip sequence (to move from lane
5639 // 1 to lane 0) is like this:
5640 // mov x8, v0.d[1]
5641 // fmov d0, x8
5642 if (NLane && *NLane == 0)
5643 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5644 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
5645 N.getOperand(0),
5646 DAG.getConstant(0, dl, MVT::i64)),
5647 DAG.getConstant(1, dl, MVT::i64));
5648
5649 // Otherwise just dup from main to all lanes.
5650 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
5651 }
5652
5653 // Neither operand is an extract of the higher half, so codegen may just use
5654 // the non-high version of the PMULL instruction. Use v1i64 to represent i64.
5655 assert(N.getValueType() == MVT::i64 &&
5656 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
5657 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
5658 };
5659
5660 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
5661 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
5662
5663 return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
5664 }
5665 case Intrinsic::aarch64_neon_smax:
5666 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
5667 Op.getOperand(1), Op.getOperand(2));
5668 case Intrinsic::aarch64_neon_umax:
5669 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
5670 Op.getOperand(1), Op.getOperand(2));
5671 case Intrinsic::aarch64_neon_smin:
5672 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
5673 Op.getOperand(1), Op.getOperand(2));
5674 case Intrinsic::aarch64_neon_umin:
5675 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
5676 Op.getOperand(1), Op.getOperand(2));
5677 case Intrinsic::aarch64_neon_scalar_sqxtn:
5678 case Intrinsic::aarch64_neon_scalar_sqxtun:
5679 case Intrinsic::aarch64_neon_scalar_uqxtn: {
5680 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
5681 if (Op.getValueType() == MVT::i32)
5682 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
5683 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
5684 Op.getOperand(0),
5685 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
5686 Op.getOperand(1))));
5687 return SDValue();
5688 }
5689 case Intrinsic::aarch64_sve_whilelo:
5690 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5691 /*IsEqual=*/false);
5692 case Intrinsic::aarch64_sve_whilelt:
5693 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5694 /*IsEqual=*/false);
5695 case Intrinsic::aarch64_sve_whilels:
5696 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5697 /*IsEqual=*/true);
5698 case Intrinsic::aarch64_sve_whilele:
5699 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5700 /*IsEqual=*/true);
5701 case Intrinsic::aarch64_sve_sunpkhi:
5702 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
5703 Op.getOperand(1));
5704 case Intrinsic::aarch64_sve_sunpklo:
5705 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
5706 Op.getOperand(1));
5707 case Intrinsic::aarch64_sve_uunpkhi:
5708 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
5709 Op.getOperand(1));
5710 case Intrinsic::aarch64_sve_uunpklo:
5711 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
5712 Op.getOperand(1));
5713 case Intrinsic::aarch64_sve_clasta_n:
5714 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
5715 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5716 case Intrinsic::aarch64_sve_clastb_n:
5717 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
5718 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5719 case Intrinsic::aarch64_sve_lasta:
5720 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
5721 Op.getOperand(1), Op.getOperand(2));
5722 case Intrinsic::aarch64_sve_lastb:
5723 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
5724 Op.getOperand(1), Op.getOperand(2));
5725 case Intrinsic::aarch64_sve_rev:
5726 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
5727 Op.getOperand(1));
5728 case Intrinsic::aarch64_sve_tbl:
5729 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
5730 Op.getOperand(1), Op.getOperand(2));
5731 case Intrinsic::aarch64_sve_trn1:
5732 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
5733 Op.getOperand(1), Op.getOperand(2));
5734 case Intrinsic::aarch64_sve_trn2:
5735 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
5736 Op.getOperand(1), Op.getOperand(2));
5737 case Intrinsic::aarch64_sve_uzp1:
5738 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
5739 Op.getOperand(1), Op.getOperand(2));
5740 case Intrinsic::aarch64_sve_uzp2:
5741 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
5742 Op.getOperand(1), Op.getOperand(2));
5743 case Intrinsic::aarch64_sve_zip1:
5744 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
5745 Op.getOperand(1), Op.getOperand(2));
5746 case Intrinsic::aarch64_sve_zip2:
5747 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
5748 Op.getOperand(1), Op.getOperand(2));
5749 case Intrinsic::aarch64_sve_splice:
5750 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
5751 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5752 case Intrinsic::aarch64_sve_ptrue:
5753 return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
5754 case Intrinsic::aarch64_sve_clz:
5755 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
5756 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5757 case Intrinsic::aarch64_sme_cntsb:
5758 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5759 DAG.getConstant(1, dl, MVT::i32));
5760 case Intrinsic::aarch64_sme_cntsh: {
5761 SDValue One = DAG.getConstant(1, dl, MVT::i32);
5762 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
5763 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
5764 }
5765 case Intrinsic::aarch64_sme_cntsw: {
5766 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5767 DAG.getConstant(1, dl, MVT::i32));
5768 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5769 DAG.getConstant(2, dl, MVT::i32));
5770 }
5771 case Intrinsic::aarch64_sme_cntsd: {
5772 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5773 DAG.getConstant(1, dl, MVT::i32));
5774 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5775 DAG.getConstant(3, dl, MVT::i32));
5776 }
5777 case Intrinsic::aarch64_sve_cnt: {
5778 SDValue Data = Op.getOperand(3);
5779 // CTPOP only supports integer operands.
5780 if (Data.getValueType().isFloatingPoint())
5781 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
5782 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
5783 Op.getOperand(2), Data, Op.getOperand(1));
5784 }
5785 case Intrinsic::aarch64_sve_dupq_lane:
5786 return LowerDUPQLane(Op, DAG);
5787 case Intrinsic::aarch64_sve_convert_from_svbool:
5788 if (Op.getValueType() == MVT::aarch64svcount)
5789 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
5790 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
5791 case Intrinsic::aarch64_sve_convert_to_svbool:
5792 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
5793 return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
5794 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
5795 case Intrinsic::aarch64_sve_fneg:
5796 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5797 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5798 case Intrinsic::aarch64_sve_frintp:
5799 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
5800 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5801 case Intrinsic::aarch64_sve_frintm:
5802 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
5803 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5804 case Intrinsic::aarch64_sve_frinti:
5805 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5806 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5807 case Intrinsic::aarch64_sve_frintx:
5808 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5809 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5810 case Intrinsic::aarch64_sve_frinta:
5811 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
5812 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5813 case Intrinsic::aarch64_sve_frintn:
5814 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
5815 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5816 case Intrinsic::aarch64_sve_frintz:
5817 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
5818 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5819 case Intrinsic::aarch64_sve_ucvtf:
5820 return DAG.getNode(AArch64ISD::UCVTF_MERGE_PASSTHRU, dl,
5821 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5822 Op.getOperand(1));
5823 case Intrinsic::aarch64_sve_scvtf:
5824 return DAG.getNode(AArch64ISD::SCVTF_MERGE_PASSTHRU, dl,
5825 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5826 Op.getOperand(1));
5827 case Intrinsic::aarch64_sve_fcvtzu:
5828 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
5829 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5830 Op.getOperand(1));
5831 case Intrinsic::aarch64_sve_fcvtzs:
5832 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
5833 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5834 Op.getOperand(1));
5835 case Intrinsic::aarch64_sve_fsqrt:
5836 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
5837 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5838 case Intrinsic::aarch64_sve_frecpx:
5839 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
5840 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5841 case Intrinsic::aarch64_sve_frecpe_x:
5842 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
5843 Op.getOperand(1));
5844 case Intrinsic::aarch64_sve_frecps_x:
5845 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
5846 Op.getOperand(1), Op.getOperand(2));
5847 case Intrinsic::aarch64_sve_frsqrte_x:
5848 return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
5849 Op.getOperand(1));
5850 case Intrinsic::aarch64_sve_frsqrts_x:
5851 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
5852 Op.getOperand(1), Op.getOperand(2));
5853 case Intrinsic::aarch64_sve_fabs:
5854 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5855 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5856 case Intrinsic::aarch64_sve_abs:
5857 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5858 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5859 case Intrinsic::aarch64_sve_neg:
5860 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5861 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5862 case Intrinsic::aarch64_sve_insr: {
5863 SDValue Scalar = Op.getOperand(2);
5864 EVT ScalarTy = Scalar.getValueType();
5865 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
5866 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
5867
5868 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
5869 Op.getOperand(1), Scalar);
5870 }
5871 case Intrinsic::aarch64_sve_rbit:
5872 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
5873 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5874 Op.getOperand(1));
5875 case Intrinsic::aarch64_sve_revb:
5876 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
5877 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5878 case Intrinsic::aarch64_sve_revh:
5879 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
5880 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5881 case Intrinsic::aarch64_sve_revw:
5882 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
5883 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5884 case Intrinsic::aarch64_sve_revd:
5885 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
5886 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5887 case Intrinsic::aarch64_sve_sxtb:
5888 return DAG.getNode(
5889 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5890 Op.getOperand(2), Op.getOperand(3),
5891 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5892 Op.getOperand(1));
5893 case Intrinsic::aarch64_sve_sxth:
5894 return DAG.getNode(
5895 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5896 Op.getOperand(2), Op.getOperand(3),
5897 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5898 Op.getOperand(1));
5899 case Intrinsic::aarch64_sve_sxtw:
5900 return DAG.getNode(
5901 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5902 Op.getOperand(2), Op.getOperand(3),
5903 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5904 Op.getOperand(1));
5905 case Intrinsic::aarch64_sve_uxtb:
5906 return DAG.getNode(
5907 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5908 Op.getOperand(2), Op.getOperand(3),
5909 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5910 Op.getOperand(1));
5911 case Intrinsic::aarch64_sve_uxth:
5912 return DAG.getNode(
5913 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5914 Op.getOperand(2), Op.getOperand(3),
5915 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5916 Op.getOperand(1));
5917 case Intrinsic::aarch64_sve_uxtw:
5918 return DAG.getNode(
5919 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5920 Op.getOperand(2), Op.getOperand(3),
5921 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5922 Op.getOperand(1));
5923 case Intrinsic::localaddress: {
5924 const auto &MF = DAG.getMachineFunction();
5925 const auto *RegInfo = Subtarget->getRegisterInfo();
5926 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
5927 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
5928 Op.getSimpleValueType());
5929 }
5930
5931 case Intrinsic::eh_recoverfp: {
5932 // FIXME: This needs to be implemented to correctly handle highly aligned
5933 // stack objects. For now we simply return the incoming FP. Refer D53541
5934 // for more details.
5935 SDValue FnOp = Op.getOperand(1);
5936 SDValue IncomingFPOp = Op.getOperand(2);
5937 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
5938 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
5939 if (!Fn)
5941 "llvm.eh.recoverfp must take a function as the first argument");
5942 return IncomingFPOp;
5943 }
5944
5945 case Intrinsic::aarch64_neon_vsri:
5946 case Intrinsic::aarch64_neon_vsli:
5947 case Intrinsic::aarch64_sve_sri:
5948 case Intrinsic::aarch64_sve_sli: {
5949 EVT Ty = Op.getValueType();
5950
5951 if (!Ty.isVector())
5952 report_fatal_error("Unexpected type for aarch64_neon_vsli");
5953
5954 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
5955
5956 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
5957 IntNo == Intrinsic::aarch64_sve_sri;
5958 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
5959 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
5960 Op.getOperand(3));
5961 }
5962
5963 case Intrinsic::aarch64_neon_srhadd:
5964 case Intrinsic::aarch64_neon_urhadd:
5965 case Intrinsic::aarch64_neon_shadd:
5966 case Intrinsic::aarch64_neon_uhadd: {
5967 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5968 IntNo == Intrinsic::aarch64_neon_shadd);
5969 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5970 IntNo == Intrinsic::aarch64_neon_urhadd);
5971 unsigned Opcode = IsSignedAdd
5972 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
5973 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
5974 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5975 Op.getOperand(2));
5976 }
5977 case Intrinsic::aarch64_neon_saddlp:
5978 case Intrinsic::aarch64_neon_uaddlp: {
5979 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
5980 ? AArch64ISD::UADDLP
5981 : AArch64ISD::SADDLP;
5982 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
5983 }
5984 case Intrinsic::aarch64_neon_sdot:
5985 case Intrinsic::aarch64_neon_udot:
5986 case Intrinsic::aarch64_sve_sdot:
5987 case Intrinsic::aarch64_sve_udot: {
5988 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
5989 IntNo == Intrinsic::aarch64_sve_udot)
5990 ? AArch64ISD::UDOT
5991 : AArch64ISD::SDOT;
5992 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5993 Op.getOperand(2), Op.getOperand(3));
5994 }
5995 case Intrinsic::get_active_lane_mask: {
5996 SDValue ID =
5997 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
5998
5999 EVT VT = Op.getValueType();
6000 if (VT.isScalableVector())
6001 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
6002 Op.getOperand(2));
6003
6004 // We can use the SVE whilelo instruction to lower this intrinsic by
6005 // creating the appropriate sequence of scalable vector operations and
6006 // then extracting a fixed-width subvector from the scalable vector.
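// A sketch of the resulting sequence: an SVE whilelo produces the predicate,
// which is sign-extended into an integer container vector, and the leading
// fixed-width subvector of that container becomes the result.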
6007
6008 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
6009 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
6010
6011 SDValue Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WhileVT, ID,
6012 Op.getOperand(1), Op.getOperand(2));
6013 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, dl, ContainerVT, Mask);
6014 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
6015 DAG.getVectorIdxConstant(0, dl));
6016 }
6017 case Intrinsic::aarch64_neon_uaddlv: {
6018 EVT OpVT = Op.getOperand(1).getValueType();
6019 EVT ResVT = Op.getValueType();
6020 if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6021 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
6022 // In order to avoid an insert_subvector, use v4i32 rather than v2i32.
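// A sketch of the intent: the v4i32 UADDLV node always holds the scalar sum
// in lane 0, so the i32 result is just an extract_vector_elt of lane 0.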
6023 SDValue UADDLV =
6024 DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
6025 SDValue EXTRACT_VEC_ELT =
6026 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
6027 DAG.getConstant(0, dl, MVT::i64));
6028 return EXTRACT_VEC_ELT;
6029 }
6030 return SDValue();
6031 }
6032 case Intrinsic::experimental_cttz_elts: {
6033 SDValue CttzOp = Op.getOperand(1);
6034 EVT VT = CttzOp.getValueType();
6035 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6036
6037 if (VT.isFixedLengthVector()) {
6038 // We can use SVE instructions to lower this intrinsic by first creating
6039 // an SVE predicate register mask from the fixed-width vector.
6040 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
6041 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, NewVT, CttzOp);
6042 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6043 }
6044
6045 SDValue NewCttzElts =
6046 DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, CttzOp);
6047 return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
6048 }
6049 }
6050}
6051
6052bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6053 if (VT.getVectorElementType() == MVT::i8 ||
6054 VT.getVectorElementType() == MVT::i16) {
6055 EltTy = MVT::i32;
6056 return true;
6057 }
6058 return false;
6059}
6060
6061bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6062 EVT DataVT) const {
6063 const EVT IndexVT = Extend.getOperand(0).getValueType();
6064 // SVE only supports implicit extension of 32-bit indices.
6065 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6066 return false;
6067
6068 // Indices cannot be smaller than the main data type.
6069 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6070 return false;
6071
6072 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6073 // element container type, which would violate the previous clause.
6074 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6075}
6076
6077bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6078 EVT ExtVT = ExtVal.getValueType();
6079 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6080 return false;
6081
6082 // It may be worth creating extending masked loads if there are multiple
6083 // masked loads using the same predicate. That way we'll end up creating
6084 // extending masked loads that may then get split by the legaliser. This
6085 // results in just one set of predicate unpacks at the start, instead of
6086 // multiple sets of vector unpacks after each load.
6087 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6088 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6089 // Disable extending masked loads for fixed-width for now, since the code
6090 // quality doesn't look great.
6091 if (!ExtVT.isScalableVector())
6092 return false;
6093
6094 unsigned NumExtMaskedLoads = 0;
6095 for (auto *U : Ld->getMask()->uses())
6096 if (isa<MaskedLoadSDNode>(U))
6097 NumExtMaskedLoads++;
6098
6099 if (NumExtMaskedLoads <= 1)
6100 return false;
6101 }
6102 }
6103
6104 return true;
6105}
6106
6107unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6108 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6109 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
6110 AArch64ISD::GLD1_MERGE_ZERO},
6111 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
6112 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6113 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
6114 AArch64ISD::GLD1_MERGE_ZERO},
6115 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
6116 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6117 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
6118 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6119 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
6120 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6121 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
6122 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6123 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
6124 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6125 };
6126 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
6127 return AddrModes.find(Key)->second;
6128}
6129
6130unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6131 switch (Opcode) {
6132 default:
6133 llvm_unreachable("unimplemented opcode");
6134 return Opcode;
6135 case AArch64ISD::GLD1_MERGE_ZERO:
6136 return AArch64ISD::GLD1S_MERGE_ZERO;
6137 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6138 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6139 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6140 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6141 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6142 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6143 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6144 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6145 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6146 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6147 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6148 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6149 }
6150}
6151
6152SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6153 SelectionDAG &DAG) const {
6154 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
6155
6156 SDLoc DL(Op);
6157 SDValue Chain = MGT->getChain();
6158 SDValue PassThru = MGT->getPassThru();
6159 SDValue Mask = MGT->getMask();
6160 SDValue BasePtr = MGT->getBasePtr();
6161 SDValue Index = MGT->getIndex();
6162 SDValue Scale = MGT->getScale();
6163 EVT VT = Op.getValueType();
6164 EVT MemVT = MGT->getMemoryVT();
6165 ISD::LoadExtType ExtType = MGT->getExtensionType();
6166 ISD::MemIndexType IndexType = MGT->getIndexType();
6167
6168 // SVE supports zero (and so undef) passthrough values only; everything else
6169 // must be handled manually by an explicit select on the load's output.
6170 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
6171 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6172 SDValue Load =
6173 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6174 MGT->getMemOperand(), IndexType, ExtType);
6175 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6176 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
6177 }
6178
6179 bool IsScaled = MGT->isIndexScaled();
6180 bool IsSigned = MGT->isIndexSigned();
6181
6182 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6183 // must be calculated beforehand.
6184 uint64_t ScaleVal = Scale->getAsZExtVal();
6185 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6186 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6187 EVT IndexVT = Index.getValueType();
6188 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6189 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6190 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
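// E.g. a gather of i16 elements whose index operand is pre-scaled by 4
// becomes (index << 2) with Scale == 1, since SVE can only scale by
// sizeof(i16) == 2 here.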
6191
6192 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6193 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6194 MGT->getMemOperand(), IndexType, ExtType);
6195 }
6196
6197 // Lower fixed length gather to a scalable equivalent.
6198 if (VT.isFixedLengthVector()) {
6199 assert(Subtarget->useSVEForFixedLengthVectors() &&
6200 "Cannot lower when not using SVE for fixed vectors!");
6201
6202 // NOTE: Handle floating-point as if integer then bitcast the result.
6203 EVT DataVT = VT.changeVectorElementTypeToInteger();
6204 MemVT = MemVT.changeVectorElementTypeToInteger();
6205
6206 // Find the smallest integer fixed length vector we can use for the gather.
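// E.g. a v8i8 gather result is promoted to v8i32 (or v8i64 when any of the
// data, index or mask types already uses 64-bit elements) before the
// operands are wrapped into scalable containers below.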
6207 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6208 if (DataVT.getVectorElementType() == MVT::i64 ||
6209 Index.getValueType().getVectorElementType() == MVT::i64 ||
6210 Mask.getValueType().getVectorElementType() == MVT::i64)
6211 PromotedVT = VT.changeVectorElementType(MVT::i64);
6212
6213 // Promote vector operands except for passthrough, which we know is either
6214 // undef or zero, and thus best constructed directly.
6215 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6216 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6217 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6218
6219 // A promoted result type forces the need for an extending load.
6220 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6221 ExtType = ISD::EXTLOAD;
6222
6223 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6224
6225 // Convert fixed length vector operands to scalable.
6226 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6227 Index = convertToScalableVector(DAG, ContainerVT, Index);
6228 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6229 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
6230 : DAG.getConstant(0, DL, ContainerVT);
6231
6232 // Emit equivalent scalable vector gather.
6233 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6234 SDValue Load =
6235 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
6236 Ops, MGT->getMemOperand(), IndexType, ExtType);
6237
6238 // Extract fixed length data then convert to the required result type.
6239 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
6240 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
6241 if (VT.isFloatingPoint())
6242 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
6243
6244 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6245 }
6246
6247 // Everything else is legal.
6248 return Op;
6249}
6250
6251SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6252 SelectionDAG &DAG) const {
6253 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
6254
6255 SDLoc DL(Op);
6256 SDValue Chain = MSC->getChain();
6257 SDValue StoreVal = MSC->getValue();
6258 SDValue Mask = MSC->getMask();
6259 SDValue BasePtr = MSC->getBasePtr();
6260 SDValue Index = MSC->getIndex();
6261 SDValue Scale = MSC->getScale();
6262 EVT VT = StoreVal.getValueType();
6263 EVT MemVT = MSC->getMemoryVT();
6264 ISD::MemIndexType IndexType = MSC->getIndexType();
6265 bool Truncating = MSC->isTruncatingStore();
6266
6267 bool IsScaled = MSC->isIndexScaled();
6268 bool IsSigned = MSC->isIndexSigned();
6269
6270 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6271 // must be calculated beforehand.
6272 uint64_t ScaleVal = Scale->getAsZExtVal();
6273 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6274 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6275 EVT IndexVT = Index.getValueType();
6276 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6277 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6278 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6279
6280 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6281 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6282 MSC->getMemOperand(), IndexType, Truncating);
6283 }
6284
6285 // Lower fixed length scatter to a scalable equivalent.
6286 if (VT.isFixedLengthVector()) {
6287 assert(Subtarget->useSVEForFixedLengthVectors() &&
6288 "Cannot lower when not using SVE for fixed vectors!");
6289
6290 // Once bitcast we treat floating-point scatters as if integer.
6291 if (VT.isFloatingPoint()) {
6292 VT = VT.changeVectorElementTypeToInteger();
6293 MemVT = MemVT.changeVectorElementTypeToInteger();
6294 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6295 }
6296
6297 // Find the smallest integer fixed length vector we can use for the scatter.
6298 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6299 if (VT.getVectorElementType() == MVT::i64 ||
6300 Index.getValueType().getVectorElementType() == MVT::i64 ||
6301 Mask.getValueType().getVectorElementType() == MVT::i64)
6302 PromotedVT = VT.changeVectorElementType(MVT::i64);
6303
6304 // Promote vector operands.
6305 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6306 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6307 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6308 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6309
6310 // A promoted value type forces the need for a truncating store.
6311 if (PromotedVT != VT)
6312 Truncating = true;
6313
6314 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6315
6316 // Convert fixed length vector operands to scalable.
6317 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6318 Index = convertToScalableVector(DAG, ContainerVT, Index);
6319 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6320 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6321
6322 // Emit equivalent scalable vector scatter.
6323 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6324 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6325 MSC->getMemOperand(), IndexType, Truncating);
6326 }
6327
6328 // Everything else is legal.
6329 return Op;
6330}
6331
6332SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6333 SDLoc DL(Op);
6334 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6335 assert(LoadNode && "Expected custom lowering of a masked load node");
6336 EVT VT = Op->getValueType(0);
6337
6338 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6339 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6340
6341 SDValue PassThru = LoadNode->getPassThru();
6342 SDValue Mask = LoadNode->getMask();
6343
6344 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
6345 return Op;
6346
6347 SDValue Load = DAG.getMaskedLoad(
6348 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
6349 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
6350 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
6351 LoadNode->getExtensionType());
6352
6353 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6354
6355 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6356}
6357
6358// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6359 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
6360 EVT VT, EVT MemVT,
6361 SelectionDAG &DAG) {
6362 assert(VT.isVector() && "VT should be a vector type");
6363 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6364
6365 SDValue Value = ST->getValue();
6366
6367 // First extend the promoted v4i16 to v8i16, truncate it to v8i8, and extract
6368 // the word lane which represents the v4i8 subvector. This optimizes the store
6369 // to:
6370 //
6371 // xtn v0.8b, v0.8h
6372 // str s0, [x0]
6373
6374 SDValue Undef = DAG.getUNDEF(MVT::i16);
6375 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6376 {Undef, Undef, Undef, Undef});
6377
6378 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6379 Value, UndefVec);
6380 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
6381
6382 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6383 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6384 Trunc, DAG.getConstant(0, DL, MVT::i64));
6385
6386 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
6387 ST->getBasePtr(), ST->getMemOperand());
6388}
6389
6390 // Custom lowering for any store, vector or scalar, with or without a
6391 // truncating operation. Currently we only custom lower truncating stores
6392 // from vector v4i16 to v4i8 and volatile stores of i128.
6393SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6394 SelectionDAG &DAG) const {
6395 SDLoc Dl(Op);
6396 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
6397 assert (StoreNode && "Can only custom lower store nodes");
6398
6399 SDValue Value = StoreNode->getValue();
6400
6401 EVT VT = Value.getValueType();
6402 EVT MemVT = StoreNode->getMemoryVT();
6403
6404 if (VT.isVector()) {
6405 if (useSVEForFixedLengthVectorVT(
6406 VT,
6407 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6408 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6409
6410 unsigned AS = StoreNode->getAddressSpace();
6411 Align Alignment = StoreNode->getAlign();
6412 if (Alignment < MemVT.getStoreSize() &&
6413 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
6414 StoreNode->getMemOperand()->getFlags(),
6415 nullptr)) {
6416 return scalarizeVectorStore(StoreNode, DAG);
6417 }
6418
6419 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6420 MemVT == MVT::v4i8) {
6421 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
6422 }
6423 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
6424 // the custom lowering, as there are no un-paired non-temporal stores and
6425 // legalization will break up 256 bit inputs.
6426 ElementCount EC = MemVT.getVectorElementCount();
6427 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6428 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6429 (MemVT.getScalarSizeInBits() == 8u ||
6430 MemVT.getScalarSizeInBits() == 16u ||
6431 MemVT.getScalarSizeInBits() == 32u ||
6432 MemVT.getScalarSizeInBits() == 64u)) {
6433 SDValue Lo =
6434 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6435 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6436 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
6437 SDValue Hi =
6438 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6439 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6440 StoreNode->getValue(),
6441 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
6442 SDValue Result = DAG.getMemIntrinsicNode(
6443 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
6444 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6445 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6446 return Result;
6447 }
6448 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6449 return LowerStore128(Op, DAG);
6450 } else if (MemVT == MVT::i64x8) {
6451 SDValue Value = StoreNode->getValue();
6452 assert(Value->getValueType(0) == MVT::i64x8);
6453 SDValue Chain = StoreNode->getChain();
6454 SDValue Base = StoreNode->getBasePtr();
6455 EVT PtrVT = Base.getValueType();
6456 for (unsigned i = 0; i < 8; i++) {
6457 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6458 Value, DAG.getConstant(i, Dl, MVT::i32));
6459 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
6460 DAG.getConstant(i * 8, Dl, PtrVT));
6461 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
6462 StoreNode->getOriginalAlign());
6463 }
6464 return Chain;
6465 }
6466
6467 return SDValue();
6468}
6469
6470/// Lower atomic or volatile 128-bit stores to a single STP instruction.
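// Roughly: a volatile/atomic i128 store becomes "stp xLo, xHi, [addr]", or
// "stilp xLo, xHi, [addr]" for a release atomic store, with the two halves
// swapped on big-endian targets.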
6471SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6472 SelectionDAG &DAG) const {
6473 MemSDNode *StoreNode = cast<MemSDNode>(Op);
6474 assert(StoreNode->getMemoryVT() == MVT::i128);
6475 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6476
6477 bool IsStoreRelease =
6478 StoreNode->getMergedOrdering() == AtomicOrdering::Release;
6479 if (StoreNode->isAtomic())
6480 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6481 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6482 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
6483 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
6484
6485 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6486 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6487 ? StoreNode->getOperand(1)
6488 : StoreNode->getOperand(2);
6489 SDLoc DL(Op);
6490 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
6491 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
6492 if (DAG.getDataLayout().isBigEndian())
6493 std::swap(StoreValue.first, StoreValue.second);
6494 SDValue Result = DAG.getMemIntrinsicNode(
6495 Opcode, DL, DAG.getVTList(MVT::Other),
6496 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6497 StoreNode->getBasePtr()},
6498 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6499 return Result;
6500}
6501
6502SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6503 SelectionDAG &DAG) const {
6504 SDLoc DL(Op);
6505 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
6506 assert(LoadNode && "Expected custom lowering of a load node");
6507
6508 if (LoadNode->getMemoryVT() == MVT::i64x8) {
6509 SmallVector<SDValue, 8> Ops;
6510 SDValue Base = LoadNode->getBasePtr();
6511 SDValue Chain = LoadNode->getChain();
6512 EVT PtrVT = Base.getValueType();
6513 for (unsigned i = 0; i < 8; i++) {
6514 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
6515 DAG.getConstant(i * 8, DL, PtrVT));
6516 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
6517 LoadNode->getPointerInfo(),
6518 LoadNode->getOriginalAlign());
6519 Ops.push_back(Part);
6520 Chain = SDValue(Part.getNode(), 1);
6521 }
6522 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
6523 return DAG.getMergeValues({Loaded, Chain}, DL);
6524 }
6525
6526 // Custom lowering for extending v4i8 vector loads.
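// A sketch of the sequence built below (for a zero- or any-extending load):
// load the 4 bytes as an f32, view them as the low half of a v8i8, extend to
// v8i16 and keep the low v4i16 (extending once more for a v4i32 result).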
6527 EVT VT = Op->getValueType(0);
6528 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
6529
6530 if (LoadNode->getMemoryVT() != MVT::v4i8)
6531 return SDValue();
6532
6533 // Avoid generating unaligned loads.
6534 if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
6535 return SDValue();
6536
6537 unsigned ExtType;
6538 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
6539 ExtType = ISD::SIGN_EXTEND;
6540 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
6541 LoadNode->getExtensionType() == ISD::EXTLOAD)
6542 ExtType = ISD::ZERO_EXTEND;
6543 else
6544 return SDValue();
6545
6546 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
6547 LoadNode->getBasePtr(), MachinePointerInfo());
6548 SDValue Chain = Load.getValue(1);
6549 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
6550 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
6551 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
6552 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
6553 DAG.getConstant(0, DL, MVT::i64));
6554 if (VT == MVT::v4i32)
6555 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
6556 return DAG.getMergeValues({Ext, Chain}, DL);
6557}
6558
6559// Generate SUBS and CSEL for integer abs.
6560SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
6561 MVT VT = Op.getSimpleValueType();
6562
6563 if (VT.isVector())
6564 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
6565
6566 SDLoc DL(Op);
6567 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
6568 Op.getOperand(0));
6569 // Generate SUBS & CSEL.
6570 SDValue Cmp =
6571 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
6572 Op.getOperand(0), DAG.getConstant(0, DL, VT));
6573 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
6574 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
6575 Cmp.getValue(1));
6576}
6577
6578 static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
6579 SDValue Chain = Op.getOperand(0);
6580 SDValue Cond = Op.getOperand(1);
6581 SDValue Dest = Op.getOperand(2);
6582
6583 AArch64CC::CondCode CC;
6584 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
6585 SDLoc dl(Op);
6586 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
6587 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6588 Cmp);
6589 }
6590
6591 return SDValue();
6592}
6593
6594 // Treat FSHR with constant shifts as a legal operation; otherwise it is
6595 // expanded. FSHL is converted to FSHR before deciding what to do with it.
6596 static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
6597 SDValue Shifts = Op.getOperand(2);
6598 // Check if the shift amount is a constant
6599 // If opcode is FSHL, convert it to FSHR
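// Rationale: for a constant shift c with 0 < c < bitwidth,
// fshl(a, b, c) == fshr(a, b, bitwidth - c), so only FSHR needs handling.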
6600 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
6601 SDLoc DL(Op);
6602 MVT VT = Op.getSimpleValueType();
6603
6604 if (Op.getOpcode() == ISD::FSHL) {
6605 unsigned int NewShiftNo =
6606 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
6607 return DAG.getNode(
6608 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
6609 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
6610 } else if (Op.getOpcode() == ISD::FSHR) {
6611 return Op;
6612 }
6613 }
6614
6615 return SDValue();
6616}
6617
6618 static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
6619 SDValue X = Op.getOperand(0);
6620 EVT XScalarTy = X.getValueType();
6621 SDValue Exp = Op.getOperand(1);
6622
6623 SDLoc DL(Op);
6624 EVT XVT, ExpVT;
6625 switch (Op.getSimpleValueType().SimpleTy) {
6626 default:
6627 return SDValue();
6628 case MVT::bf16:
6629 case MVT::f16:
6630 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
6631 [[fallthrough]];
6632 case MVT::f32:
6633 XVT = MVT::nxv4f32;
6634 ExpVT = MVT::nxv4i32;
6635 break;
6636 case MVT::f64:
6637 XVT = MVT::nxv2f64;
6638 ExpVT = MVT::nxv2i64;
6639 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
6640 break;
6641 }
6642
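// The scalar ldexp is implemented by moving X and Exp into lane 0 of SVE
// vectors, applying the sve.fscale intrinsic (X * 2^Exp) under an all-true
// predicate, and extracting lane 0 of the result.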
6643 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
6644 SDValue VX =
6645 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
6646 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
6647 DAG.getUNDEF(ExpVT), Exp, Zero);
6648 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
6649 AArch64SVEPredPattern::all);
6650 SDValue FScale =
6651 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
6652 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
6653 VPg, VX, VExp);
6654 SDValue Final =
6655 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
6656 if (X.getValueType() != XScalarTy)
6657 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
6658 DAG.getIntPtrConstant(1, SDLoc(Op)));
6659 return Final;
6660}
6661
6663 SelectionDAG &DAG) const {
6664 LLVM_DEBUG(dbgs() << "Custom lowering: ");
6665 LLVM_DEBUG(Op.dump());
6666
6667 switch (Op.getOpcode()) {
6668 default:
6669 llvm_unreachable("unimplemented operand");
6670 return SDValue();
6671 case ISD::BITCAST:
6672 return LowerBITCAST(Op, DAG);
6673 case ISD::GlobalAddress:
6674 return LowerGlobalAddress(Op, DAG);
6675 case ISD::GlobalTLSAddress:
6676 return LowerGlobalTLSAddress(Op, DAG);
6677 case ISD::PtrAuthGlobalAddress:
6678 return LowerPtrAuthGlobalAddress(Op, DAG);
6679 case ISD::SETCC:
6680 case ISD::STRICT_FSETCC:
6681 case ISD::STRICT_FSETCCS:
6682 return LowerSETCC(Op, DAG);
6683 case ISD::SETCCCARRY:
6684 return LowerSETCCCARRY(Op, DAG);
6685 case ISD::BRCOND:
6686 return LowerBRCOND(Op, DAG);
6687 case ISD::BR_CC:
6688 return LowerBR_CC(Op, DAG);
6689 case ISD::SELECT:
6690 return LowerSELECT(Op, DAG);
6691 case ISD::SELECT_CC:
6692 return LowerSELECT_CC(Op, DAG);
6693 case ISD::JumpTable:
6694 return LowerJumpTable(Op, DAG);
6695 case ISD::BR_JT:
6696 return LowerBR_JT(Op, DAG);
6697 case ISD::ConstantPool:
6698 return LowerConstantPool(Op, DAG);
6699 case ISD::BlockAddress:
6700 return LowerBlockAddress(Op, DAG);
6701 case ISD::VASTART:
6702 return LowerVASTART(Op, DAG);
6703 case ISD::VACOPY:
6704 return LowerVACOPY(Op, DAG);
6705 case ISD::VAARG:
6706 return LowerVAARG(Op, DAG);
6707 case ISD::UADDO_CARRY:
6708 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
6709 case ISD::USUBO_CARRY:
6710 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
6711 case ISD::SADDO_CARRY:
6712 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
6713 case ISD::SSUBO_CARRY:
6714 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
6715 case ISD::SADDO:
6716 case ISD::UADDO:
6717 case ISD::SSUBO:
6718 case ISD::USUBO:
6719 case ISD::SMULO:
6720 case ISD::UMULO:
6721 return LowerXALUO(Op, DAG);
6722 case ISD::FADD:
6723 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
6724 case ISD::FSUB:
6725 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
6726 case ISD::FMUL:
6727 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
6728 case ISD::FMA:
6729 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
6730 case ISD::FDIV:
6731 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
6732 case ISD::FNEG:
6733 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
6734 case ISD::FCEIL:
6735 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
6736 case ISD::FFLOOR:
6737 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
6738 case ISD::FNEARBYINT:
6739 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
6740 case ISD::FRINT:
6741 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
6742 case ISD::FROUND:
6743 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
6744 case ISD::FROUNDEVEN:
6745 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
6746 case ISD::FTRUNC:
6747 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
6748 case ISD::FSQRT:
6749 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
6750 case ISD::FABS:
6751 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
6752 case ISD::FP_ROUND:
6753 case ISD::STRICT_FP_ROUND:
6754 return LowerFP_ROUND(Op, DAG);
6755 case ISD::FP_EXTEND:
6756 return LowerFP_EXTEND(Op, DAG);
6757 case ISD::FRAMEADDR:
6758 return LowerFRAMEADDR(Op, DAG);
6759 case ISD::SPONENTRY:
6760 return LowerSPONENTRY(Op, DAG);
6761 case ISD::RETURNADDR:
6762 return LowerRETURNADDR(Op, DAG);
6763 case ISD::ADDROFRETURNADDR:
6764 return LowerADDROFRETURNADDR(Op, DAG);
6765 case ISD::CONCAT_VECTORS:
6766 return LowerCONCAT_VECTORS(Op, DAG);
6767 case ISD::INSERT_VECTOR_ELT:
6768 return LowerINSERT_VECTOR_ELT(Op, DAG);
6769 case ISD::EXTRACT_VECTOR_ELT:
6770 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6771 case ISD::BUILD_VECTOR:
6772 return LowerBUILD_VECTOR(Op, DAG);
6773 case ISD::ZERO_EXTEND_VECTOR_INREG:
6774 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
6775 case ISD::VECTOR_SHUFFLE:
6776 return LowerVECTOR_SHUFFLE(Op, DAG);
6777 case ISD::SPLAT_VECTOR:
6778 return LowerSPLAT_VECTOR(Op, DAG);
6779 case ISD::EXTRACT_SUBVECTOR:
6780 return LowerEXTRACT_SUBVECTOR(Op, DAG);
6781 case ISD::INSERT_SUBVECTOR:
6782 return LowerINSERT_SUBVECTOR(Op, DAG);
6783 case ISD::SDIV:
6784 case ISD::UDIV:
6785 return LowerDIV(Op, DAG);
6786 case ISD::SMIN:
6787 case ISD::UMIN:
6788 case ISD::SMAX:
6789 case ISD::UMAX:
6790 return LowerMinMax(Op, DAG);
6791 case ISD::SRA:
6792 case ISD::SRL:
6793 case ISD::SHL:
6794 return LowerVectorSRA_SRL_SHL(Op, DAG);
6795 case ISD::SHL_PARTS:
6796 case ISD::SRL_PARTS:
6797 case ISD::SRA_PARTS:
6798 return LowerShiftParts(Op, DAG);
6799 case ISD::CTPOP:
6800 case ISD::PARITY:
6801 return LowerCTPOP_PARITY(Op, DAG);
6802 case ISD::FCOPYSIGN:
6803 return LowerFCOPYSIGN(Op, DAG);
6804 case ISD::OR:
6805 return LowerVectorOR(Op, DAG);
6806 case ISD::XOR:
6807 return LowerXOR(Op, DAG);
6808 case ISD::PREFETCH:
6809 return LowerPREFETCH(Op, DAG);
6810 case ISD::SINT_TO_FP:
6811 case ISD::UINT_TO_FP:
6812 case ISD::STRICT_SINT_TO_FP:
6813 case ISD::STRICT_UINT_TO_FP:
6814 return LowerINT_TO_FP(Op, DAG);
6815 case ISD::FP_TO_SINT:
6816 case ISD::FP_TO_UINT:
6817 case ISD::STRICT_FP_TO_SINT:
6818 case ISD::STRICT_FP_TO_UINT:
6819 return LowerFP_TO_INT(Op, DAG);
6820 case ISD::FP_TO_SINT_SAT:
6821 case ISD::FP_TO_UINT_SAT:
6822 return LowerFP_TO_INT_SAT(Op, DAG);
6823 case ISD::FSINCOS:
6824 return LowerFSINCOS(Op, DAG);
6825 case ISD::GET_ROUNDING:
6826 return LowerGET_ROUNDING(Op, DAG);
6827 case ISD::SET_ROUNDING:
6828 return LowerSET_ROUNDING(Op, DAG);
6829 case ISD::GET_FPMODE:
6830 return LowerGET_FPMODE(Op, DAG);
6831 case ISD::SET_FPMODE:
6832 return LowerSET_FPMODE(Op, DAG);
6833 case ISD::RESET_FPMODE:
6834 return LowerRESET_FPMODE(Op, DAG);
6835 case ISD::MUL:
6836 return LowerMUL(Op, DAG);
6837 case ISD::MULHS:
6838 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
6839 case ISD::MULHU:
6840 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
6841 case ISD::INTRINSIC_W_CHAIN:
6842 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6843 case ISD::INTRINSIC_WO_CHAIN:
6844 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6845 case ISD::INTRINSIC_VOID:
6846 return LowerINTRINSIC_VOID(Op, DAG);
6847 case ISD::ATOMIC_STORE:
6848 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
6849 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
6850 return LowerStore128(Op, DAG);
6851 }
6852 return SDValue();
6853 case ISD::STORE:
6854 return LowerSTORE(Op, DAG);
6855 case ISD::MSTORE:
6856 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
6857 case ISD::MGATHER:
6858 return LowerMGATHER(Op, DAG);
6859 case ISD::MSCATTER:
6860 return LowerMSCATTER(Op, DAG);
6861 case ISD::VECREDUCE_SEQ_FADD:
6862 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
6863 case ISD::VECREDUCE_ADD:
6864 case ISD::VECREDUCE_AND:
6865 case ISD::VECREDUCE_OR:
6866 case ISD::VECREDUCE_XOR:
6867 case ISD::VECREDUCE_SMAX:
6868 case ISD::VECREDUCE_SMIN:
6869 case ISD::VECREDUCE_UMAX:
6870 case ISD::VECREDUCE_UMIN:
6871 case ISD::VECREDUCE_FADD:
6872 case ISD::VECREDUCE_FMAX:
6873 case ISD::VECREDUCE_FMIN:
6874 case ISD::VECREDUCE_FMAXIMUM:
6875 case ISD::VECREDUCE_FMINIMUM:
6876 return LowerVECREDUCE(Op, DAG);
6877 case ISD::ATOMIC_LOAD_AND:
6878 return LowerATOMIC_LOAD_AND(Op, DAG);
6879 case ISD::DYNAMIC_STACKALLOC:
6880 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6881 case ISD::VSCALE:
6882 return LowerVSCALE(Op, DAG);
6883 case ISD::ANY_EXTEND:
6884 case ISD::SIGN_EXTEND:
6885 case ISD::ZERO_EXTEND:
6886 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
6887 case ISD::SIGN_EXTEND_INREG: {
6888 // Only custom lower when ExtraVT has a legal byte based element type.
6889 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
6890 EVT ExtraEltVT = ExtraVT.getVectorElementType();
6891 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
6892 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
6893 return SDValue();
6894
6895 return LowerToPredicatedOp(Op, DAG,
6896 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
6897 }
6898 case ISD::TRUNCATE:
6899 return LowerTRUNCATE(Op, DAG);
6900 case ISD::MLOAD:
6901 return LowerMLOAD(Op, DAG);
6902 case ISD::LOAD:
6903 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
6904 !Subtarget->isNeonAvailable()))
6905 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
6906 return LowerLOAD(Op, DAG);
6907 case ISD::ADD:
6908 case ISD::AND:
6909 case ISD::SUB:
6910 return LowerToScalableOp(Op, DAG);
6911 case ISD::FMAXIMUM:
6912 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
6913 case ISD::FMAXNUM:
6914 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
6915 case ISD::FMINIMUM:
6916 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
6917 case ISD::FMINNUM:
6918 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
6919 case ISD::VSELECT:
6920 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
6921 case ISD::ABS:
6922 return LowerABS(Op, DAG);
6923 case ISD::ABDS:
6924 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
6925 case ISD::ABDU:
6926 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
6927 case ISD::AVGFLOORS:
6928 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
6929 case ISD::AVGFLOORU:
6930 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
6931 case ISD::AVGCEILS:
6932 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
6933 case ISD::AVGCEILU:
6934 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
6935 case ISD::BITREVERSE:
6936 return LowerBitreverse(Op, DAG);
6937 case ISD::BSWAP:
6938 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
6939 case ISD::CTLZ:
6940 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
6941 case ISD::CTTZ:
6942 return LowerCTTZ(Op, DAG);
6943 case ISD::VECTOR_SPLICE:
6944 return LowerVECTOR_SPLICE(Op, DAG);
6945 case ISD::VECTOR_DEINTERLEAVE:
6946 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
6947 case ISD::VECTOR_INTERLEAVE:
6948 return LowerVECTOR_INTERLEAVE(Op, DAG);
6949 case ISD::LRINT:
6950 case ISD::LLRINT:
6951 if (Op.getValueType().isVector())
6952 return LowerVectorXRINT(Op, DAG);
6953 [[fallthrough]];
6954 case ISD::LROUND:
6955 case ISD::LLROUND: {
6956 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
6957 Op.getOperand(0).getValueType() == MVT::bf16) &&
6958 "Expected custom lowering of rounding operations only for f16");
6959 SDLoc DL(Op);
6960 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
6961 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
6962 }
6963 case ISD::STRICT_LROUND:
6964 case ISD::STRICT_LLROUND:
6965 case ISD::STRICT_LRINT:
6966 case ISD::STRICT_LLRINT: {
6967 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
6968 Op.getOperand(1).getValueType() == MVT::bf16) &&
6969 "Expected custom lowering of rounding operations only for f16");
6970 SDLoc DL(Op);
6971 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
6972 {Op.getOperand(0), Op.getOperand(1)});
6973 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
6974 {Ext.getValue(1), Ext.getValue(0)});
6975 }
6976 case ISD::WRITE_REGISTER: {
6977 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
6978 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
6979 SDLoc DL(Op);
6980
6981 SDValue Chain = Op.getOperand(0);
6982 SDValue SysRegName = Op.getOperand(1);
6983 std::pair<SDValue, SDValue> Pair =
6984 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
6985
6986 // chain = MSRR(chain, sysregname, lo, hi)
6987 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
6988 SysRegName, Pair.first, Pair.second);
6989
6990 return Result;
6991 }
6992 case ISD::FSHL:
6993 case ISD::FSHR:
6994 return LowerFunnelShift(Op, DAG);
6995 case ISD::FLDEXP:
6996 return LowerFLDEXP(Op, DAG);
6997 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
6998 return LowerVECTOR_HISTOGRAM(Op, DAG);
6999 }
7000}
7001
7002 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
7003 return !Subtarget->useSVEForFixedLengthVectors();
7004}
7005
7006 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
7007 EVT VT, bool OverrideNEON) const {
7008 if (!VT.isFixedLengthVector() || !VT.isSimple())
7009 return false;
7010
7011 // Don't use SVE for vectors we cannot scalarize if required.
7012 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7013 // Fixed length predicates should be promoted to i8.
7014 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
7015 case MVT::i1:
7016 default:
7017 return false;
7018 case MVT::i8:
7019 case MVT::i16:
7020 case MVT::i32:
7021 case MVT::i64:
7022 case MVT::f16:
7023 case MVT::f32:
7024 case MVT::f64:
7025 break;
7026 }
7027
7028 // NEON-sized vectors can be emulated using SVE instructions.
7029 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7030 return Subtarget->isSVEorStreamingSVEAvailable();
7031
7032 // Ensure NEON MVTs only belong to a single register class.
7033 if (VT.getFixedSizeInBits() <= 128)
7034 return false;
7035
7036 // Ensure wider than NEON code generation is enabled.
7037 if (!Subtarget->useSVEForFixedLengthVectors())
7038 return false;
7039
7040 // Don't use SVE for types that don't fit.
7041 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7042 return false;
7043
7044 // TODO: Perhaps an artificial restriction, but worth having whilst getting
7045 // the base fixed length SVE support in place.
7046 if (!VT.isPow2VectorType())
7047 return false;
7048
7049 return true;
7050}
7051
7052//===----------------------------------------------------------------------===//
7053// Calling Convention Implementation
7054//===----------------------------------------------------------------------===//
7055
7056static unsigned getIntrinsicID(const SDNode *N) {
7057 unsigned Opcode = N->getOpcode();
7058 switch (Opcode) {
7059 default:
7060 return Intrinsic::not_intrinsic;
7061 case ISD::INTRINSIC_WO_CHAIN: {
7062 unsigned IID = N->getConstantOperandVal(0);
7063 if (IID < Intrinsic::num_intrinsics)
7064 return IID;
7065 return Intrinsic::not_intrinsic;
7066 }
7067 }
7068}
7069
7070 bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
7071 SDValue N1) const {
7072 if (!N0.hasOneUse())
7073 return false;
7074
7075 unsigned IID = getIntrinsicID(N1.getNode());
7076 // Avoid reassociating expressions that can be lowered to smlal/umlal.
7077 if (IID == Intrinsic::aarch64_neon_umull ||
7078 N1.getOpcode() == AArch64ISD::UMULL ||
7079 IID == Intrinsic::aarch64_neon_smull ||
7080 N1.getOpcode() == AArch64ISD::SMULL)
7081 return N0.getOpcode() != ISD::ADD;
7082
7083 return true;
7084}
7085
7086/// Selects the correct CCAssignFn for a given CallingConvention value.
7087 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
7088 bool IsVarArg) const {
7089 switch (CC) {
7090 default:
7091 report_fatal_error("Unsupported calling convention.");
7092 case CallingConv::GHC:
7093 return CC_AArch64_GHC;
7096 case CallingConv::C:
7097 case CallingConv::Fast:
7098 case CallingConv::PreserveMost:
7099 case CallingConv::PreserveAll:
7100 case CallingConv::CXX_FAST_TLS:
7101 case CallingConv::Swift:
7102 case CallingConv::SwiftTail:
7103 case CallingConv::Tail:
7104 case CallingConv::GRAAL:
7105 if (Subtarget->isTargetWindows()) {
7106 if (IsVarArg) {
7107 if (Subtarget->isWindowsArm64EC())
7108 return CC_AArch64_Arm64EC_VarArg;
7109 return CC_AArch64_Win64_VarArg;
7110 }
7111 return CC_AArch64_Win64PCS;
7112 }
7113 if (!Subtarget->isTargetDarwin())
7114 return CC_AArch64_AAPCS;
7115 if (!IsVarArg)
7116 return CC_AArch64_DarwinPCS;
7117 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
7118 : CC_AArch64_DarwinPCS_VarArg;
7119 case CallingConv::Win64:
7120 if (IsVarArg) {
7121 if (Subtarget->isWindowsArm64EC())
7122 return CC_AArch64_Arm64EC_VarArg;
7123 return CC_AArch64_Win64_VarArg;
7124 }
7125 return CC_AArch64_Win64PCS;
7127 if (Subtarget->isWindowsArm64EC())
7134 return CC_AArch64_AAPCS;
7139 }
7140}
7141
7142CCAssignFn *
7143 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
7144 switch (CC) {
7145 default:
7146 return RetCC_AArch64_AAPCS;
7147 case CallingConv::ARM64EC_Thunk_X64:
7148 return RetCC_AArch64_Arm64EC_Thunk;
7149 case CallingConv::CFGuard_Check:
7150 if (Subtarget->isWindowsArm64EC())
7151 return RetCC_AArch64_Arm64EC_CFGuard_Check;
7152 return RetCC_AArch64_AAPCS;
7153 }
7154}
7155
7156static bool isPassedInFPR(EVT VT) {
7157 return VT.isFixedLengthVector() ||
7158 (VT.isFloatingPoint() && !VT.isScalableVector());
7159}
7160
7161SDValue AArch64TargetLowering::LowerFormalArguments(
7162 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
7163 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
7164 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
7166 const Function &F = MF.getFunction();
7167 MachineFrameInfo &MFI = MF.getFrameInfo();
7168 bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
7169 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
7170 (isVarArg && Subtarget->isWindowsArm64EC());
7171 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7172
7173 SmallVector<ISD::OutputArg, 4> Outs;
7174 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
7175 DAG.getTargetLoweringInfo(), MF.getDataLayout());
7176 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
7177 FuncInfo->setIsSVECC(true);
7178
7179 // Assign locations to all of the incoming arguments.
7181 DenseMap<unsigned, SDValue> CopiedRegs;
7182 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
7183
7184 // At this point, Ins[].VT may already be promoted to i32. To correctly
7185 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
7186 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
7187 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
7188 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
7189 // LocVT.
7190 unsigned NumArgs = Ins.size();
7191 Function::const_arg_iterator CurOrigArg = F.arg_begin();
7192 unsigned CurArgIdx = 0;
7193 for (unsigned i = 0; i != NumArgs; ++i) {
7194 MVT ValVT = Ins[i].VT;
7195 if (Ins[i].isOrigArg()) {
7196 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
7197 CurArgIdx = Ins[i].getOrigArgIndex();
7198
7199 // Get type of the original argument.
7200 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
7201 /*AllowUnknown*/ true);
7202 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
7203 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7204 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7205 ValVT = MVT::i8;
7206 else if (ActualMVT == MVT::i16)
7207 ValVT = MVT::i16;
7208 }
7209 bool UseVarArgCC = false;
7210 if (IsWin64)
7211 UseVarArgCC = isVarArg;
7212 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
7213 bool Res =
7214 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
7215 assert(!Res && "Call operand has unhandled type");
7216 (void)Res;
7217 }
7218
7219 SMEAttrs Attrs(MF.getFunction());
7220 bool IsLocallyStreaming =
7221 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
7222 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
7223 SDValue Glue = Chain.getValue(1);
7224
7225 SmallVector<SDValue, 16> ArgValues;
7226 unsigned ExtraArgLocs = 0;
7227 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
7228 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7229
7230 if (Ins[i].Flags.isByVal()) {
7231 // Byval is used for HFAs in the PCS, but the system should work in a
7232 // non-compliant manner for larger structs.
7233 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7234 int Size = Ins[i].Flags.getByValSize();
7235 unsigned NumRegs = (Size + 7) / 8;
7236
7237 // FIXME: This works on big-endian for composite byvals, which are the common
7238 // case. It should also work for fundamental types.
7239 unsigned FrameIdx =
7240 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
7241 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
7242 InVals.push_back(FrameIdxN);
7243
7244 continue;
7245 }
7246
7247 if (Ins[i].Flags.isSwiftAsync())
7249
7250 SDValue ArgValue;
7251 if (VA.isRegLoc()) {
7252 // Arguments stored in registers.
7253 EVT RegVT = VA.getLocVT();
7254 const TargetRegisterClass *RC;
7255
7256 if (RegVT == MVT::i32)
7257 RC = &AArch64::GPR32RegClass;
7258 else if (RegVT == MVT::i64)
7259 RC = &AArch64::GPR64RegClass;
7260 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
7261 RC = &AArch64::FPR16RegClass;
7262 else if (RegVT == MVT::f32)
7263 RC = &AArch64::FPR32RegClass;
7264 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
7265 RC = &AArch64::FPR64RegClass;
7266 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
7267 RC = &AArch64::FPR128RegClass;
7268 else if (RegVT.isScalableVector() &&
7269 RegVT.getVectorElementType() == MVT::i1) {
7270 FuncInfo->setIsSVECC(true);
7271 RC = &AArch64::PPRRegClass;
7272 } else if (RegVT == MVT::aarch64svcount) {
7273 FuncInfo->setIsSVECC(true);
7274 RC = &AArch64::PPRRegClass;
7275 } else if (RegVT.isScalableVector()) {
7276 FuncInfo->setIsSVECC(true);
7277 RC = &AArch64::ZPRRegClass;
7278 } else
7279 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
7280
7281 // Transform the arguments in physical registers into virtual ones.
7282 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
7283
7284 if (IsLocallyStreaming) {
7285 // LocallyStreamingFunctions must insert the SMSTART in the correct
7286 // position, so we use Glue to ensure no instructions can be scheduled
7287 // between the chain of:
7288 // t0: ch,glue = EntryNode
7289 // t1: res,ch,glue = CopyFromReg
7290 // ...
7291 // tn: res,ch,glue = CopyFromReg t(n-1), ..
7292 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
7293 // ^^^^^^
7294 // This will be the new Chain/Root node.
7295 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
7296 Glue = ArgValue.getValue(2);
7297 if (isPassedInFPR(ArgValue.getValueType())) {
7298 ArgValue =
7300 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
7301 {ArgValue, Glue});
7302 Glue = ArgValue.getValue(1);
7303 }
7304 } else
7305 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
7306
7307 // If this is an 8, 16 or 32-bit value, it is really passed promoted
7308 // to 64 bits. Insert an assert[sz]ext to capture this, then
7309 // truncate to the right size.
7310 switch (VA.getLocInfo()) {
7311 default:
7312 llvm_unreachable("Unknown loc info!");
7313 case CCValAssign::Full:
7314 break;
7315 case CCValAssign::Indirect:
7316 assert(
7317 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7318 "Indirect arguments should be scalable on most subtargets");
7319 break;
7320 case CCValAssign::BCvt:
7321 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
7322 break;
7323 case CCValAssign::AExt:
7324 case CCValAssign::SExt:
7325 case CCValAssign::ZExt:
7326 break;
7327 case CCValAssign::AExtUpper:
7328 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
7329 DAG.getConstant(32, DL, RegVT));
7330 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
7331 break;
7332 }
7333 } else { // VA.isRegLoc()
7334 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7335 unsigned ArgOffset = VA.getLocMemOffset();
7336 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
7337 ? VA.getLocVT().getSizeInBits()
7338 : VA.getValVT().getSizeInBits()) / 8;
7339
7340 uint32_t BEAlign = 0;
7341 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
7342 !Ins[i].Flags.isInConsecutiveRegs())
7343 BEAlign = 8 - ArgSize;
7344
7345 SDValue FIN;
7346 MachinePointerInfo PtrInfo;
7347 if (StackViaX4) {
7348 // In both the ARM64EC varargs convention and the thunk convention,
7349 // arguments on the stack are accessed relative to x4, not sp. In
7350 // the thunk convention, there's an additional offset of 32 bytes
7351 // to account for the shadow store.
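// E.g. the first stack argument of an ARM64EC_Thunk_X64 callee is read from
// [x4 + 32 + its normal stack offset].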
7352 unsigned ObjOffset = ArgOffset + BEAlign;
7353 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
7354 ObjOffset += 32;
7355 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7356 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7357 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
7358 DAG.getConstant(ObjOffset, DL, MVT::i64));
7359 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
7360 } else {
7361 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
7362
7363 // Create load nodes to retrieve arguments from the stack.
7364 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
7365 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
7366 }
7367
7368 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
7369 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
7370 MVT MemVT = VA.getValVT();
7371
7372 switch (VA.getLocInfo()) {
7373 default:
7374 break;
7375 case CCValAssign::Trunc:
7376 case CCValAssign::BCvt:
7377 MemVT = VA.getLocVT();
7378 break;
7381 Subtarget->isWindowsArm64EC()) &&
7382 "Indirect arguments should be scalable on most subtargets");
7383 MemVT = VA.getLocVT();
7384 break;
7385 case CCValAssign::SExt:
7386 ExtType = ISD::SEXTLOAD;
7387 break;
7388 case CCValAssign::ZExt:
7389 ExtType = ISD::ZEXTLOAD;
7390 break;
7391 case CCValAssign::AExt:
7392 ExtType = ISD::EXTLOAD;
7393 break;
7394 }
7395
7396 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
7397 MemVT);
7398 }
7399
7400 if (VA.getLocInfo() == CCValAssign::Indirect) {
7401 assert((VA.getValVT().isScalableVT() ||
7402 Subtarget->isWindowsArm64EC()) &&
7403 "Indirect arguments should be scalable on most subtargets");
7404
7405 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
7406 unsigned NumParts = 1;
7407 if (Ins[i].Flags.isInConsecutiveRegs()) {
7408 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7409 ++NumParts;
7410 }
7411
7412 MVT PartLoad = VA.getValVT();
7413 SDValue Ptr = ArgValue;
7414
7415 // Ensure we generate all loads for each tuple part, whilst updating the
7416 // pointer after each load correctly using vscale.
7417 while (NumParts > 0) {
7418 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
7419 InVals.push_back(ArgValue);
7420 NumParts--;
7421 if (NumParts > 0) {
7422 SDValue BytesIncrement;
7423 if (PartLoad.isScalableVector()) {
7424 BytesIncrement = DAG.getVScale(
7425 DL, Ptr.getValueType(),
7426 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
7427 } else {
7428 BytesIncrement = DAG.getConstant(
7429 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
7430 Ptr.getValueType());
7431 }
7432 SDNodeFlags Flags;
7433 Flags.setNoUnsignedWrap(true);
7434 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7435 BytesIncrement, Flags);
7436 ExtraArgLocs++;
7437 i++;
7438 }
7439 }
7440 } else {
7441 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
7442 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
7443 ArgValue, DAG.getValueType(MVT::i32));
7444
7445 // i1 arguments are zero-extended to i8 by the caller. Emit a
7446 // hint to reflect this.
7447 if (Ins[i].isOrigArg()) {
7448 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
7449 if (OrigArg->getType()->isIntegerTy(1)) {
7450 if (!Ins[i].Flags.isZExt()) {
7451 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
7452 ArgValue.getValueType(), ArgValue);
7453 }
7454 }
7455 }
7456
7457 InVals.push_back(ArgValue);
7458 }
7459 }
7460 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
7461
7462 // Insert the SMSTART if this is a locally streaming function and
7463 // make sure it is Glued to the last CopyFromReg value.
7464 if (IsLocallyStreaming) {
7465 SDValue PStateSM;
7466 if (Attrs.hasStreamingCompatibleInterface()) {
7467 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
7468 Register Reg = MF.getRegInfo().createVirtualRegister(
7469 &AArch64::GPR64RegClass);
7470 FuncInfo->setPStateSMReg(Reg);
7471 Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
7472 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7473 AArch64SME::IfCallerIsNonStreaming, PStateSM);
7474 } else
7475 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7476 AArch64SME::Always);
7477
7478 // Ensure that the SMSTART happens after the CopyWithChain such that its
7479 // chain result is used.
7480 for (unsigned I=0; I<InVals.size(); ++I) {
7481 Register Reg = MF.getRegInfo().createVirtualRegister(
7482 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
7483 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
7484 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
7485 InVals[I].getValueType());
7486 }
7487 }
7488
7489 // varargs
7490 if (isVarArg) {
7491 if (!Subtarget->isTargetDarwin() || IsWin64) {
7492 // The AAPCS variadic function ABI is identical to the non-variadic
7493 // one. As a result there may be more arguments in registers and we should
7494 // save them for future reference.
7495 // Win64 variadic functions also pass arguments in registers, but all float
7496 // arguments are passed in integer registers.
7497 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
7498 }
7499
7500 // This will point to the next argument passed via stack.
7501 unsigned VarArgsOffset = CCInfo.getStackSize();
7502 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
7503 VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
7504 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
7505 FuncInfo->setVarArgsStackIndex(
7506 MFI.CreateFixedObject(4, VarArgsOffset, true));
7507
7508 if (MFI.hasMustTailInVarArgFunc()) {
7509 SmallVector<MVT, 2> RegParmTypes;
7510 RegParmTypes.push_back(MVT::i64);
7511 RegParmTypes.push_back(MVT::f128);
7512 // Compute the set of forwarded registers. The rest are scratch.
7513 SmallVectorImpl<ForwardedRegister> &Forwards =
7514 FuncInfo->getForwardedMustTailRegParms();
7515 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
7516 CC_AArch64_AAPCS);
7517
7518 // Conservatively forward X8, since it might be used for aggregate return.
7519 if (!CCInfo.isAllocated(AArch64::X8)) {
7520 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
7521 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
7522 }
7523 }
7524 }
7525
7526 // On Windows, InReg pointers must be returned, so record the pointer in a
7527 // virtual register at the start of the function so it can be returned in the
7528 // epilogue.
7529 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
7530 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
7531 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
7532 Ins[I].Flags.isInReg()) &&
7533 Ins[I].Flags.isSRet()) {
7534 assert(!FuncInfo->getSRetReturnReg());
7535
7536 MVT PtrTy = getPointerTy(DAG.getDataLayout());
7537 Register Reg =
7538 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
7539 FuncInfo->setSRetReturnReg(Reg);
7540
7541 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
7542 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
7543 break;
7544 }
7545 }
7546 }
7547
7548 unsigned StackArgSize = CCInfo.getStackSize();
7549 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7550 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
7551 // This is a non-standard ABI so by fiat I say we're allowed to make full
7552 // use of the stack area to be popped, which must be aligned to 16 bytes in
7553 // any case:
7554 StackArgSize = alignTo(StackArgSize, 16);
7555
7556 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
7557 // a multiple of 16.
7558 FuncInfo->setArgumentStackToRestore(StackArgSize);
7559
7560 // This realignment carries over to the available bytes below. Our own
7561 // callers will guarantee the space is free by giving an aligned value to
7562 // CALLSEQ_START.
7563 }
7564 // Even if we're not expected to free up the space, it's useful to know how
7565 // much is there while considering tail calls (because we can reuse it).
7566 FuncInfo->setBytesInStackArgArea(StackArgSize);
7567
7568 if (Subtarget->hasCustomCallingConv())
7569 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
7570
7571 // Create a 16 Byte TPIDR2 object. The dynamic buffer
7572 // will be expanded and stored in the static object later using a pseudonode.
7573 if (SMEAttrs(MF.getFunction()).hasZAState()) {
7574 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
7575 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
7576 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
7577 DAG.getConstant(1, DL, MVT::i32));
7578
7579 SDValue Buffer;
7580 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
7581 Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
7582 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
7583 } else {
7584 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
7585 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
7586 DAG.getVTList(MVT::i64, MVT::Other),
7587 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
7588 MFI.CreateVariableSizedObject(Align(16), nullptr);
7589 }
7590 Chain = DAG.getNode(
7591 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
7592 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)});
7593 }
7594
7595 if (CallConv == CallingConv::PreserveNone) {
7596 for (const ISD::InputArg &I : Ins) {
7597 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
7598 I.Flags.isSwiftAsync()) {
7599 MachineFunction &MF = DAG.getMachineFunction();
7600 MF.getFunction().getContext().diagnose(DiagnosticInfoUnsupported(
7601 MF.getFunction(),
7602 "Swift attributes can't be used with preserve_none",
7603 DL.getDebugLoc()));
7604 break;
7605 }
7606 }
7607 }
7608
7609 return Chain;
7610}
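The 16-byte TPIDR2 object created above follows the SME lazy-save ABI layout: the buffer pointer occupies the first eight bytes and the number of ZA save slices the 16-bit field at offset 8 (LowerCall later stores RDSVL #1 there with an i16 truncating store). A minimal illustrative sketch of that layout and of the fallback buffer-size computation; the struct and helper names below are ours, not LLVM's:

#include <cstdint>

// Hypothetical mirror of the 16-byte TPIDR2 block (illustrative only).
struct TPIDR2Block {
  void *ZABufferPtr;        // bytes 0-7: start of the lazy-save buffer
  uint16_t NumZASaveSlices; // bytes 8-9: written via the i16 truncating store
  uint8_t Reserved[6];      // bytes 10-15: currently unused
};
static_assert(sizeof(TPIDR2Block) == 16, "sketch assumes an LP64 host");

// The DYNAMIC_STACKALLOC fallback above sizes the buffer as SVL * SVL bytes,
// i.e. SVL horizontal ZA slices of SVL bytes each.
uint64_t zaBufferBytes(uint64_t SVLBytes) { return SVLBytes * SVLBytes; }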
7611
7612void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
7613 SelectionDAG &DAG,
7614 const SDLoc &DL,
7615 SDValue &Chain) const {
7616 MachineFunction &MF = DAG.getMachineFunction();
7617 MachineFrameInfo &MFI = MF.getFrameInfo();
7618 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7619 auto PtrVT = getPointerTy(DAG.getDataLayout());
7620 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
7621
7622 SmallVector<SDValue, 8> MemOps;
7623
7624 auto GPRArgRegs = AArch64::getGPRArgRegs();
7625 unsigned NumGPRArgRegs = GPRArgRegs.size();
7626 if (Subtarget->isWindowsArm64EC()) {
7627 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
7628 // functions.
7629 NumGPRArgRegs = 4;
7630 }
7631 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
7632
7633 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
7634 int GPRIdx = 0;
7635 if (GPRSaveSize != 0) {
7636 if (IsWin64) {
7637 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
7638 if (GPRSaveSize & 15)
7639 // The extra size here, if triggered, will always be 8.
7640 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
7641 } else
7642 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
7643
7644 SDValue FIN;
7645 if (Subtarget->isWindowsArm64EC()) {
7646 // With the Arm64EC ABI, we reserve the save area as usual, but we
7647 // compute its address relative to x4. For a normal AArch64->AArch64
7648 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
7649 // different address.
7650 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7651 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7652 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
7653 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
7654 } else {
7655 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
7656 }
7657
7658 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
7659 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
7660 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7661 SDValue Store =
7662 DAG.getStore(Val.getValue(1), DL, Val, FIN,
7663 IsWin64 ? MachinePointerInfo::getFixedStack(
7664 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
7665 : MachinePointerInfo::getStack(MF, i * 8));
7666 MemOps.push_back(Store);
7667 FIN =
7668 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
7669 }
7670 }
7671 FuncInfo->setVarArgsGPRIndex(GPRIdx);
7672 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
7673
7674 if (Subtarget->hasFPARMv8() && !IsWin64) {
7675 auto FPRArgRegs = AArch64::getFPRArgRegs();
7676 const unsigned NumFPRArgRegs = FPRArgRegs.size();
7677 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
7678
7679 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
7680 int FPRIdx = 0;
7681 if (FPRSaveSize != 0) {
7682 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
7683
7684 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
7685
7686 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
7687 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
7688 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
7689
7690 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
7691 MachinePointerInfo::getStack(MF, i * 16));
7692 MemOps.push_back(Store);
7693 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
7694 DAG.getConstant(16, DL, PtrVT));
7695 }
7696 }
7697 FuncInfo->setVarArgsFPRIndex(FPRIdx);
7698 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
7699 }
7700
7701 if (!MemOps.empty()) {
7702 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
7703 }
7704}
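The save-area sizes above fall straight out of how many argument registers remain unallocated at the varargs call site: 8 bytes per remaining X register and 16 bytes per remaining Q register, with Win64 padding the GPR area up to 16 bytes. A small standalone sketch of that arithmetic (illustrative only; the helper names are not LLVM's):

#include <cassert>

// Mirrors the GPR/FPR varargs save-area sizing used above.
unsigned gprSaveSize(unsigned NumGPRArgRegs, unsigned FirstVariadicGPR) {
  return 8 * (NumGPRArgRegs - FirstVariadicGPR); // one X register = 8 bytes
}
unsigned fprSaveSize(unsigned NumFPRArgRegs, unsigned FirstVariadicFPR) {
  return 16 * (NumFPRArgRegs - FirstVariadicFPR); // one Q register = 16 bytes
}

void saveAreaExample() {
  // AAPCS64 passes integer arguments in x0-x7. With three fixed GPR args,
  // x3-x7 are saved for va_start: 5 * 8 = 40 bytes, and on Win64 an extra
  // 8-byte pad keeps the fixed object 16-byte aligned (40 & 15 != 0).
  assert(gprSaveSize(8, 3) == 40);
  assert(fprSaveSize(8, 1) == 112); // q1-q7 saved: 7 * 16 bytes
}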
7705
7706/// LowerCallResult - Lower the result values of a call into the
7707/// appropriate copies out of appropriate physical registers.
7708SDValue AArch64TargetLowering::LowerCallResult(
7709 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
7710 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
7711 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
7712 SDValue ThisVal, bool RequiresSMChange) const {
7713 DenseMap<unsigned, SDValue> CopiedRegs;
7714 // Copy all of the result registers out of their specified physreg.
7715 for (unsigned i = 0; i != RVLocs.size(); ++i) {
7716 CCValAssign VA = RVLocs[i];
7717
7718 // Pass 'this' value directly from the argument to return value, to avoid
7719 // reg unit interference
7720 if (i == 0 && isThisReturn) {
7721 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
7722 "unexpected return calling convention register assignment");
7723 InVals.push_back(ThisVal);
7724 continue;
7725 }
7726
7727 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
7728 // allows one use of a physreg per block.
7729 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
7730 if (!Val) {
7731 Val =
7732 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
7733 Chain = Val.getValue(1);
7734 InGlue = Val.getValue(2);
7735 CopiedRegs[VA.getLocReg()] = Val;
7736 }
7737
7738 switch (VA.getLocInfo()) {
7739 default:
7740 llvm_unreachable("Unknown loc info!");
7741 case CCValAssign::Full:
7742 break;
7743 case CCValAssign::BCvt:
7744 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
7745 break;
7746 case CCValAssign::AExtUpper:
7747 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
7748 DAG.getConstant(32, DL, VA.getLocVT()));
7749 [[fallthrough]];
7750 case CCValAssign::AExt:
7751 [[fallthrough]];
7752 case CCValAssign::ZExt:
7753 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
7754 break;
7755 }
7756
7757 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
7758 Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, Val.getValueType(),
7759 Val);
7760
7761 InVals.push_back(Val);
7762 }
7763
7764 return Chain;
7765}
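The AExtUpper case above recovers a 32-bit value that travelled in the upper half of a 64-bit location register: shift right by 32, then zero-extend or truncate to the value type. A plain-integer sketch of that unpacking, together with the matching packing the outgoing-argument and return paths perform (illustrative only):

#include <cassert>
#include <cstdint>

// How a 32-bit value rides in the upper half of an X register
// (CCValAssign::AExtUpper) and is recovered on the receiving side.
uint64_t packUpper(uint32_t Val) { return static_cast<uint64_t>(Val) << 32; }
uint32_t unpackUpper(uint64_t Loc) { return static_cast<uint32_t>(Loc >> 32); }

void aextUpperExample() {
  uint64_t Loc = packUpper(0xDEADBEEFu);   // SHL by 32 on the sending side
  assert(unpackUpper(Loc) == 0xDEADBEEFu); // SRL by 32 + truncate on receipt
}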
7766
7767/// Return true if the calling convention is one that we can guarantee TCO for.
7768static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
7769 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
7771}
7772
7773/// Return true if we might ever do TCO for calls with this calling convention.
7774 static bool mayTailCallThisCC(CallingConv::ID CC) {
7775 switch (CC) {
7776 case CallingConv::C:
7777 case CallingConv::AArch64_SVE_VectorCall:
7778 case CallingConv::PreserveMost:
7779 case CallingConv::PreserveAll:
7780 case CallingConv::PreserveNone:
7781 case CallingConv::Swift:
7782 case CallingConv::SwiftTail:
7783 case CallingConv::Tail:
7784 case CallingConv::Fast:
7785 return true;
7786 default:
7787 return false;
7788 }
7789}
7790
7791 static void analyzeCallOperands(const AArch64TargetLowering &TLI,
7792 const AArch64Subtarget *Subtarget,
7793 const TargetLowering::CallLoweringInfo &CLI,
7794 CCState &CCInfo) {
7795 const SelectionDAG &DAG = CLI.DAG;
7796 CallingConv::ID CalleeCC = CLI.CallConv;
7797 bool IsVarArg = CLI.IsVarArg;
7798 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7799 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
7800
7801 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
7802 // for the shadow store.
7803 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
7804 CCInfo.AllocateStack(32, Align(16));
7805
7806 unsigned NumArgs = Outs.size();
7807 for (unsigned i = 0; i != NumArgs; ++i) {
7808 MVT ArgVT = Outs[i].VT;
7809 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
7810
7811 bool UseVarArgCC = false;
7812 if (IsVarArg) {
7813 // On Windows, the fixed arguments in a vararg call are passed in GPRs
7814 // too, so use the vararg CC to force them to integer registers.
7815 if (IsCalleeWin64) {
7816 UseVarArgCC = true;
7817 } else {
7818 UseVarArgCC = !Outs[i].IsFixed;
7819 }
7820 }
7821
7822 if (!UseVarArgCC) {
7823 // Get type of the original argument.
7824 EVT ActualVT =
7825 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
7826 /*AllowUnknown*/ true);
7827 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
7828 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7829 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7830 ArgVT = MVT::i8;
7831 else if (ActualMVT == MVT::i16)
7832 ArgVT = MVT::i16;
7833 }
7834
7835 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
7836 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
7837 assert(!Res && "Call operand has unhandled type");
7838 (void)Res;
7839 }
7840}
7841
7842bool AArch64TargetLowering::isEligibleForTailCallOptimization(
7843 const CallLoweringInfo &CLI) const {
7844 CallingConv::ID CalleeCC = CLI.CallConv;
7845 if (!mayTailCallThisCC(CalleeCC))
7846 return false;
7847
7848 SDValue Callee = CLI.Callee;
7849 bool IsVarArg = CLI.IsVarArg;
7850 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7851 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7852 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7853 const SelectionDAG &DAG = CLI.DAG;
7854 MachineFunction &MF = DAG.getMachineFunction();
7855 const Function &CallerF = MF.getFunction();
7856 CallingConv::ID CallerCC = CallerF.getCallingConv();
7857
7858 // SME Streaming functions are not eligible for TCO as they may require
7859 // the streaming mode or ZA to be restored after returning from the call.
7860 SMEAttrs CallerAttrs(MF.getFunction());
7861 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
7862 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
7863 CallerAttrs.requiresLazySave(CalleeAttrs) ||
7864 CallerAttrs.hasStreamingBody())
7865 return false;
7866
7867 // Functions using the C or Fast calling convention that have an SVE signature
7868 // preserve more registers and should assume the SVE_VectorCall CC.
7869 // The check for matching callee-saved regs will determine whether it is
7870 // eligible for TCO.
7871 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
7872 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
7873 CallerCC = CallingConv::AArch64_SVE_VectorCall;
7874
7875 bool CCMatch = CallerCC == CalleeCC;
7876
7877 // When using the Windows calling convention on a non-windows OS, we want
7878 // to back up and restore X18 in such functions; we can't do a tail call
7879 // from those functions.
7880 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
7881 CalleeCC != CallingConv::Win64)
7882 return false;
7883
7884 // Byval parameters hand the function a pointer directly into the stack area
7885 // we want to reuse during a tail call. Working around this *is* possible (see
7886 // X86) but less efficient and uglier in LowerCall.
7887 for (Function::const_arg_iterator i = CallerF.arg_begin(),
7888 e = CallerF.arg_end();
7889 i != e; ++i) {
7890 if (i->hasByValAttr())
7891 return false;
7892
7893 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
7894 // In this case, it is necessary to save/restore X0 in the callee. Tail
7895 // call opt interferes with this. So we disable tail call opt when the
7896 // caller has an argument with "inreg" attribute.
7897
7898 // FIXME: Check whether the callee also has an "inreg" argument.
7899 if (i->hasInRegAttr())
7900 return false;
7901 }
7902
7903 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
7904 return CCMatch;
7905
7906 // Externally-defined functions with weak linkage should not be
7907 // tail-called on AArch64 when the OS does not support dynamic
7908 // pre-emption of symbols, as the AAELF spec requires normal calls
7909 // to undefined weak functions to be replaced with a NOP or jump to the
7910 // next instruction. The behaviour of branch instructions in this
7911 // situation (as used for tail calls) is implementation-defined, so we
7912 // cannot rely on the linker replacing the tail call with a return.
7913 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
7914 const GlobalValue *GV = G->getGlobal();
7915 const Triple &TT = getTargetMachine().getTargetTriple();
7916 if (GV->hasExternalWeakLinkage() &&
7917 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
7918 return false;
7919 }
7920
7921 // Now we search for cases where we can use a tail call without changing the
7922 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
7923 // concept.
7924
7925 // I want anyone implementing a new calling convention to think long and hard
7926 // about this assert.
7927 assert((!IsVarArg || CalleeCC == CallingConv::C) &&
7928 "Unexpected variadic calling convention");
7929
7930 LLVMContext &C = *DAG.getContext();
7931 // Check that the call results are passed in the same way.
7932 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
7933 CCAssignFnForCall(CalleeCC, IsVarArg),
7934 CCAssignFnForCall(CallerCC, IsVarArg)))
7935 return false;
7936 // The callee has to preserve all registers the caller needs to preserve.
7937 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7938 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
7939 if (!CCMatch) {
7940 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
7941 if (Subtarget->hasCustomCallingConv()) {
7942 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
7943 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
7944 }
7945 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
7946 return false;
7947 }
7948
7949 // Nothing more to check if the callee is taking no arguments
7950 if (Outs.empty())
7951 return true;
7952
7953 SmallVector<CCValAssign, 16> ArgLocs;
7954 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
7955
7956 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7957
7958 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
7959 // When we are musttail, additional checks have been done and we can safely ignore this check.
7960 // At least two cases here: if caller is fastcc then we can't have any
7961 // memory arguments (we'd be expected to clean up the stack afterwards). If
7962 // caller is C then we could potentially use its argument area.
7963
7964 // FIXME: for now we take the most conservative of these in both cases:
7965 // disallow all variadic memory operands.
7966 for (const CCValAssign &ArgLoc : ArgLocs)
7967 if (!ArgLoc.isRegLoc())
7968 return false;
7969 }
7970
7971 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7972
7973 // If any of the arguments is passed indirectly, it must be SVE, so the
7974 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
7975 // allocate space on the stack. That is why we determine explicitly here that
7976 // such a call cannot be a tailcall.
7977 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
7978 assert((A.getLocInfo() != CCValAssign::Indirect ||
7979 A.getValVT().isScalableVector() ||
7980 Subtarget->isWindowsArm64EC()) &&
7981 "Expected value to be scalable");
7982 return A.getLocInfo() == CCValAssign::Indirect;
7983 }))
7984 return false;
7985
7986 // If the stack arguments for this call do not fit into our own save area then
7987 // the call cannot be made tail.
7988 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
7989 return false;
7990
7991 const MachineRegisterInfo &MRI = MF.getRegInfo();
7992 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
7993 return false;
7994
7995 return true;
7996}
7997
7998SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
7999 SelectionDAG &DAG,
8000 MachineFrameInfo &MFI,
8001 int ClobberedFI) const {
8002 SmallVector<SDValue, 8> ArgChains;
8003 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
8004 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
8005
8006 // Include the original chain at the beginning of the list. When this is
8007 // used by target LowerCall hooks, this helps legalize find the
8008 // CALLSEQ_BEGIN node.
8009 ArgChains.push_back(Chain);
8010
8011 // Add a chain value for each stack argument corresponding
8012 for (SDNode *U : DAG.getEntryNode().getNode()->uses())
8013 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
8014 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
8015 if (FI->getIndex() < 0) {
8016 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
8017 int64_t InLastByte = InFirstByte;
8018 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
8019
8020 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
8021 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
8022 ArgChains.push_back(SDValue(L, 1));
8023 }
8024
8025 // Build a tokenfactor for all the chains.
8026 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
8027}
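addTokenForArgument only chains in loads whose incoming fixed stack object overlaps the byte range a tail-call store is about to clobber; the test above is a closed-interval overlap check. A standalone sketch of that check (not LLVM code):

#include <cassert>
#include <cstdint>

// Same closed-interval overlap test as above, on plain offsets.
bool framesOverlap(int64_t FirstByte, int64_t LastByte, int64_t InFirstByte,
                   int64_t InLastByte) {
  return (InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
         (FirstByte <= InFirstByte && InFirstByte <= LastByte);
}

void overlapExample() {
  // A 16-byte incoming object at [-16, -1] overlaps a store to [-8, -5] ...
  assert(framesOverlap(-8, -5, -16, -1));
  // ... but not a store to [0, 7], so that load need not be chained in.
  assert(!framesOverlap(0, 7, -16, -1));
}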
8028
8029bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
8030 bool TailCallOpt) const {
8031 return (CallCC == CallingConv::Fast && TailCallOpt) ||
8032 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
8033}
8034
8035// Check if the value is zero-extended from i1 to i8
8036static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
8037 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
8038 if (SizeInBits < 8)
8039 return false;
8040
8041 APInt RequredZero(SizeInBits, 0xFE);
8042 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
8043 bool ZExtBool = (Bits.Zero & RequredZero) == RequredZero;
8044 return ZExtBool;
8045}
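checkZExtBool asks whether bits 1-7 of the argument are already known to be zero (the 0xFE mask), i.e. whether the i1 has in effect already been zero-extended to i8, so LowerCall can skip the extra truncate/zero-extend pair. A tiny sketch of that mask test, with a hand-rolled known-zero set standing in for KnownBits (illustrative only):

#include <cassert>
#include <cstdint>

// ZeroMask has a bit set for every bit position proven to be zero.
bool alreadyZExtFromI1(uint64_t ZeroMask) {
  const uint64_t RequiredZero = 0xFE; // bits 1..7 must be known zero
  return (ZeroMask & RequiredZero) == RequiredZero;
}

void zextBoolExample() {
  // A value produced by (zext i1 x) has every bit above bit 0 known zero.
  assert(alreadyZExtFromI1(~uint64_t(1)));
  // A value with unknown high bits does not pass, so the caller re-extends it.
  assert(!alreadyZExtFromI1(0));
}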
8046
8047void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
8048 SDNode *Node) const {
8049 // Live-in physreg copies that are glued to SMSTART are applied as
8050 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
8051 // register allocator to pass call args in callee saved regs, without extra
8052 // copies to avoid these fake clobbers of actually-preserved GPRs.
8053 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
8054 MI.getOpcode() == AArch64::MSRpstatePseudo) {
8055 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
8056 if (MachineOperand &MO = MI.getOperand(I);
8057 MO.isReg() && MO.isImplicit() && MO.isDef() &&
8058 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
8059 AArch64::GPR64RegClass.contains(MO.getReg())))
8060 MI.removeOperand(I);
8061
8062 // The SVE vector length can change when entering/leaving streaming mode.
8063 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
8064 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
8065 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
8066 /*IsImplicit=*/true));
8067 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
8068 /*IsImplicit=*/true));
8069 }
8070 }
8071
8072 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
8073 // have nothing to do with VG, were it not that they are used to materialise a
8074 // frame-address. If they contain a frame-index to a scalable vector, this
8075 // will likely require an ADDVL instruction to materialise the address, thus
8076 // reading VG.
8077 const MachineFunction &MF = *MI.getMF();
8078 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
8079 (MI.getOpcode() == AArch64::ADDXri ||
8080 MI.getOpcode() == AArch64::SUBXri)) {
8081 const MachineOperand &MO = MI.getOperand(1);
8082 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
8083 TargetStackID::ScalableVector)
8084 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
8085 /*IsImplicit=*/true));
8086 }
8087}
8088
8089 SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
8090 bool Enable, SDValue Chain,
8091 SDValue InGlue,
8092 unsigned Condition,
8093 SDValue PStateSM) const {
8094 MachineFunction &MF = DAG.getMachineFunction();
8095 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8096 FuncInfo->setHasStreamingModeChanges(true);
8097
8098 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8099 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
8100 SDValue MSROp =
8101 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
8102 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
8103 SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
8104 if (Condition != AArch64SME::Always) {
8105 assert(PStateSM && "PStateSM should be defined");
8106 Ops.push_back(PStateSM);
8107 }
8108 Ops.push_back(RegMask);
8109
8110 if (InGlue)
8111 Ops.push_back(InGlue);
8112
8113 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
8114 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
8115}
8116
8117static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
8118 const SMEAttrs &CalleeAttrs) {
8119 if (!CallerAttrs.hasStreamingCompatibleInterface() ||
8120 CallerAttrs.hasStreamingBody())
8121 return AArch64SME::Always;
8122 if (CalleeAttrs.hasNonStreamingInterface())
8123 return AArch64SME::IfCallerIsStreaming;
8124 if (CalleeAttrs.hasStreamingInterface())
8125 return AArch64SME::IfCallerIsNonStreaming;
8126
8127 llvm_unreachable("Unsupported attributes");
8128}
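getSMCondition only needs a runtime condition when the caller is streaming-compatible; a caller with a fixed streaming or non-streaming interface (or a streaming body) always knows statically whether a transition is required. A compact, illustrative restatement of that decision using plain booleans and our own enum in place of SMEAttrs and AArch64SME:

// Illustrative reimplementation of the decision above (names are ours).
enum class SMCond { Always, IfCallerIsStreaming, IfCallerIsNonStreaming };

SMCond smCondition(bool CallerStreamingCompatible, bool CallerStreamingBody,
                   bool CalleeNonStreaming, bool CalleeStreaming) {
  if (!CallerStreamingCompatible || CallerStreamingBody)
    return SMCond::Always; // transition (or its absence) is known statically
  if (CalleeNonStreaming)
    return SMCond::IfCallerIsStreaming;    // SMSTOP only if we entered streaming
  if (CalleeStreaming)
    return SMCond::IfCallerIsNonStreaming; // SMSTART only if we entered non-streaming
  return SMCond::Always; // unreachable for supported attribute combinations
}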
8129
8130/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
8131/// and add input and output parameter nodes.
8132SDValue
8133AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
8134 SmallVectorImpl<SDValue> &InVals) const {
8135 SelectionDAG &DAG = CLI.DAG;
8136 SDLoc &DL = CLI.DL;
8137 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8138 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8139 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8140 SDValue Chain = CLI.Chain;
8141 SDValue Callee = CLI.Callee;
8142 bool &IsTailCall = CLI.IsTailCall;
8143 CallingConv::ID &CallConv = CLI.CallConv;
8144 bool IsVarArg = CLI.IsVarArg;
8145
8146 MachineFunction &MF = DAG.getMachineFunction();
8147 MachineFunction::CallSiteInfo CSInfo;
8148 bool IsThisReturn = false;
8149
8150 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8151 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8152 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
8153 bool IsSibCall = false;
8154 bool GuardWithBTI = false;
8155
8156 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
8157 !Subtarget->noBTIAtReturnTwice()) {
8158 GuardWithBTI = FuncInfo->branchTargetEnforcement();
8159 }
8160
8161 // Analyze operands of the call, assigning locations to each operand.
8162 SmallVector<CCValAssign, 16> ArgLocs;
8163 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
8164
8165 if (IsVarArg) {
8166 unsigned NumArgs = Outs.size();
8167
8168 for (unsigned i = 0; i != NumArgs; ++i) {
8169 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
8170 report_fatal_error("Passing SVE types to variadic functions is "
8171 "currently not supported");
8172 }
8173 }
8174
8175 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
8176
8177 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8178 // Assign locations to each value returned by this call.
8179 SmallVector<CCValAssign, 16> RVLocs;
8180 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
8181 *DAG.getContext());
8182 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
8183
8184 // Check callee args/returns for SVE registers and set calling convention
8185 // accordingly.
8186 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
8187 auto HasSVERegLoc = [](CCValAssign &Loc) {
8188 if (!Loc.isRegLoc())
8189 return false;
8190 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
8191 AArch64::PPRRegClass.contains(Loc.getLocReg());
8192 };
8193 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
8194 CallConv = CallingConv::AArch64_SVE_VectorCall;
8195 }
8196
8197 if (IsTailCall) {
8198 // Check if it's really possible to do a tail call.
8199 IsTailCall = isEligibleForTailCallOptimization(CLI);
8200
8201 // A sibling call is one where we're under the usual C ABI and not planning
8202 // to change that but can still do a tail call:
8203 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
8204 CallConv != CallingConv::SwiftTail)
8205 IsSibCall = true;
8206
8207 if (IsTailCall)
8208 ++NumTailCalls;
8209 }
8210
8211 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
8212 report_fatal_error("failed to perform tail call elimination on a call "
8213 "site marked musttail");
8214
8215 // Get a count of how many bytes are to be pushed on the stack.
8216 unsigned NumBytes = CCInfo.getStackSize();
8217
8218 if (IsSibCall) {
8219 // Since we're not changing the ABI to make this a tail call, the memory
8220 // operands are already available in the caller's incoming argument space.
8221 NumBytes = 0;
8222 }
8223
8224 // FPDiff is the byte offset of the call's argument area from the callee's.
8225 // Stores to callee stack arguments will be placed in FixedStackSlots offset
8226 // by this amount for a tail call. In a sibling call it must be 0 because the
8227 // caller will deallocate the entire stack and the callee still expects its
8228 // arguments to begin at SP+0. Completely unused for non-tail calls.
8229 int FPDiff = 0;
8230
8231 if (IsTailCall && !IsSibCall) {
8232 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
8233
8234 // Since callee will pop argument stack as a tail call, we must keep the
8235 // popped size 16-byte aligned.
8236 NumBytes = alignTo(NumBytes, 16);
8237
8238 // FPDiff will be negative if this tail call requires more space than we
8239 // would automatically have in our incoming argument space. Positive if we
8240 // can actually shrink the stack.
8241 FPDiff = NumReusableBytes - NumBytes;
8242
8243 // Update the required reserved area if this is the tail call requiring the
8244 // most argument stack space.
8245 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
8246 FuncInfo->setTailCallReservedStack(-FPDiff);
8247
8248 // The stack pointer must be 16-byte aligned at all times it's used for a
8249 // memory operation, which in practice means at *all* times and in
8250 // particular across call boundaries. Therefore our own arguments started at
8251 // a 16-byte aligned SP and the delta applied for the tail call should
8252 // satisfy the same constraint.
8253 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
8254 }
8255
8256 // Determine whether we need any streaming mode changes.
8257 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
8258 if (CLI.CB)
8259 CalleeAttrs = SMEAttrs(*CLI.CB);
8260 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
8261 CalleeAttrs = SMEAttrs(ES->getSymbol());
8262
8263 auto DescribeCallsite =
8264 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
8265 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
8266 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
8267 R << ore::NV("Callee", ES->getSymbol());
8268 else if (CLI.CB && CLI.CB->getCalledFunction())
8269 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
8270 else
8271 R << "unknown callee";
8272 R << "'";
8273 return R;
8274 };
8275
8276 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
8277 if (RequiresLazySave) {
8278 const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8279 MachinePointerInfo MPI =
8280 MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex);
8281 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
8282 TPIDR2.FrameIndex,
8283 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8284 SDValue NumZaSaveSlicesAddr =
8285 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
8286 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
8287 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8288 DAG.getConstant(1, DL, MVT::i32));
8289 Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
8290 MPI, MVT::i16);
8291 Chain = DAG.getNode(
8292 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8293 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8294 TPIDR2ObjAddr);
8295 OptimizationRemarkEmitter ORE(&MF.getFunction());
8296 ORE.emit([&]() {
8297 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8298 CLI.CB)
8299 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8300 &MF.getFunction());
8301 return DescribeCallsite(R) << " sets up a lazy save for ZA";
8302 });
8303 }
8304
8305 SDValue PStateSM;
8306 bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
8307 if (RequiresSMChange) {
8308 if (CallerAttrs.hasStreamingInterfaceOrBody())
8309 PStateSM = DAG.getConstant(1, DL, MVT::i64);
8310 else if (CallerAttrs.hasNonStreamingInterface())
8311 PStateSM = DAG.getConstant(0, DL, MVT::i64);
8312 else
8313 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
8314 OptimizationRemarkEmitter ORE(&MF.getFunction());
8315 ORE.emit([&]() {
8316 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
8317 CLI.CB)
8318 : OptimizationRemarkAnalysis("sme", "SMETransition",
8319 &MF.getFunction());
8320 DescribeCallsite(R) << " requires a streaming mode transition";
8321 return R;
8322 });
8323 }
8324
8325 SDValue ZTFrameIdx;
8326 MachineFrameInfo &MFI = MF.getFrameInfo();
8327 bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs);
8328
8329 // If the caller has ZT0 state which will not be preserved by the callee,
8330 // spill ZT0 before the call.
8331 if (ShouldPreserveZT0) {
8332 unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
8333 ZTFrameIdx = DAG.getFrameIndex(
8334 ZTObj,
8335 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8336
8337 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
8338 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8339 }
8340
8341 // If caller shares ZT0 but the callee is not shared ZA, we need to stop
8342 // PSTATE.ZA before the call if there is no lazy-save active.
8343 bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs);
8344 assert((!DisableZA || !RequiresLazySave) &&
8345 "Lazy-save should have PSTATE.SM=1 on entry to the function");
8346
8347 if (DisableZA)
8348 Chain = DAG.getNode(
8349 AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
8350 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8351 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8352
8353 // Adjust the stack pointer for the new arguments...
8354 // These operations are automatically eliminated by the prolog/epilog pass
8355 if (!IsSibCall)
8356 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
8357
8358 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
8359 getPointerTy(DAG.getDataLayout()));
8360
8361 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
8362 SmallSet<unsigned, 8> RegsUsed;
8363 SmallVector<SDValue, 8> MemOpChains;
8364 auto PtrVT = getPointerTy(DAG.getDataLayout());
8365
8366 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
8367 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
8368 for (const auto &F : Forwards) {
8369 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
8370 RegsToPass.emplace_back(F.PReg, Val);
8371 }
8372 }
8373
8374 // Walk the register/memloc assignments, inserting copies/loads.
8375 unsigned ExtraArgLocs = 0;
8376 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
8377 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8378 SDValue Arg = OutVals[i];
8379 ISD::ArgFlagsTy Flags = Outs[i].Flags;
8380
8381 // Promote the value if needed.
8382 switch (VA.getLocInfo()) {
8383 default:
8384 llvm_unreachable("Unknown loc info!");
8385 case CCValAssign::Full:
8386 break;
8387 case CCValAssign::SExt:
8388 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
8389 break;
8390 case CCValAssign::ZExt:
8391 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8392 break;
8393 case CCValAssign::AExt:
8394 if (Outs[i].ArgVT == MVT::i1) {
8395 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
8396 //
8397 // Check if we actually have to do this, because the value may
8398 // already be zero-extended.
8399 //
8400 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
8401 // and rely on DAGCombiner to fold this, because the following
8402 // (anyext i32) is combined with (zext i8) in DAG.getNode:
8403 //
8404 // (ext (zext x)) -> (zext x)
8405 //
8406 // This will give us (zext i32), which we cannot remove, so
8407 // try to check this beforehand.
8408 if (!checkZExtBool(Arg, DAG)) {
8409 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8410 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
8411 }
8412 }
8413 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8414 break;
8415 case CCValAssign::AExtUpper:
8416 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8417 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8418 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8419 DAG.getConstant(32, DL, VA.getLocVT()));
8420 break;
8421 case CCValAssign::BCvt:
8422 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
8423 break;
8424 case CCValAssign::Trunc:
8425 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8426 break;
8427 case CCValAssign::FPExt:
8428 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
8429 break;
8430 case CCValAssign::Indirect: {
8431 bool isScalable = VA.getValVT().isScalableVT();
8432 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
8433 "Indirect arguments should be scalable on most subtargets");
8434
8435 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
8436 uint64_t PartSize = StoreSize;
8437 unsigned NumParts = 1;
8438 if (Outs[i].Flags.isInConsecutiveRegs()) {
8439 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8440 ++NumParts;
8441 StoreSize *= NumParts;
8442 }
8443
8444 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
8445 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
8446 MachineFrameInfo &MFI = MF.getFrameInfo();
8447 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
8448 if (isScalable)
8449 MFI.setStackID(FI, TargetStackID::ScalableVector);
8450
8451 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
8452 SDValue Ptr = DAG.getFrameIndex(
8453 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8454 SDValue SpillSlot = Ptr;
8455
8456 // Ensure we generate all stores for each tuple part, whilst updating the
8457 // pointer after each store correctly using vscale.
8458 while (NumParts) {
8459 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
8460 MemOpChains.push_back(Store);
8461
8462 NumParts--;
8463 if (NumParts > 0) {
8464 SDValue BytesIncrement;
8465 if (isScalable) {
8466 BytesIncrement = DAG.getVScale(
8467 DL, Ptr.getValueType(),
8468 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8469 } else {
8470 BytesIncrement = DAG.getConstant(
8471 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8472 Ptr.getValueType());
8473 }
8474 SDNodeFlags Flags;
8475 Flags.setNoUnsignedWrap(true);
8476
8477 MPI = MachinePointerInfo(MPI.getAddrSpace());
8478 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8479 BytesIncrement, Flags);
8480 ExtraArgLocs++;
8481 i++;
8482 }
8483 }
8484
8485 Arg = SpillSlot;
8486 break;
8487 }
8488
8489 if (VA.isRegLoc()) {
8490 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
8491 Outs[0].VT == MVT::i64) {
8492 assert(VA.getLocVT() == MVT::i64 &&
8493 "unexpected calling convention register assignment");
8494 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
8495 "unexpected use of 'returned'");
8496 IsThisReturn = true;
8497 }
8498 if (RegsUsed.count(VA.getLocReg())) {
8499 // If this register has already been used then we're trying to pack
8500 // parts of an [N x i32] into an X-register. The extension type will
8501 // take care of putting the two halves in the right place but we have to
8502 // combine them.
8503 SDValue &Bits =
8504 llvm::find_if(RegsToPass,
8505 [=](const std::pair<unsigned, SDValue> &Elt) {
8506 return Elt.first == VA.getLocReg();
8507 })
8508 ->second;
8509 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8510 // Call site info is used for function's parameter entry value
8511 // tracking. For now we track only simple cases when parameter
8512 // is transferred through whole register.
8513 llvm::erase_if(CSInfo.ArgRegPairs,
8514 [&VA](MachineFunction::ArgRegPair ArgReg) {
8515 return ArgReg.Reg == VA.getLocReg();
8516 });
8517 } else {
8518 // Add an extra level of indirection for streaming mode changes by
8519 // using a pseudo copy node that cannot be rematerialised between a
8520 // smstart/smstop and the call by the simple register coalescer.
8521 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
8522 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8523 Arg.getValueType(), Arg);
8524 RegsToPass.emplace_back(VA.getLocReg(), Arg);
8525 RegsUsed.insert(VA.getLocReg());
8526 const TargetOptions &Options = DAG.getTarget().Options;
8527 if (Options.EmitCallSiteInfo)
8528 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
8529 }
8530 } else {
8531 assert(VA.isMemLoc());
8532
8533 SDValue DstAddr;
8534 MachinePointerInfo DstInfo;
8535
8536 // FIXME: This works on big-endian for composite byvals, which are the
8537 // common case. It should also work for fundamental types too.
8538 uint32_t BEAlign = 0;
8539 unsigned OpSize;
8540 if (VA.getLocInfo() == CCValAssign::Indirect ||
8541 VA.getValVT().isScalableVector())
8542 OpSize = VA.getLocVT().getFixedSizeInBits();
8543 else
8544 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
8545 : VA.getValVT().getSizeInBits();
8546 OpSize = (OpSize + 7) / 8;
8547 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
8548 !Flags.isInConsecutiveRegs()) {
8549 if (OpSize < 8)
8550 BEAlign = 8 - OpSize;
8551 }
8552 unsigned LocMemOffset = VA.getLocMemOffset();
8553 int32_t Offset = LocMemOffset + BEAlign;
8554 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8555 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8556
8557 if (IsTailCall) {
8558 Offset = Offset + FPDiff;
8559 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
8560
8561 DstAddr = DAG.getFrameIndex(FI, PtrVT);
8562 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
8563
8564 // Make sure any stack arguments overlapping with where we're storing
8565 // are loaded before this eventual operation. Otherwise they'll be
8566 // clobbered.
8567 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
8568 } else {
8569 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8570
8571 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8572 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
8573 }
8574
8575 if (Outs[i].Flags.isByVal()) {
8576 SDValue SizeNode =
8577 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
8578 SDValue Cpy = DAG.getMemcpy(
8579 Chain, DL, DstAddr, Arg, SizeNode,
8580 Outs[i].Flags.getNonZeroByValAlign(),
8581 /*isVol = */ false, /*AlwaysInline = */ false,
8582 /*isTailCall = */ false, DstInfo, MachinePointerInfo());
8583
8584 MemOpChains.push_back(Cpy);
8585 } else {
8586 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
8587 // promoted to a legal register type i32, we should truncate Arg back to
8588 // i1/i8/i16.
8589 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
8590 VA.getValVT() == MVT::i16)
8591 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
8592
8593 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
8594 MemOpChains.push_back(Store);
8595 }
8596 }
8597 }
8598
8599 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
8600 SDValue ParamPtr = StackPtr;
8601 if (IsTailCall) {
8602 // Create a dummy object at the top of the stack that can be used to get
8603 // the SP after the epilogue
8604 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
8605 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
8606 }
8607
8608 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
8609 // describing the argument list. x4 contains the address of the
8610 // first stack parameter. x5 contains the size in bytes of all parameters
8611 // passed on the stack.
8612 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
8613 RegsToPass.emplace_back(AArch64::X5,
8614 DAG.getConstant(NumBytes, DL, MVT::i64));
8615 }
8616
8617 if (!MemOpChains.empty())
8618 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
8619
8620 SDValue InGlue;
8621 if (RequiresSMChange) {
8622
8623 Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL,
8624 DAG.getVTList(MVT::Other, MVT::Glue), Chain);
8625 InGlue = Chain.getValue(1);
8626
8627 SDValue NewChain = changeStreamingMode(
8628 DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
8629 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8630 Chain = NewChain.getValue(0);
8631 InGlue = NewChain.getValue(1);
8632 }
8633
8634 // Build a sequence of copy-to-reg nodes chained together with token chain
8635 // and flag operands which copy the outgoing args into the appropriate regs.
8636 for (auto &RegToPass : RegsToPass) {
8637 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
8638 RegToPass.second, InGlue);
8639 InGlue = Chain.getValue(1);
8640 }
8641
8642 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
8643 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
8644 // node so that legalize doesn't hack it.
8645 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8646 auto GV = G->getGlobal();
8647 unsigned OpFlags =
8648 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
8649 if (OpFlags & AArch64II::MO_GOT) {
8650 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8651 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8652 } else {
8653 const GlobalValue *GV = G->getGlobal();
8654 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8655 }
8656 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
8657 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
8658 Subtarget->isTargetMachO()) ||
8659 MF.getFunction().getParent()->getRtLibUseGOT();
8660 const char *Sym = S->getSymbol();
8661 if (UseGot) {
8662 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
8663 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8664 } else {
8665 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
8666 }
8667 }
8668
8669 // We don't usually want to end the call-sequence here because we would tidy
8670 // the frame up *after* the call, however in the ABI-changing tail-call case
8671 // we've carefully laid out the parameters so that when sp is reset they'll be
8672 // in the correct location.
8673 if (IsTailCall && !IsSibCall) {
8674 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
8675 InGlue = Chain.getValue(1);
8676 }
8677
8678 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
8679
8680 std::vector<SDValue> Ops;
8681 Ops.push_back(Chain);
8682 Ops.push_back(Callee);
8683
8684 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
8685 // be expanded to the call, directly followed by a special marker sequence and
8686 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
8687 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
8688 assert(!IsTailCall &&
8689 "tail calls cannot be marked with clang.arc.attachedcall");
8690 Opc = AArch64ISD::CALL_RVMARKER;
8691
8692 // Add a target global address for the retainRV/claimRV runtime function
8693 // just before the call target.
8694 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
8695 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
8696 Ops.insert(Ops.begin() + 1, GA);
8697 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8698 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
8699 } else if (GuardWithBTI) {
8700 Opc = AArch64ISD::CALL_BTI;
8701 }
8702
8703 if (IsTailCall) {
8704 // Each tail call may have to adjust the stack by a different amount, so
8705 // this information must travel along with the operation for eventual
8706 // consumption by emitEpilogue.
8707 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
8708 }
8709
8710 if (CLI.PAI) {
8711 const uint64_t Key = CLI.PAI->Key;
8712 assert((Key == AArch64PACKey::IA || Key == AArch64PACKey::IB) &&
8713 "Invalid auth call key");
8714
8715 // Split the discriminator into address/integer components.
8716 SDValue AddrDisc, IntDisc;
8717 std::tie(IntDisc, AddrDisc) =
8718 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
8719
8720 if (Opc == AArch64ISD::CALL_RVMARKER)
8721 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
8722 else
8723 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
8724 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
8725 Ops.push_back(IntDisc);
8726 Ops.push_back(AddrDisc);
8727 }
8728
8729 // Add argument registers to the end of the list so that they are known live
8730 // into the call.
8731 for (auto &RegToPass : RegsToPass)
8732 Ops.push_back(DAG.getRegister(RegToPass.first,
8733 RegToPass.second.getValueType()));
8734
8735 // Add a register mask operand representing the call-preserved registers.
8736 const uint32_t *Mask;
8737 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8738 if (IsThisReturn) {
8739 // For 'this' returns, use the X0-preserving mask if applicable
8740 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
8741 if (!Mask) {
8742 IsThisReturn = false;
8743 Mask = TRI->getCallPreservedMask(MF, CallConv);
8744 }
8745 } else
8746 Mask = TRI->getCallPreservedMask(MF, CallConv);
8747
8748 if (Subtarget->hasCustomCallingConv())
8749 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
8750
8751 if (TRI->isAnyArgRegReserved(MF))
8752 TRI->emitReservedArgRegCallError(MF);
8753
8754 assert(Mask && "Missing call preserved mask for calling convention");
8755 Ops.push_back(DAG.getRegisterMask(Mask));
8756
8757 if (InGlue.getNode())
8758 Ops.push_back(InGlue);
8759
8760 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8761
8762 // If we're doing a tail call, use a TC_RETURN here rather than an
8763 // actual call instruction.
8764 if (IsTailCall) {
8765 MF.getFrameInfo().setHasTailCall();
8766 SDValue Ret = DAG.getNode(Opc, DL, NodeTys, Ops);
8767 if (IsCFICall)
8768 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8769
8770 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
8771 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
8772 return Ret;
8773 }
8774
8775 // Returns a chain and a flag for retval copy to use.
8776 Chain = DAG.getNode(Opc, DL, NodeTys, Ops);
8777 if (IsCFICall)
8778 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8779
8780 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
8781 InGlue = Chain.getValue(1);
8782 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
8783
8784 uint64_t CalleePopBytes =
8785 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
8786
8787 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
8788 InGlue = Chain.getValue(1);
8789
8790 // Handle result values, copying them out of physregs into vregs that we
8791 // return.
8792 SDValue Result = LowerCallResult(
8793 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
8794 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
8795
8796 if (!Ins.empty())
8797 InGlue = Result.getValue(Result->getNumValues() - 1);
8798
8799 if (RequiresSMChange) {
8800 assert(PStateSM && "Expected a PStateSM to be set");
8801 Result = changeStreamingMode(
8802 DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
8803 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8804 InGlue = Result.getValue(1);
8805
8806 Result =
8807 DAG.getNode(AArch64ISD::VG_RESTORE, DL,
8808 DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue});
8809 }
8810
8811 if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
8812 // Unconditionally resume ZA.
8813 Result = DAG.getNode(
8814 AArch64ISD::SMSTART, DL, MVT::Other, Result,
8815 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8816 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8817
8818 if (ShouldPreserveZT0)
8819 Result =
8820 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8821 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8822
8823 if (RequiresLazySave) {
8824 // Conditionally restore the lazy save using a pseudo node.
8825 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8826 SDValue RegMask = DAG.getRegisterMask(
8827 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
8828 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8829 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
8830 SDValue TPIDR2_EL0 = DAG.getNode(
8831 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
8832 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8833
8834 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8835 // RESTORE_ZA pseudo.
8836 SDValue Glue;
8837 SDValue TPIDR2Block = DAG.getFrameIndex(
8838 TPIDR2.FrameIndex,
8839 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8840 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
8841 Result =
8842 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8843 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8844 RestoreRoutine, RegMask, Result.getValue(1)});
8845
8846 // Finally reset the TPIDR2_EL0 register to 0.
8847 Result = DAG.getNode(
8848 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
8849 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8850 DAG.getConstant(0, DL, MVT::i64));
8851 TPIDR2.Uses++;
8852 }
8853
8854 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
8855 for (unsigned I = 0; I < InVals.size(); ++I) {
8856 // The smstart/smstop is chained as part of the call, but when the
8857 // resulting chain is discarded (which happens when the call is not part
8858 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
8859 // smstart/smstop is chained to the result value. We can do that by doing
8860 // a vreg -> vreg copy.
8861 Register Reg = MF.getRegInfo().createVirtualRegister(
8862 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8863 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
8864 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
8865 InVals[I].getValueType());
8866 }
8867 }
8868
8869 if (CallConv == CallingConv::PreserveNone) {
8870 for (const ISD::OutputArg &O : Outs) {
8871 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
8872 O.Flags.isSwiftAsync()) {
8873 MachineFunction &MF = DAG.getMachineFunction();
8874 MF.getFunction().getContext().diagnose(DiagnosticInfoUnsupported(
8875 MF.getFunction(),
8876 "Swift attributes can't be used with preserve_none",
8877 DL.getDebugLoc()));
8878 break;
8879 }
8880 }
8881 }
8882
8883 return Result;
8884}
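For a non-sibling tail call, LowerCall aligns the outgoing argument area to 16 bytes and compares it with the caller's incoming argument area; a negative FPDiff means the tail call needs more stack than the caller received, and that deficit is recorded via setTailCallReservedStack. A small sketch of the arithmetic, using our own alignTo16 helper (illustrative only):

#include <cassert>

// Illustrative only: the FPDiff bookkeeping used for tail calls above.
unsigned alignTo16(unsigned Bytes) { return (Bytes + 15) & ~15u; }

int fpDiff(unsigned NumReusableBytes, unsigned OutgoingBytes) {
  return static_cast<int>(NumReusableBytes) -
         static_cast<int>(alignTo16(OutgoingBytes));
}

void fpDiffExample() {
  // The caller received 32 bytes of stack arguments; the tail call needs 40
  // (48 after alignment): FPDiff is -16 and 16 bytes must be reserved.
  assert(fpDiff(32, 40) == -16);
  // If the tail call needs less, FPDiff is positive and nothing is reserved.
  assert(fpDiff(32, 16) == 16);
}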
8885
8886bool AArch64TargetLowering::CanLowerReturn(
8887 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
8888 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
8889 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8890 SmallVector<CCValAssign, 16> RVLocs;
8891 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
8892 return CCInfo.CheckReturn(Outs, RetCC);
8893}
8894
8895SDValue
8896AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
8897 bool isVarArg,
8898 const SmallVectorImpl<ISD::OutputArg> &Outs,
8899 const SmallVectorImpl<SDValue> &OutVals,
8900 const SDLoc &DL, SelectionDAG &DAG) const {
8901 auto &MF = DAG.getMachineFunction();
8902 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8903
8904 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8905 SmallVector<CCValAssign, 16> RVLocs;
8906 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
8907 CCInfo.AnalyzeReturn(Outs, RetCC);
8908
8909 // Copy the result values into the output registers.
8910 SDValue Glue;
8911 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
8912 SmallSet<unsigned, 4> RegsUsed;
8913 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
8914 ++i, ++realRVLocIdx) {
8915 CCValAssign &VA = RVLocs[i];
8916 assert(VA.isRegLoc() && "Can only return in registers!");
8917 SDValue Arg = OutVals[realRVLocIdx];
8918
8919 switch (VA.getLocInfo()) {
8920 default:
8921 llvm_unreachable("Unknown loc info!");
8922 case CCValAssign::Full:
8923 if (Outs[i].ArgVT == MVT::i1) {
8924 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
8925 // value. This is strictly redundant on Darwin (which uses "zeroext
8926 // i1"), but will be optimised out before ISel.
8927 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8928 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8929 }
8930 break;
8931 case CCValAssign::BCvt:
8932 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
8933 break;
8934 case CCValAssign::AExt:
8935 case CCValAssign::ZExt:
8936 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8937 break;
8938 case CCValAssign::AExtUpper:
8939 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8940 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8941 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8942 DAG.getConstant(32, DL, VA.getLocVT()));
8943 break;
8944 }
8945
8946 if (RegsUsed.count(VA.getLocReg())) {
8947 SDValue &Bits =
8948 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
8949 return Elt.first == VA.getLocReg();
8950 })->second;
8951 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8952 } else {
8953 RetVals.emplace_back(VA.getLocReg(), Arg);
8954 RegsUsed.insert(VA.getLocReg());
8955 }
8956 }
8957
8958 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8959
8960 // Emit SMSTOP before returning from a locally streaming function
8961 SMEAttrs FuncAttrs(MF.getFunction());
8962 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
8963 if (FuncAttrs.hasStreamingCompatibleInterface()) {
8964 Register Reg = FuncInfo->getPStateSMReg();
8965 assert(Reg.isValid() && "PStateSM Register is invalid");
8966 SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
8967 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8968 /*Glue*/ SDValue(),
8969 AArch64SME::IfCallerIsStreaming, PStateSM);
8970 } else
8971 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8972 /*Glue*/ SDValue(), AArch64SME::Always);
8973 Glue = Chain.getValue(1);
8974 }
8975
8976 SmallVector<SDValue, 4> RetOps(1, Chain);
8977 for (auto &RetVal : RetVals) {
8978 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
8979 isPassedInFPR(RetVal.second.getValueType()))
8980 RetVal.second = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8981 RetVal.second.getValueType(), RetVal.second);
8982 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
8983 Glue = Chain.getValue(1);
8984 RetOps.push_back(
8985 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
8986 }
8987
8988 // Windows AArch64 ABIs require that for returning structs by value we copy
8989 // the sret argument into X0 for the return.
8990 // We saved the argument into a virtual register in the entry block,
8991 // so now we copy the value out and into X0.
8992 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
8993 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
8994 getPointerTy(MF.getDataLayout()));
8995
8996 unsigned RetValReg = AArch64::X0;
8997 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8998 RetValReg = AArch64::X8;
8999 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
9000 Glue = Chain.getValue(1);
9001
9002 RetOps.push_back(
9003 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
9004 }
9005
9006 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
9007 if (I) {
9008 for (; *I; ++I) {
9009 if (AArch64::GPR64RegClass.contains(*I))
9010 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
9011 else if (AArch64::FPR64RegClass.contains(*I))
9012 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
9013 else
9014 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
9015 }
9016 }
9017
9018 RetOps[0] = Chain; // Update chain.
9019
9020 // Add the glue if we have it.
9021 if (Glue.getNode())
9022 RetOps.push_back(Glue);
9023
9024 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9025 // ARM64EC entry thunks use a special return sequence: instead of a regular
9026 // "ret" instruction, they need to explicitly call the emulator.
9027 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9028 SDValue Arm64ECRetDest =
9029 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
9030 Arm64ECRetDest =
9031 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
9032 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
9033 MachinePointerInfo());
9034 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
9035 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
9036 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
9037 }
9038
9039 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
9040}
9041
9042//===----------------------------------------------------------------------===//
9043// Other Lowering Code
9044//===----------------------------------------------------------------------===//
9045
9046SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
9047 SelectionDAG &DAG,
9048 unsigned Flag) const {
9049 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
9050 N->getOffset(), Flag);
9051}
9052
9053SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
9054 SelectionDAG &DAG,
9055 unsigned Flag) const {
9056 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
9057}
9058
9059SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
9060 SelectionDAG &DAG,
9061 unsigned Flag) const {
9062 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
9063 N->getOffset(), Flag);
9064}
9065
9066SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
9067 SelectionDAG &DAG,
9068 unsigned Flag) const {
9069 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
9070}
9071
9072SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
9073 SelectionDAG &DAG,
9074 unsigned Flag) const {
9075 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
9076}
9077
9078// (loadGOT sym)
9079template <class NodeTy>
9080SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
9081 unsigned Flags) const {
9082 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
9083 SDLoc DL(N);
9084 EVT Ty = getPointerTy(DAG.getDataLayout());
9085 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
9086 // FIXME: Once remat is capable of dealing with instructions with register
9087 // operands, expand this into two nodes instead of using a wrapper node.
9088 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
9089}
9090
9091// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
9092template <class NodeTy>
9093SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
9094 unsigned Flags) const {
9095 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
9096 SDLoc DL(N);
9097 EVT Ty = getPointerTy(DAG.getDataLayout());
9098 const unsigned char MO_NC = AArch64II::MO_NC;
9099 return DAG.getNode(
9100 AArch64ISD::WrapperLarge, DL, Ty,
9101 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
9102 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
9103 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
9104 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
9105}
9106
9107// (addlow (adrp %hi(sym)) %lo(sym))
9108template <class NodeTy>
9109SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
9110 unsigned Flags) const {
9111 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
9112 SDLoc DL(N);
9113 EVT Ty = getPointerTy(DAG.getDataLayout());
9114 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
 9115   SDValue Lo = getTargetNode(N, Ty, DAG,
 9116                              AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
 9117   SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
9118 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
9119}
9120
9121// (adr sym)
9122template <class NodeTy>
9123SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
9124 unsigned Flags) const {
9125 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
9126 SDLoc DL(N);
9127 EVT Ty = getPointerTy(DAG.getDataLayout());
9128 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
9129 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
9130}
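// Roughly, the four helpers above correspond to the following materialization
// sequences for a symbol "sym" (a sketch; exact relocation spellings depend on
// the object format):
//   getGOT       -> adrp x0, :got:sym      ; ldr x0, [x0, :got_lo12:sym]
//   getAddrLarge -> movz x0, #:abs_g3:sym  ; movk x0, #:abs_g2_nc:sym
//                   movk x0, #:abs_g1_nc:sym ; movk x0, #:abs_g0_nc:sym
//   getAddr      -> adrp x0, sym           ; add x0, x0, :lo12:sym
//   getAddrTiny  -> adr x0, sym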
9131
9132SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
9133 SelectionDAG &DAG) const {
9134 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
9135 const GlobalValue *GV = GN->getGlobal();
9136 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
9137
9138 if (OpFlags != AArch64II::MO_NO_FLAG)
9139 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
9140 "unexpected offset in global node");
9141
9142 // This also catches the large code model case for Darwin, and tiny code
9143 // model with got relocations.
9144 if ((OpFlags & AArch64II::MO_GOT) != 0) {
9145 return getGOT(GN, DAG, OpFlags);
9146 }
9147
 9148   SDValue Result;
 9149   if (getTargetMachine().getCodeModel() == CodeModel::Large &&
 9150       !getTargetMachine().isPositionIndependent()) {
 9151     Result = getAddrLarge(GN, DAG, OpFlags);
9152 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
9153 Result = getAddrTiny(GN, DAG, OpFlags);
9154 } else {
9155 Result = getAddr(GN, DAG, OpFlags);
9156 }
9157 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9158 SDLoc DL(GN);
 9159   if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
 9160     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
 9161                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
9162 return Result;
9163}
9164
9165/// Convert a TLS address reference into the correct sequence of loads
9166/// and calls to compute the variable's address (for Darwin, currently) and
9167/// return an SDValue containing the final node.
9168
9169/// Darwin only has one TLS scheme which must be capable of dealing with the
9170/// fully general situation, in the worst case. This means:
9171/// + "extern __thread" declaration.
9172/// + Defined in a possibly unknown dynamic library.
9173///
9174/// The general system is that each __thread variable has a [3 x i64] descriptor
9175/// which contains information used by the runtime to calculate the address. The
9176/// only part of this the compiler needs to know about is the first xword, which
9177/// contains a function pointer that must be called with the address of the
9178/// entire descriptor in "x0".
9179///
9180/// Since this descriptor may be in a different unit, in general even the
9181/// descriptor must be accessed via an indirect load. The "ideal" code sequence
9182/// is:
9183/// adrp x0, _var@TLVPPAGE
9184/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
9185/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
9186/// ; the function pointer
9187/// blr x1 ; Uses descriptor address in x0
9188/// ; Address of _var is now in x0.
9189///
9190/// If the address of _var's descriptor *is* known to the linker, then it can
9191/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
9192/// a slight efficiency gain.
9193SDValue
9194AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
9195 SelectionDAG &DAG) const {
9196 assert(Subtarget->isTargetDarwin() &&
9197 "This function expects a Darwin target");
9198
9199 SDLoc DL(Op);
9200 MVT PtrVT = getPointerTy(DAG.getDataLayout());
9201 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
9202 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
9203
9204 SDValue TLVPAddr =
9205 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9206 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
9207
9208 // The first entry in the descriptor is a function pointer that we must call
9209 // to obtain the address of the variable.
9210 SDValue Chain = DAG.getEntryNode();
9211 SDValue FuncTLVGet = DAG.getLoad(
 9212       PtrMemVT, DL, Chain, DescAddr,
 9213       MachinePointerInfo::getGOT(DAG.getMachineFunction()),
 9214       Align(PtrMemVT.getSizeInBits() / 8),
 9215       MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
9216 Chain = FuncTLVGet.getValue(1);
9217
9218 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
9219 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
 9220 
 9221   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
 9222   MFI.setAdjustsStack(true);
9223
9224 // TLS calls preserve all registers except those that absolutely must be
9225 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
9226 // silly).
9227 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9228 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
9229 if (Subtarget->hasCustomCallingConv())
9230 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
9231
9232 // Finally, we can make the call. This is just a degenerate version of a
9233 // normal AArch64 call node: x0 takes the address of the descriptor, and
9234 // returns the address of the variable in this thread.
9235 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
9236 Chain =
9237 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
9238 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
9239 DAG.getRegisterMask(Mask), Chain.getValue(1));
9240 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
9241}
9242
9243/// Convert a thread-local variable reference into a sequence of instructions to
9244/// compute the variable's address for the local exec TLS model of ELF targets.
9245/// The sequence depends on the maximum TLS area size.
9246SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
9247 SDValue ThreadBase,
9248 const SDLoc &DL,
9249 SelectionDAG &DAG) const {
9250 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9251 SDValue TPOff, Addr;
9252
9253 switch (DAG.getTarget().Options.TLSSize) {
9254 default:
9255 llvm_unreachable("Unexpected TLS size");
9256
9257 case 12: {
9258 // mrs x0, TPIDR_EL0
9259 // add x0, x0, :tprel_lo12:a
 9260     SDValue Var = DAG.getTargetGlobalAddress(
 9261         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
9262 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
9263 Var,
9264 DAG.getTargetConstant(0, DL, MVT::i32)),
9265 0);
9266 }
9267
9268 case 24: {
9269 // mrs x0, TPIDR_EL0
9270 // add x0, x0, :tprel_hi12:a
9271 // add x0, x0, :tprel_lo12_nc:a
9272 SDValue HiVar = DAG.getTargetGlobalAddress(
9273 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9274 SDValue LoVar = DAG.getTargetGlobalAddress(
 9275         GV, DL, PtrVT, 0,
 9276         AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9277 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
9278 HiVar,
9279 DAG.getTargetConstant(0, DL, MVT::i32)),
9280 0);
9281 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
9282 LoVar,
9283 DAG.getTargetConstant(0, DL, MVT::i32)),
9284 0);
9285 }
9286
9287 case 32: {
9288 // mrs x1, TPIDR_EL0
9289 // movz x0, #:tprel_g1:a
9290 // movk x0, #:tprel_g0_nc:a
9291 // add x0, x1, x0
9292 SDValue HiVar = DAG.getTargetGlobalAddress(
9293 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
9294 SDValue LoVar = DAG.getTargetGlobalAddress(
 9295         GV, DL, PtrVT, 0,
 9296         AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
9297 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
9298 DAG.getTargetConstant(16, DL, MVT::i32)),
9299 0);
9300 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
9301 DAG.getTargetConstant(0, DL, MVT::i32)),
9302 0);
9303 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9304 }
9305
9306 case 48: {
9307 // mrs x1, TPIDR_EL0
9308 // movz x0, #:tprel_g2:a
9309 // movk x0, #:tprel_g1_nc:a
9310 // movk x0, #:tprel_g0_nc:a
9311 // add x0, x1, x0
9312 SDValue HiVar = DAG.getTargetGlobalAddress(
9313 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
9314 SDValue MiVar = DAG.getTargetGlobalAddress(
 9315         GV, DL, PtrVT, 0,
 9316         AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
9317 SDValue LoVar = DAG.getTargetGlobalAddress(
 9318         GV, DL, PtrVT, 0,
 9319         AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
9320 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
9321 DAG.getTargetConstant(32, DL, MVT::i32)),
9322 0);
9323 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
9324 DAG.getTargetConstant(16, DL, MVT::i32)),
9325 0);
9326 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
9327 DAG.getTargetConstant(0, DL, MVT::i32)),
9328 0);
9329 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9330 }
9331 }
9332}
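// Note: the TLSSize switched on above is normally set from the frontend's
// -mtls-size=<12|24|32|48> option (TargetOptions::TLSSize); 24 is the usual
// default for the small code model, which selects the two-ADD
// :tprel_hi12:/:tprel_lo12_nc: sequence shown in that case.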
9333
9334/// When accessing thread-local variables under either the general-dynamic or
9335/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
9336/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
9337/// is a function pointer to carry out the resolution.
9338///
9339/// The sequence is:
9340/// adrp x0, :tlsdesc:var
9341/// ldr x1, [x0, #:tlsdesc_lo12:var]
9342/// add x0, x0, #:tlsdesc_lo12:var
9343/// .tlsdesccall var
9344/// blr x1
9345/// (TPIDR_EL0 offset now in x0)
9346///
9347/// The above sequence must be produced unscheduled, to enable the linker to
9348/// optimize/relax this sequence.
9349/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
9350/// above sequence, and expanded really late in the compilation flow, to ensure
9351/// the sequence is produced as per above.
9352SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
9353 const SDLoc &DL,
9354 SelectionDAG &DAG) const {
9355 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9356
9357 SDValue Chain = DAG.getEntryNode();
9358 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
9359
9360 Chain =
9361 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
9362 SDValue Glue = Chain.getValue(1);
9363
9364 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
9365}
9366
9367SDValue
9368AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
9369 SelectionDAG &DAG) const {
9370 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
9371
9372 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9373
 9374   TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
 9375 
 9376   if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
 9377     if (Model == TLSModel::LocalDynamic)
 9378       Model = TLSModel::GeneralDynamic;
 9379   }
9380
 9381   if (getTargetMachine().getCodeModel() == CodeModel::Large &&
 9382       Model != TLSModel::LocalExec)
9383 report_fatal_error("ELF TLS only supported in small memory model or "
9384 "in local exec TLS model");
9385 // Different choices can be made for the maximum size of the TLS area for a
9386 // module. For the small address model, the default TLS size is 16MiB and the
9387 // maximum TLS size is 4GiB.
9388 // FIXME: add tiny and large code model support for TLS access models other
9389 // than local exec. We currently generate the same code as small for tiny,
9390 // which may be larger than needed.
9391
9392 SDValue TPOff;
9393 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9394 SDLoc DL(Op);
9395 const GlobalValue *GV = GA->getGlobal();
9396
9397 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
9398
9399 if (Model == TLSModel::LocalExec) {
9400 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
9401 } else if (Model == TLSModel::InitialExec) {
9402 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9403 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
9404 } else if (Model == TLSModel::LocalDynamic) {
9405 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
9406 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
9407 // the beginning of the module's TLS region, followed by a DTPREL offset
9408 // calculation.
9409
9410 // These accesses will need deduplicating if there's more than one.
 9411     AArch64FunctionInfo *MFI =
 9412         DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
 9413     MFI->incNumLocalDynamicTLSAccesses();
9414
9415 // The call needs a relocation too for linker relaxation. It doesn't make
9416 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9417 // the address.
 9418     SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
 9419                                                   AArch64II::MO_TLS);
9420
9421 // Now we can calculate the offset from TPIDR_EL0 to this module's
9422 // thread-local area.
9423 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9424
9425 // Now use :dtprel_whatever: operations to calculate this variable's offset
9426 // in its thread-storage area.
9427 SDValue HiVar = DAG.getTargetGlobalAddress(
9428 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9429 SDValue LoVar = DAG.getTargetGlobalAddress(
 9430         GV, DL, MVT::i64, 0,
 9431         AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9432
9433 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
9434 DAG.getTargetConstant(0, DL, MVT::i32)),
9435 0);
9436 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
9437 DAG.getTargetConstant(0, DL, MVT::i32)),
9438 0);
9439 } else if (Model == TLSModel::GeneralDynamic) {
9440 // The call needs a relocation too for linker relaxation. It doesn't make
9441 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9442 // the address.
9443 SDValue SymAddr =
9444 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9445
9446 // Finally we can make a call to calculate the offset from tpidr_el0.
9447 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9448 } else
9449 llvm_unreachable("Unsupported ELF TLS access model");
9450
9451 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9452}
9453
9454SDValue
9455AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
9456 SelectionDAG &DAG) const {
9457 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
9458
9459 SDValue Chain = DAG.getEntryNode();
9460 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9461 SDLoc DL(Op);
9462
9463 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
9464
9465 // Load the ThreadLocalStoragePointer from the TEB
9466 // A pointer to the TLS array is located at offset 0x58 from the TEB.
9467 SDValue TLSArray =
9468 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
9469 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
9470 Chain = TLSArray.getValue(1);
9471
9472 // Load the TLS index from the C runtime;
9473 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
9474 // This also does the same as LOADgot, but using a generic i32 load,
9475 // while LOADgot only loads i64.
9476 SDValue TLSIndexHi =
9477 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
9478 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
9479 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9480 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
9481 SDValue TLSIndex =
9482 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
9483 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
9484 Chain = TLSIndex.getValue(1);
9485
 9486   // The pointer to the thread's TLS data area is found at the TLS index,
 9487   // scaled by 8, within the TLS array.
9488 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
9489 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
9490 DAG.getConstant(3, DL, PtrVT));
9491 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
 9492                             DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
 9493                             MachinePointerInfo());
9494 Chain = TLS.getValue(1);
9495
9496 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9497 const GlobalValue *GV = GA->getGlobal();
9498 SDValue TGAHi = DAG.getTargetGlobalAddress(
9499 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9500 SDValue TGALo = DAG.getTargetGlobalAddress(
 9501       GV, DL, PtrVT, 0,
 9502       AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9503
9504 // Add the offset from the start of the .tls section (section base).
9505 SDValue Addr =
9506 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
9507 DAG.getTargetConstant(0, DL, MVT::i32)),
9508 0);
9509 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
9510 return Addr;
9511}
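// A sketch of the sequence this typically produces (registers are illustrative
// and the COFF relocation spellings are approximate):
//   ldr  x8, [x18, #0x58]            ; TLS array pointer from the TEB
//   adrp x9, _tls_index
//   ldr  w9, [x9, :lo12:_tls_index]  ; this module's TLS index
//   ldr  x8, [x8, x9, lsl #3]        ; base of this module's TLS block
//   add  x0, x8, :secrel_hi12:var
//   add  x0, x0, :secrel_lo12:var    ; offset of "var" within the .tls section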
9512
9513SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
9514 SelectionDAG &DAG) const {
9515 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9516 if (DAG.getTarget().useEmulatedTLS())
9517 return LowerToTLSEmulatedModel(GA, DAG);
9518
9519 if (Subtarget->isTargetDarwin())
9520 return LowerDarwinGlobalTLSAddress(Op, DAG);
9521 if (Subtarget->isTargetELF())
9522 return LowerELFGlobalTLSAddress(Op, DAG);
9523 if (Subtarget->isTargetWindows())
9524 return LowerWindowsGlobalTLSAddress(Op, DAG);
9525
9526 llvm_unreachable("Unexpected platform trying to use TLS");
9527}
9528
9529//===----------------------------------------------------------------------===//
9530// PtrAuthGlobalAddress lowering
9531//
9532// We have 3 lowering alternatives to choose from:
9533// - MOVaddrPAC: similar to MOVaddr, with added PAC.
9534// If the GV doesn't need a GOT load (i.e., is locally defined)
9535// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
9536//
9537// - LOADgotPAC: similar to LOADgot, with added PAC.
9538// If the GV needs a GOT load, materialize the pointer using the usual
9539// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
9540// section is assumed to be read-only (for example, via relro mechanism). See
9541// LowerMOVaddrPAC.
9542//
9543// - LOADauthptrstatic: similar to LOADgot, but use a
9544// special stub slot instead of a GOT slot.
9545// Load a signed pointer for symbol 'sym' from a stub slot named
9546// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
9547// resolving. This usually lowers to adrp+ldr, but also emits an entry into
9548// .data with an
9549// @AUTH relocation. See LowerLOADauthptrstatic.
9550//
 9551 // All 3 are pseudos that are expanded late to longer sequences: this lets us
9552// provide integrity guarantees on the to-be-signed intermediate values.
9553//
9554// LOADauthptrstatic is undesirable because it requires a large section filled
9555// with often similarly-signed pointers, making it a good harvesting target.
9556// Thus, it's only used for ptrauth references to extern_weak to avoid null
9557// checks.
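// As an illustration (a sketch, not part of the scheme above), an IR-level
// signed reference such as
//   @fptr = global ptr ptrauth (ptr @callee, i32 0, i64 1234)
// reaches this lowering with key IA (0) and constant discriminator 1234; if
// @callee does not need a GOT load it is materialized via MOVaddrPAC.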
9558
9559SDValue AArch64TargetLowering::LowerPtrAuthGlobalAddressStatically(
9560 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
9561 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) const {
9562 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
9563 assert(TGN->getGlobal()->hasExternalWeakLinkage());
9564
9565 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
9566 // offset alone as a pointer if the symbol wasn't available, which would
9567 // probably break null checks in users. Ptrauth complicates things further:
9568 // error out.
 9569   if (TGN->getOffset() != 0)
 9570     report_fatal_error(
9571 "unsupported non-zero offset in weak ptrauth global reference");
9572
9573 if (!isNullConstant(AddrDiscriminator))
9574 report_fatal_error("unsupported weak addr-div ptrauth global");
9575
9576 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
9577 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
9578 {TGA, Key, Discriminator}),
9579 0);
9580}
9581
9582SDValue
9583AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
9584 SelectionDAG &DAG) const {
9585 SDValue Ptr = Op.getOperand(0);
9586 uint64_t KeyC = Op.getConstantOperandVal(1);
9587 SDValue AddrDiscriminator = Op.getOperand(2);
9588 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
9589 EVT VT = Op.getValueType();
9590 SDLoc DL(Op);
9591
9592 if (KeyC > AArch64PACKey::LAST)
9593 report_fatal_error("key in ptrauth global out of range [0, " +
9594 Twine((int)AArch64PACKey::LAST) + "]");
9595
9596 // Blend only works if the integer discriminator is 16-bit wide.
 9597   if (!isUInt<16>(DiscriminatorC))
 9598     report_fatal_error(
9599 "constant discriminator in ptrauth global out of range [0, 0xffff]");
9600
9601 // Choosing between 3 lowering alternatives is target-specific.
9602 if (!Subtarget->isTargetELF())
9603 report_fatal_error("ptrauth global lowering is only implemented for ELF");
9604
9605 int64_t PtrOffsetC = 0;
9606 if (Ptr.getOpcode() == ISD::ADD) {
9607 PtrOffsetC = Ptr.getConstantOperandVal(1);
9608 Ptr = Ptr.getOperand(0);
9609 }
9610 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
9611 const GlobalValue *PtrGV = PtrN->getGlobal();
9612
9613 // Classify the reference to determine whether it needs a GOT load.
9614 const unsigned OpFlags =
9615 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
9616 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
9617 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
9618 "unsupported non-GOT op flags on ptrauth global reference");
9619
9620 // Fold any offset into the GV; our pseudos expect it there.
9621 PtrOffsetC += PtrN->getOffset();
9622 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
9623 /*TargetFlags=*/0);
9624 assert(PtrN->getTargetFlags() == 0 &&
9625 "unsupported target flags on ptrauth global");
9626
9627 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
9628 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
9629 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
9630 ? AddrDiscriminator
9631 : DAG.getRegister(AArch64::XZR, MVT::i64);
9632
9633 // No GOT load needed -> MOVaddrPAC
9634 if (!NeedsGOTLoad) {
9635 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
9636 return SDValue(
9637 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
9638 {TPtr, Key, TAddrDiscriminator, Discriminator}),
9639 0);
9640 }
9641
9642 // GOT load -> LOADgotPAC
9643 // Note that we disallow extern_weak refs to avoid null checks later.
9644 if (!PtrGV->hasExternalWeakLinkage())
9645 return SDValue(
9646 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
9647 {TPtr, Key, TAddrDiscriminator, Discriminator}),
9648 0);
9649
9650 // extern_weak ref -> LOADauthptrstatic
9651 return LowerPtrAuthGlobalAddressStatically(
9652 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
9653 DAG);
9654}
9655
9656// Looks through \param Val to determine the bit that can be used to
9657// check the sign of the value. It returns the unextended value and
9658// the sign bit position.
9659std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
9660 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
9661 return {Val.getOperand(0),
9662 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
9663 1};
9664
9665 if (Val.getOpcode() == ISD::SIGN_EXTEND)
9666 return {Val.getOperand(0),
9667 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
9668
9669 return {Val, Val.getValueSizeInBits() - 1};
9670}
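// For example, given (sign_extend_inreg x, i8) this returns {x, 7}, so the
// callers below can test the sign with a TBZ/TBNZ on bit 7 of the unextended
// value.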
9671
9672SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
9673 SDValue Chain = Op.getOperand(0);
9674 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
9675 SDValue LHS = Op.getOperand(2);
9676 SDValue RHS = Op.getOperand(3);
9677 SDValue Dest = Op.getOperand(4);
9678 SDLoc dl(Op);
 9679 
 9680   MachineFunction &MF = DAG.getMachineFunction();
9681 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
9682 // will not be produced, as they are conditional branch instructions that do
9683 // not set flags.
9684 bool ProduceNonFlagSettingCondBr =
9685 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
9686
9687 // Handle f128 first, since lowering it will result in comparing the return
9688 // value of a libcall against zero, which is just what the rest of LowerBR_CC
9689 // is expecting to deal with.
9690 if (LHS.getValueType() == MVT::f128) {
9691 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9692
9693 // If softenSetCCOperands returned a scalar, we need to compare the result
9694 // against zero to select between true and false values.
9695 if (!RHS.getNode()) {
9696 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9697 CC = ISD::SETNE;
9698 }
9699 }
9700
9701 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
9702 // instruction.
9703 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
9704 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9705 // Only lower legal XALUO ops.
9706 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
9707 return SDValue();
9708
9709 // The actual operation with overflow check.
 9710     AArch64CC::CondCode OFCC;
 9711     SDValue Value, Overflow;
9712 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
9713
9714 if (CC == ISD::SETNE)
9715 OFCC = getInvertedCondCode(OFCC);
9716 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
9717
9718 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9719 Overflow);
9720 }
9721
9722 if (LHS.getValueType().isInteger()) {
9723 assert((LHS.getValueType() == RHS.getValueType()) &&
9724 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9725
9726 // If the RHS of the comparison is zero, we can potentially fold this
9727 // to a specialized branch.
9728 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9729 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
9730 if (CC == ISD::SETEQ) {
9731 // See if we can use a TBZ to fold in an AND as well.
9732 // TBZ has a smaller branch displacement than CBZ. If the offset is
9733 // out of bounds, a late MI-layer pass rewrites branches.
9734 // 403.gcc is an example that hits this case.
9735 if (LHS.getOpcode() == ISD::AND &&
9736 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9737 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9738 SDValue Test = LHS.getOperand(0);
9739 uint64_t Mask = LHS.getConstantOperandVal(1);
9740 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
9741 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9742 Dest);
9743 }
9744
9745 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
9746 } else if (CC == ISD::SETNE) {
9747 // See if we can use a TBZ to fold in an AND as well.
9748 // TBZ has a smaller branch displacement than CBZ. If the offset is
9749 // out of bounds, a late MI-layer pass rewrites branches.
9750 // 403.gcc is an example that hits this case.
9751 if (LHS.getOpcode() == ISD::AND &&
9752 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9753 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9754 SDValue Test = LHS.getOperand(0);
9755 uint64_t Mask = LHS.getConstantOperandVal(1);
9756 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
9757 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9758 Dest);
9759 }
9760
9761 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
9762 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
9763 // Don't combine AND since emitComparison converts the AND to an ANDS
9764 // (a.k.a. TST) and the test in the test bit and branch instruction
9765 // becomes redundant. This would also increase register pressure.
9766 uint64_t SignBitPos;
9767 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9768 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
9769 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9770 }
9771 }
9772 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
9773 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
9774 // Don't combine AND since emitComparison converts the AND to an ANDS
9775 // (a.k.a. TST) and the test in the test bit and branch instruction
9776 // becomes redundant. This would also increase register pressure.
9777 uint64_t SignBitPos;
9778 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9779 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
9780 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9781 }
9782
9783 SDValue CCVal;
9784 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
9785 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9786 Cmp);
9787 }
9788
9789 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
9790 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9791
9792 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9793 // clean. Some of them require two branches to implement.
9794 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9795 AArch64CC::CondCode CC1, CC2;
9796 changeFPCCToAArch64CC(CC, CC1, CC2);
9797 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9798 SDValue BR1 =
9799 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
9800 if (CC2 != AArch64CC::AL) {
9801 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9802 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
9803 Cmp);
9804 }
9805
9806 return BR1;
9807}
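// For example, an equality branch such as "br_cc eq, (and x, 8), 0, dest" is
// folded above into "tbz x, #3, dest", since the AND mask 8 is a power of two
// (bit 3).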
9808
9809SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
9810 SelectionDAG &DAG) const {
9811 if (!Subtarget->isNeonAvailable() &&
9812 !Subtarget->useSVEForFixedLengthVectors())
9813 return SDValue();
9814
9815 EVT VT = Op.getValueType();
9816 EVT IntVT = VT.changeTypeToInteger();
9817 SDLoc DL(Op);
9818
9819 SDValue In1 = Op.getOperand(0);
9820 SDValue In2 = Op.getOperand(1);
9821 EVT SrcVT = In2.getValueType();
9822
9823 if (!SrcVT.bitsEq(VT))
9824 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
9825
9826 if (VT.isScalableVector())
 9827     IntVT =
 9828         getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
9829
9830 if (VT.isFixedLengthVector() &&
9831 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
9832 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
9833
9834 In1 = convertToScalableVector(DAG, ContainerVT, In1);
9835 In2 = convertToScalableVector(DAG, ContainerVT, In2);
9836
9837 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
9838 return convertFromScalableVector(DAG, VT, Res);
9839 }
9840
9841 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
9842 if (VT.isScalableVector())
9843 return getSVESafeBitCast(VT, Op, DAG);
9844
9845 return DAG.getBitcast(VT, Op);
9846 };
9847
9848 SDValue VecVal1, VecVal2;
9849 EVT VecVT;
9850 auto SetVecVal = [&](int Idx = -1) {
9851 if (!VT.isVector()) {
9852 VecVal1 =
9853 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
9854 VecVal2 =
9855 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
9856 } else {
9857 VecVal1 = BitCast(VecVT, In1, DAG);
9858 VecVal2 = BitCast(VecVT, In2, DAG);
9859 }
9860 };
9861 if (VT.isVector()) {
9862 VecVT = IntVT;
9863 SetVecVal();
9864 } else if (VT == MVT::f64) {
9865 VecVT = MVT::v2i64;
9866 SetVecVal(AArch64::dsub);
9867 } else if (VT == MVT::f32) {
9868 VecVT = MVT::v4i32;
9869 SetVecVal(AArch64::ssub);
9870 } else if (VT == MVT::f16 || VT == MVT::bf16) {
9871 VecVT = MVT::v8i16;
9872 SetVecVal(AArch64::hsub);
9873 } else {
9874 llvm_unreachable("Invalid type for copysign!");
9875 }
9876
9877 unsigned BitWidth = In1.getScalarValueSizeInBits();
9878 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
9879
9880 // We want to materialize a mask with every bit but the high bit set, but the
9881 // AdvSIMD immediate moves cannot materialize that in a single instruction for
9882 // 64-bit elements. Instead, materialize all bits set and then negate that.
9883 if (VT == MVT::f64 || VT == MVT::v2f64) {
9884 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
9885 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
9886 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
9887 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
9888 }
9889
9890 SDValue BSP =
9891 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
9892 if (VT == MVT::f16 || VT == MVT::bf16)
9893 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
9894 if (VT == MVT::f32)
9895 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
9896 if (VT == MVT::f64)
9897 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
9898
9899 return BitCast(VT, BSP, DAG);
9900}
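// For f64/v2f64, for example, the sequence above typically materializes an
// all-ones vector and FNEGs it, which flips only the sign bit and leaves
// 0x7FFFFFFFFFFFFFFF per lane; the BSP node then becomes a single BSL/BIT/BIF
// that merges the magnitude of In1 with the sign of In2 (a sketch; the exact
// instruction chosen depends on register allocation).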
9901
9902SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
9903 SelectionDAG &DAG) const {
 9904   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
 9905           Attribute::NoImplicitFloat))
9906 return SDValue();
9907
9908 EVT VT = Op.getValueType();
 9909   if (VT.isScalableVector() ||
 9910       useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
9911 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
9912
9913 if (!Subtarget->isNeonAvailable())
9914 return SDValue();
9915
9916 bool IsParity = Op.getOpcode() == ISD::PARITY;
9917 SDValue Val = Op.getOperand(0);
9918 SDLoc DL(Op);
9919
 9920   // For i32, a general parity computation using EORs is more efficient than
 9921   // using the floating-point/AdvSIMD path.
9922 if (VT == MVT::i32 && IsParity)
9923 return SDValue();
9924
9925 // If there is no CNT instruction available, GPR popcount can
9926 // be more efficiently lowered to the following sequence that uses
9927 // AdvSIMD registers/instructions as long as the copies to/from
9928 // the AdvSIMD registers are cheap.
9929 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
9930 // CNT V0.8B, V0.8B // 8xbyte pop-counts
9931 // ADDV B0, V0.8B // sum 8xbyte pop-counts
9932 // UMOV X0, V0.B[0] // copy byte result back to integer reg
9933 if (VT == MVT::i32 || VT == MVT::i64) {
9934 if (VT == MVT::i32)
9935 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
9936 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
9937
9938 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
9939 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9940 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9941 DAG.getConstant(0, DL, MVT::i64));
9942
9943 if (IsParity)
9944 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9945 DAG.getConstant(1, DL, MVT::i32));
9946
9947 if (VT == MVT::i64)
9948 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
9949 return UaddLV;
9950 } else if (VT == MVT::i128) {
9951 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
9952
9953 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
9954 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9955 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9956 DAG.getConstant(0, DL, MVT::i64));
9957
9958 if (IsParity)
9959 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9960 DAG.getConstant(1, DL, MVT::i32));
9961
9962 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
9963 }
9964
9965 assert(!IsParity && "ISD::PARITY of vector types not supported");
9966
9967 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
9968 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
9969 "Unexpected type for custom ctpop lowering");
9970
9971 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
9972 Val = DAG.getBitcast(VT8Bit, Val);
9973 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
9974
9975 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
9976 unsigned EltSize = 8;
9977 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
9978 while (EltSize != VT.getScalarSizeInBits()) {
9979 EltSize *= 2;
9980 NumElts /= 2;
9981 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
9982 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
9983 }
9984
9985 return Val;
9986}
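// For example, a v4i32 ctpop becomes (sketch): CNT on v16i8, then UADDLP to
// v8i16, then UADDLP again to v4i32, matching the widening loop above.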
9987
9988SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
9989 EVT VT = Op.getValueType();
 9990   assert(VT.isScalableVector() ||
 9991          useSVEForFixedLengthVectorVT(
9992 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
9993
9994 SDLoc DL(Op);
9995 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
9996 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
9997}
9998
9999SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
10000 SelectionDAG &DAG) const {
10001
10002 EVT VT = Op.getValueType();
10003 SDLoc DL(Op);
10004   unsigned Opcode = Op.getOpcode();
10005   ISD::CondCode CC;
10006 switch (Opcode) {
10007 default:
10008 llvm_unreachable("Wrong instruction");
10009 case ISD::SMAX:
10010 CC = ISD::SETGT;
10011 break;
10012 case ISD::SMIN:
10013 CC = ISD::SETLT;
10014 break;
10015 case ISD::UMAX:
10016 CC = ISD::SETUGT;
10017 break;
10018 case ISD::UMIN:
10019 CC = ISD::SETULT;
10020 break;
10021 }
10022
10023   if (VT.isScalableVector() ||
10024       useSVEForFixedLengthVectorVT(
10025 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
10026 switch (Opcode) {
10027 default:
10028 llvm_unreachable("Wrong instruction");
10029 case ISD::SMAX:
10030 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
10031 case ISD::SMIN:
10032 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
10033 case ISD::UMAX:
10034 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
10035 case ISD::UMIN:
10036 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
10037 }
10038 }
10039
10040 SDValue Op0 = Op.getOperand(0);
10041 SDValue Op1 = Op.getOperand(1);
10042 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
10043 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
10044}
10045
10046SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
10047 SelectionDAG &DAG) const {
10048 EVT VT = Op.getValueType();
10049
10050   if (VT.isScalableVector() ||
10051       useSVEForFixedLengthVectorVT(
10052 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
10053 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
10054
10055 SDLoc DL(Op);
10056 SDValue REVB;
10057 MVT VST;
10058
10059 switch (VT.getSimpleVT().SimpleTy) {
10060 default:
10061 llvm_unreachable("Invalid type for bitreverse!");
10062
10063 case MVT::v2i32: {
10064 VST = MVT::v8i8;
10065 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
10066
10067 break;
10068 }
10069
10070 case MVT::v4i32: {
10071 VST = MVT::v16i8;
10072 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
10073
10074 break;
10075 }
10076
10077 case MVT::v1i64: {
10078 VST = MVT::v8i8;
10079 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
10080
10081 break;
10082 }
10083
10084 case MVT::v2i64: {
10085 VST = MVT::v16i8;
10086 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
10087
10088 break;
10089 }
10090 }
10091
10092 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
10093 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
10094}
10095
10096 // Check whether N is part of a continuous (OR-of-XORs) comparison sequence.
10097static bool
10098isOrXorChain(SDValue N, unsigned &Num,
10099 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
10100 if (Num == MaxXors)
10101 return false;
10102
10103 // Skip the one-use zext
10104 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
10105 N = N->getOperand(0);
10106
10107 // The leaf node must be XOR
10108 if (N->getOpcode() == ISD::XOR) {
10109 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
10110 Num++;
10111 return true;
10112 }
10113
10114 // All the non-leaf nodes must be OR.
10115 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
10116 return false;
10117
10118 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
10119 isOrXorChain(N->getOperand(1), Num, WorkList))
10120 return true;
10121 return false;
10122}
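// For example, for "(or (xor a0, b0), (zext (xor a1, b1))) == 0", as produced
// by memcmp expansion, the WorkList ends up holding {a0, b0} and {a1, b1}.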
10123
10124 // Transform chains of ORs and XORs, as typically produced by memcmp/bcmp.
10125 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
10126   SDValue LHS = N->getOperand(0);
10127 SDValue RHS = N->getOperand(1);
10128 SDLoc DL(N);
10129   EVT VT = N->getValueType(0);
10130   SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
10131
10132 // Only handle integer compares.
10133 if (N->getOpcode() != ISD::SETCC)
10134 return SDValue();
10135
10136 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
10137 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
10138 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
10139 unsigned NumXors = 0;
10140 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
10141 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
10142 isOrXorChain(LHS, NumXors, WorkList)) {
10143 SDValue XOR0, XOR1;
10144 std::tie(XOR0, XOR1) = WorkList[0];
10145 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
10146 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
10147 for (unsigned I = 1; I < WorkList.size(); I++) {
10148 std::tie(XOR0, XOR1) = WorkList[I];
10149 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
10150 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
10151 }
10152
10153     // Return the combined comparison chain directly.
10154 return Cmp;
10155 }
10156
10157 return SDValue();
10158}
10159
10160SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
10161
10162 if (Op.getValueType().isVector())
10163 return LowerVSETCC(Op, DAG);
10164
10165 bool IsStrict = Op->isStrictFPOpcode();
10166 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10167 unsigned OpNo = IsStrict ? 1 : 0;
10168 SDValue Chain;
10169 if (IsStrict)
10170 Chain = Op.getOperand(0);
10171 SDValue LHS = Op.getOperand(OpNo + 0);
10172 SDValue RHS = Op.getOperand(OpNo + 1);
10173 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
10174 SDLoc dl(Op);
10175
10176 // We chose ZeroOrOneBooleanContents, so use zero and one.
10177 EVT VT = Op.getValueType();
10178 SDValue TVal = DAG.getConstant(1, dl, VT);
10179 SDValue FVal = DAG.getConstant(0, dl, VT);
10180
10181 // Handle f128 first, since one possible outcome is a normal integer
10182 // comparison which gets picked up by the next if statement.
10183 if (LHS.getValueType() == MVT::f128) {
10184 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
10185 IsSignaling);
10186
10187 // If softenSetCCOperands returned a scalar, use it.
10188 if (!RHS.getNode()) {
10189 assert(LHS.getValueType() == Op.getValueType() &&
10190 "Unexpected setcc expansion!");
10191 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
10192 }
10193 }
10194
10195 if (LHS.getValueType().isInteger()) {
10196     SDValue CCVal;
10197     SDValue Cmp = getAArch64Cmp(
10198 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
10199
10200 // Note that we inverted the condition above, so we reverse the order of
10201 // the true and false operands here. This will allow the setcc to be
10202 // matched to a single CSINC instruction.
10203 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
10204 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
10205 }
10206
10207 // Now we know we're dealing with FP values.
10208 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
10209 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
10210
10211 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
10212 // and do the comparison.
10213 SDValue Cmp;
10214 if (IsStrict)
10215 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
10216 else
10217 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
10218
10219 AArch64CC::CondCode CC1, CC2;
10220 changeFPCCToAArch64CC(CC, CC1, CC2);
10221 SDValue Res;
10222 if (CC2 == AArch64CC::AL) {
10223 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
10224 CC2);
10225 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10226
10227 // Note that we inverted the condition above, so we reverse the order of
10228 // the true and false operands here. This will allow the setcc to be
10229 // matched to a single CSINC instruction.
10230 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
10231 } else {
10232 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
10233 // totally clean. Some of them require two CSELs to implement. As is in
10234 // this case, we emit the first CSEL and then emit a second using the output
10235 // of the first as the RHS. We're effectively OR'ing the two CC's together.
10236
10237 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
10238 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10239 SDValue CS1 =
10240 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
10241
10242 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
10243 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
10244 }
10245 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
10246}
10247
10248SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
10249 SelectionDAG &DAG) const {
10250
10251 SDValue LHS = Op.getOperand(0);
10252 SDValue RHS = Op.getOperand(1);
10253 EVT VT = LHS.getValueType();
10254 if (VT != MVT::i32 && VT != MVT::i64)
10255 return SDValue();
10256
10257 SDLoc DL(Op);
10258 SDValue Carry = Op.getOperand(2);
10259 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
10260 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
10261 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
10262 LHS, RHS, InvCarry);
10263
10264 EVT OpVT = Op.getValueType();
10265 SDValue TVal = DAG.getConstant(1, DL, OpVT);
10266 SDValue FVal = DAG.getConstant(0, DL, OpVT);
10267
10268   ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10269   ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
10270 SDValue CCVal =
10271 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
10272 // Inputs are swapped because the condition is inverted. This will allow
10273 // matching with a single CSINC instruction.
10274 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
10275 Cmp.getValue(1));
10276}
10277
10278SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
10279 SDValue RHS, SDValue TVal,
10280 SDValue FVal, const SDLoc &dl,
10281 SelectionDAG &DAG) const {
10282 // Handle f128 first, because it will result in a comparison of some RTLIB
10283 // call result against zero.
10284 if (LHS.getValueType() == MVT::f128) {
10285 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
10286
10287 // If softenSetCCOperands returned a scalar, we need to compare the result
10288 // against zero to select between true and false values.
10289 if (!RHS.getNode()) {
10290 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10291 CC = ISD::SETNE;
10292 }
10293 }
10294
10295 // Also handle f16, for which we need to do a f32 comparison.
10296 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
10297 LHS.getValueType() == MVT::bf16) {
10298 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
10299 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
10300 }
10301
10302 // Next, handle integers.
10303 if (LHS.getValueType().isInteger()) {
10304 assert((LHS.getValueType() == RHS.getValueType()) &&
10305 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
10306
10307 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
10308 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
10309 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
10310 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
10311     // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
10312 // supported types.
10313 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
10314 CTVal->isOne() && CFVal->isAllOnes() &&
10315 LHS.getValueType() == TVal.getValueType()) {
10316 EVT VT = LHS.getValueType();
10317 SDValue Shift =
10318 DAG.getNode(ISD::SRA, dl, VT, LHS,
10319 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
10320 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
10321 }
10322
10323 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
10324 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
10325 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
10326     // Both require fewer instructions than compare and conditional select.
10327 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
10328 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
10329 LHS.getValueType() == RHS.getValueType()) {
10330 EVT VT = LHS.getValueType();
10331 SDValue Shift =
10332 DAG.getNode(ISD::SRA, dl, VT, LHS,
10333 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
10334
10335 if (CC == ISD::SETGT)
10336 Shift = DAG.getNOT(dl, Shift, VT);
10337
10338 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
10339 }
10340
10341 unsigned Opcode = AArch64ISD::CSEL;
10342
10343 // If both the TVal and the FVal are constants, see if we can swap them in
10344     // order to form a CSINV or CSINC out of them.
10345 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
10346 std::swap(TVal, FVal);
10347 std::swap(CTVal, CFVal);
10348 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
10349 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
10350 std::swap(TVal, FVal);
10351 std::swap(CTVal, CFVal);
10352 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
10353 } else if (TVal.getOpcode() == ISD::XOR) {
10354 // If TVal is a NOT we want to swap TVal and FVal so that we can match
10355 // with a CSINV rather than a CSEL.
10356 if (isAllOnesConstant(TVal.getOperand(1))) {
10357 std::swap(TVal, FVal);
10358 std::swap(CTVal, CFVal);
10359 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
10360 }
10361 } else if (TVal.getOpcode() == ISD::SUB) {
10362 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
10363 // that we can match with a CSNEG rather than a CSEL.
10364 if (isNullConstant(TVal.getOperand(0))) {
10365 std::swap(TVal, FVal);
10366 std::swap(CTVal, CFVal);
10367 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
10368 }
10369 } else if (CTVal && CFVal) {
10370 const int64_t TrueVal = CTVal->getSExtValue();
10371 const int64_t FalseVal = CFVal->getSExtValue();
10372 bool Swap = false;
10373
10374 // If both TVal and FVal are constants, see if FVal is the
10375 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
10376 // instead of a CSEL in that case.
10377 if (TrueVal == ~FalseVal) {
10378 Opcode = AArch64ISD::CSINV;
10379 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
10380 TrueVal == -FalseVal) {
10381 Opcode = AArch64ISD::CSNEG;
10382 } else if (TVal.getValueType() == MVT::i32) {
10383 // If our operands are only 32-bit wide, make sure we use 32-bit
10384 // arithmetic for the check whether we can use CSINC. This ensures that
10385 // the addition in the check will wrap around properly in case there is
10386 // an overflow (which would not be the case if we do the check with
10387 // 64-bit arithmetic).
10388 const uint32_t TrueVal32 = CTVal->getZExtValue();
10389 const uint32_t FalseVal32 = CFVal->getZExtValue();
10390
10391 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
10392 Opcode = AArch64ISD::CSINC;
10393
10394 if (TrueVal32 > FalseVal32) {
10395 Swap = true;
10396 }
10397 }
10398 } else {
10399 // 64-bit check whether we can use CSINC.
10400 const uint64_t TrueVal64 = TrueVal;
10401 const uint64_t FalseVal64 = FalseVal;
10402
10403 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
10404 Opcode = AArch64ISD::CSINC;
10405
10406 if (TrueVal > FalseVal) {
10407 Swap = true;
10408 }
10409 }
10410 }
10411
10412 // Swap TVal and FVal if necessary.
10413 if (Swap) {
10414 std::swap(TVal, FVal);
10415 std::swap(CTVal, CFVal);
10416 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
10417 }
10418
10419 if (Opcode != AArch64ISD::CSEL) {
10420 // Drop FVal since we can get its value by simply inverting/negating
10421 // TVal.
10422 FVal = TVal;
10423 }
10424 }
10425
10426 // Avoid materializing a constant when possible by reusing a known value in
10427 // a register. However, don't perform this optimization if the known value
10428 // is one, zero or negative one in the case of a CSEL. We can always
10429 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
10430 // FVal, respectively.
10431 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
10432 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
10433         !RHSVal->isZero() && !RHSVal->isAllOnes()) {
10434       AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
10435 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
10436 // "a != C ? x : a" to avoid materializing C.
10437 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
10438 TVal = LHS;
10439 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
10440 FVal = LHS;
10441 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
10442 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
10443 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
10444       // avoid materializing C.
10445       AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
10446 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
10447 Opcode = AArch64ISD::CSINV;
10448 TVal = LHS;
10449 FVal = DAG.getConstant(0, dl, FVal.getValueType());
10450 }
10451 }
10452
10453 SDValue CCVal;
10454 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
10455 EVT VT = TVal.getValueType();
10456 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
10457 }
10458
10459 // Now we know we're dealing with FP values.
10460 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
10461 LHS.getValueType() == MVT::f64);
10462 assert(LHS.getValueType() == RHS.getValueType());
10463 EVT VT = TVal.getValueType();
10464 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
10465
10466 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10467 // clean. Some of them require two CSELs to implement.
10468 AArch64CC::CondCode CC1, CC2;
10469 changeFPCCToAArch64CC(CC, CC1, CC2);
10470
10471 if (DAG.getTarget().Options.UnsafeFPMath) {
10472 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
10473 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
10474 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
10475 if (RHSVal && RHSVal->isZero()) {
10476 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
10477 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
10478
10479 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
10480 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
10481 TVal = LHS;
10482 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
10483 CFVal && CFVal->isZero() &&
10484 FVal.getValueType() == LHS.getValueType())
10485 FVal = LHS;
10486 }
10487 }
10488
10489 // Emit first, and possibly only, CSEL.
10490 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10491 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
10492
10493 // If we need a second CSEL, emit it, using the output of the first as the
10494 // RHS. We're effectively OR'ing the two CC's together.
10495 if (CC2 != AArch64CC::AL) {
10496 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
10497 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
10498 }
10499
10500 // Otherwise, return the output of the first CSEL.
10501 return CS1;
10502}
10503
10504SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
10505 SelectionDAG &DAG) const {
10506 EVT Ty = Op.getValueType();
10507 auto Idx = Op.getConstantOperandAPInt(2);
10508 int64_t IdxVal = Idx.getSExtValue();
10509 assert(Ty.isScalableVector() &&
10510 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
10511
10512 // We can use the splice instruction for certain index values where we are
10513 // able to efficiently generate the correct predicate. The index will be
10514 // inverted and used directly as the input to the ptrue instruction, i.e.
10515 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
10516 // splice predicate. However, we can only do this if we can guarantee that
10517 // there are enough elements in the vector, hence we check the index <= min
10518 // number of elements.
10519 std::optional<unsigned> PredPattern;
10520 if (Ty.isScalableVector() && IdxVal < 0 &&
10521 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
10522 std::nullopt) {
10523 SDLoc DL(Op);
10524
10525 // Create a predicate where all but the last -IdxVal elements are false.
10526 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
10527 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
10528 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
10529
10530 // Now splice the two inputs together using the predicate.
10531 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
10532 Op.getOperand(1));
10533 }
10534
10535 // We can select to an EXT instruction when indexing the first 256 bytes.
10536   unsigned BlockSize = AArch64::SVEBitsPerBlock / Ty.getVectorMinNumElements();
10537   if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
10538 return Op;
10539
10540 return SDValue();
10541}
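// For example (a sketch), splicing two <vscale x 4 x i32> inputs with index -2
// becomes roughly:
//   ptrue  p0.s, vl2
//   rev    p0.s, p0.s
//   splice z0.s, p0, z0.s, z1.s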
10542
10543SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
10544 SelectionDAG &DAG) const {
10545 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
10546 SDValue LHS = Op.getOperand(0);
10547 SDValue RHS = Op.getOperand(1);
10548 SDValue TVal = Op.getOperand(2);
10549 SDValue FVal = Op.getOperand(3);
10550 SDLoc DL(Op);
10551 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10552}
10553
10554SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
10555 SelectionDAG &DAG) const {
10556 SDValue CCVal = Op->getOperand(0);
10557 SDValue TVal = Op->getOperand(1);
10558 SDValue FVal = Op->getOperand(2);
10559 SDLoc DL(Op);
10560
10561 EVT Ty = Op.getValueType();
10562 if (Ty == MVT::aarch64svcount) {
10563 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
10564 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
10565 SDValue Sel =
10566 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
10567 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
10568 }
10569
10570 if (Ty.isScalableVector()) {
10571 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
10572 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
10573 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10574 }
10575
10576 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
10577 // FIXME: Ideally this would be the same as above using i1 types, however
10578 // for the moment we can't deal with fixed i1 vector types properly, so
10579 // instead extend the predicate to a result type sized integer vector.
10580 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
10581 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
10582 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
10583 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
10584 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10585 }
10586
10587 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
10588 // instruction.
10589 if (ISD::isOverflowIntrOpRes(CCVal)) {
10590 // Only lower legal XALUO ops.
10591 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
10592 return SDValue();
10593
10594 AArch64CC::CondCode OFCC;
10595 SDValue Value, Overflow;
10596 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
10597 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
10598
10599 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
10600 CCVal, Overflow);
10601 }
10602
10603 // Lower it the same way as we would lower a SELECT_CC node.
10604 ISD::CondCode CC;
10605 SDValue LHS, RHS;
10606 if (CCVal.getOpcode() == ISD::SETCC) {
10607 LHS = CCVal.getOperand(0);
10608 RHS = CCVal.getOperand(1);
10609 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
10610 } else {
10611 LHS = CCVal;
10612 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
10613 CC = ISD::SETNE;
10614 }
10615
10616 // If we are lowering an f16 or bf16 and we do not have full fp16 support,
10617 // convert to an f32 in order to use FCSELSrrr.
10618 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10619 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10620 DAG.getUNDEF(MVT::f32), TVal);
10621 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10622 DAG.getUNDEF(MVT::f32), FVal);
10623 }
10624
10625 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10626
10627 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10628 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
10629 }
10630
10631 return Res;
10632}
10633
10634SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
10635 SelectionDAG &DAG) const {
10636 // Jump table entries as PC relative offsets. No additional tweaking
10637 // is necessary here. Just get the address of the jump table.
10638 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
10639
10640 CodeModel::Model CM = getTargetMachine().getCodeModel();
10641 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
10642 !Subtarget->isTargetMachO())
10643 return getAddrLarge(JT, DAG);
10644 if (CM == CodeModel::Tiny)
10645 return getAddrTiny(JT, DAG);
10646 return getAddr(JT, DAG);
10647}
10648
10649SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
10650 SelectionDAG &DAG) const {
10651 // Jump table entries as PC relative offsets. No additional tweaking
10652 // is necessary here. Just get the address of the jump table.
10653 SDLoc DL(Op);
10654 SDValue JT = Op.getOperand(1);
10655 SDValue Entry = Op.getOperand(2);
10656 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
10657
10658 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10659 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
10660
10661 SDNode *Dest =
10662 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
10663 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
10664 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
10665 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
10666}
10667
10668SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
10669 SelectionDAG &DAG) const {
10670 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
10671 CodeModel::Model CM = getTargetMachine().getCodeModel();
10672 if (CM == CodeModel::Large) {
10673 // Use the GOT for the large code model on iOS.
10674 if (Subtarget->isTargetMachO()) {
10675 return getGOT(CP, DAG);
10676 }
10678 return getAddrLarge(CP, DAG);
10679 } else if (CM == CodeModel::Tiny) {
10680 return getAddrTiny(CP, DAG);
10681 }
10682 return getAddr(CP, DAG);
10683}
10684
10685SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
10686 SelectionDAG &DAG) const {
10687 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
10688 CodeModel::Model CM = getTargetMachine().getCodeModel();
10689 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
10691 return getAddrLarge(BA, DAG);
10692 } else if (CM == CodeModel::Tiny) {
10693 return getAddrTiny(BA, DAG);
10694 }
10695 return getAddr(BA, DAG);
10696}
10697
10698SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
10699 SelectionDAG &DAG) const {
10700 AArch64FunctionInfo *FuncInfo =
10701 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10702
10703 SDLoc DL(Op);
10704 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
10705 getPointerTy(DAG.getDataLayout()));
10706 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
10707 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10708 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10709 MachinePointerInfo(SV));
10710}
10711
10712SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
10713 SelectionDAG &DAG) const {
10714 MachineFunction &MF = DAG.getMachineFunction();
10715 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10716
10717 SDLoc DL(Op);
10718 SDValue FR;
10719 if (Subtarget->isWindowsArm64EC()) {
10720 // With the Arm64EC ABI, we compute the address of the varargs save area
10721 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
10722 // but calls from an entry thunk can pass in a different address.
10723 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
10724 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
10725 uint64_t StackOffset;
10726 if (FuncInfo->getVarArgsGPRSize() > 0)
10727 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
10728 else
10729 StackOffset = FuncInfo->getVarArgsStackOffset();
10730 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
10731 DAG.getConstant(StackOffset, DL, MVT::i64));
10732 } else {
10733 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
10734 ? FuncInfo->getVarArgsGPRIndex()
10735 : FuncInfo->getVarArgsStackIndex(),
10736 getPointerTy(DAG.getDataLayout()));
10737 }
10738 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10739 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10740 MachinePointerInfo(SV));
10741}
10742
10743SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
10744 SelectionDAG &DAG) const {
10745 // The layout of the va_list struct is specified in the AArch64 Procedure Call
10746 // Standard, section B.3.
10747 MachineFunction &MF = DAG.getMachineFunction();
10748 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10749 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10750 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10751 auto PtrVT = getPointerTy(DAG.getDataLayout());
10752 SDLoc DL(Op);
10753
10754 SDValue Chain = Op.getOperand(0);
10755 SDValue VAList = Op.getOperand(1);
10756 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10757 SmallVector<SDValue, 4> MemOps;
10758
10759 // void *__stack at offset 0
10760 unsigned Offset = 0;
10761 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
10762 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
10763 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
10764 MachinePointerInfo(SV), Align(PtrSize)));
10765
10766 // void *__gr_top at offset 8 (4 on ILP32)
10767 Offset += PtrSize;
10768 int GPRSize = FuncInfo->getVarArgsGPRSize();
10769 if (GPRSize > 0) {
10770 SDValue GRTop, GRTopAddr;
10771
10772 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10773 DAG.getConstant(Offset, DL, PtrVT));
10774
10775 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
10776 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
10777 DAG.getConstant(GPRSize, DL, PtrVT));
10778 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
10779
10780 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
10781 MachinePointerInfo(SV, Offset),
10782 Align(PtrSize)));
10783 }
10784
10785 // void *__vr_top at offset 16 (8 on ILP32)
10786 Offset += PtrSize;
10787 int FPRSize = FuncInfo->getVarArgsFPRSize();
10788 if (FPRSize > 0) {
10789 SDValue VRTop, VRTopAddr;
10790 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10791 DAG.getConstant(Offset, DL, PtrVT));
10792
10793 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
10794 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
10795 DAG.getConstant(FPRSize, DL, PtrVT));
10796 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
10797
10798 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
10799 MachinePointerInfo(SV, Offset),
10800 Align(PtrSize)));
10801 }
10802
10803 // int __gr_offs at offset 24 (12 on ILP32)
10804 Offset += PtrSize;
10805 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10806 DAG.getConstant(Offset, DL, PtrVT));
10807 MemOps.push_back(
10808 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
10809 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10810
10811 // int __vr_offs at offset 28 (16 on ILP32)
10812 Offset += 4;
10813 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10814 DAG.getConstant(Offset, DL, PtrVT));
10815 MemOps.push_back(
10816 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
10817 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10818
10819 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
10820}
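// An illustrative sketch of the record those stores populate (field names
// mirror the AAPCS64 __stack/__gr_top/__vr_top/__gr_offs/__vr_offs members;
// the struct and asserts below are ours and assume an LP64 host):
#include <cstddef>

struct AAPCS64VaList {
  void *Stack; // next stack argument slot              (offset 0)
  void *GrTop; // end of the GP register save area      (offset 8)
  void *VrTop; // end of the FP/SIMD register save area (offset 16)
  int GrOffs;  // negative offset from GrTop to the next GP arg (offset 24)
  int VrOffs;  // negative offset from VrTop to the next FP arg (offset 28)
};
static_assert(offsetof(AAPCS64VaList, GrOffs) == 24, "matches Offset above");
static_assert(sizeof(AAPCS64VaList) == 32, "matches VaListSize in LowerVACOPY");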
10821
10822SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
10823 SelectionDAG &DAG) const {
10824 MachineFunction &MF = DAG.getMachineFunction();
10825
10826 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
10827 return LowerWin64_VASTART(Op, DAG);
10828 else if (Subtarget->isTargetDarwin())
10829 return LowerDarwin_VASTART(Op, DAG);
10830 else
10831 return LowerAAPCS_VASTART(Op, DAG);
10832}
10833
10834SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
10835 SelectionDAG &DAG) const {
10836 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
10837 // pointer.
10838 SDLoc DL(Op);
10839 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10840 unsigned VaListSize =
10841 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
10842 ? PtrSize
10843 : Subtarget->isTargetILP32() ? 20 : 32;
10844 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
10845 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10846
10847 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
10848 DAG.getConstant(VaListSize, DL, MVT::i32),
10849 Align(PtrSize), false, false, false,
10850 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
10851}
10852
10853SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
10854 assert(Subtarget->isTargetDarwin() &&
10855 "automatic va_arg instruction only works on Darwin");
10856
10857 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10858 EVT VT = Op.getValueType();
10859 SDLoc DL(Op);
10860 SDValue Chain = Op.getOperand(0);
10861 SDValue Addr = Op.getOperand(1);
10862 MaybeAlign Align(Op.getConstantOperandVal(3));
10863 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
10864 auto PtrVT = getPointerTy(DAG.getDataLayout());
10865 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10866 SDValue VAList =
10867 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
10868 Chain = VAList.getValue(1);
10869 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
10870
10871 if (VT.isScalableVector())
10872 report_fatal_error("Passing SVE types to variadic functions is "
10873 "currently not supported");
10874
10875 if (Align && *Align > MinSlotSize) {
10876 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10877 DAG.getConstant(Align->value() - 1, DL, PtrVT));
10878 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
10879 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
10880 }
10881
10882 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
10883 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
10884
10885 // Scalar integer and FP values smaller than 64 bits are implicitly extended
10886 // up to 64 bits. At the very least, we have to increase the striding of the
10887 // vaargs list to match this, and for FP values we need to introduce
10888 // FP_ROUND nodes as well.
10889 if (VT.isInteger() && !VT.isVector())
10890 ArgSize = std::max(ArgSize, MinSlotSize);
10891 bool NeedFPTrunc = false;
10892 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
10893 ArgSize = 8;
10894 NeedFPTrunc = true;
10895 }
10896
10897 // Increment the pointer, VAList, to the next vaarg
10898 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10899 DAG.getConstant(ArgSize, DL, PtrVT));
10900 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
10901
10902 // Store the incremented VAList to the legalized pointer
10903 SDValue APStore =
10904 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
10905
10906 // Load the actual argument out of the pointer VAList
10907 if (NeedFPTrunc) {
10908 // Load the value as an f64.
10909 SDValue WideFP =
10910 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
10911 // Round the value down to an f32.
10912 SDValue NarrowFP =
10913 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
10914 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
10915 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
10916 // Merge the rounded value with the chain output of the load.
10917 return DAG.getMergeValues(Ops, DL);
10918 }
10919
10920 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
10921}
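// A minimal model of the VAList alignment adjustment above: the ADD/AND pair
// is the usual round-up-to-a-power-of-two idiom, (p + a - 1) & ~(a - 1). The
// helper below is illustrative, not from this file.
#include <cassert>
#include <cstdint>

static uint64_t roundUpToAlign(uint64_t Ptr, uint64_t Align) {
  assert(Align != 0 && (Align & (Align - 1)) == 0 && "power-of-two alignment");
  return (Ptr + Align - 1) & ~(Align - 1);
}

int main() {
  assert(roundUpToAlign(0x1001, 16) == 0x1010);
  assert(roundUpToAlign(0x1010, 16) == 0x1010);
  return 0;
}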
10922
10923SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
10924 SelectionDAG &DAG) const {
10925 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10926 MFI.setFrameAddressIsTaken(true);
10927
10928 EVT VT = Op.getValueType();
10929 SDLoc DL(Op);
10930 unsigned Depth = Op.getConstantOperandVal(0);
10931 SDValue FrameAddr =
10932 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
10933 while (Depth--)
10934 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
10935 MachinePointerInfo());
10936
10937 if (Subtarget->isTargetILP32())
10938 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
10939 DAG.getValueType(VT));
10940
10941 return FrameAddr;
10942}
10943
10944SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
10945 SelectionDAG &DAG) const {
10946 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10947
10948 EVT VT = getPointerTy(DAG.getDataLayout());
10949 SDLoc DL(Op);
10950 int FI = MFI.CreateFixedObject(4, 0, false);
10951 return DAG.getFrameIndex(FI, VT);
10952}
10953
10954#define GET_REGISTER_MATCHER
10955#include "AArch64GenAsmMatcher.inc"
10956
10957// FIXME? Maybe this could be a TableGen attribute on some registers and
10958// this table could be generated automatically from RegInfo.
10959Register AArch64TargetLowering::
10960getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
10961 Register Reg = MatchRegisterName(RegName);
10962 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
10963 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
10964 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
10965 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
10966 !MRI->isReservedReg(MF, Reg))
10967 Reg = 0;
10968 }
10969 if (Reg)
10970 return Reg;
10971 report_fatal_error(Twine("Invalid register name \""
10972 + StringRef(RegName) + "\"."));
10973}
10974
10975SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
10976 SelectionDAG &DAG) const {
10977 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
10978
10979 EVT VT = Op.getValueType();
10980 SDLoc DL(Op);
10981
10982 SDValue FrameAddr =
10983 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
10984 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10985
10986 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
10987}
10988
10989SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
10990 SelectionDAG &DAG) const {
10991 MachineFunction &MF = DAG.getMachineFunction();
10992 MachineFrameInfo &MFI = MF.getFrameInfo();
10993 MFI.setReturnAddressIsTaken(true);
10994
10995 EVT VT = Op.getValueType();
10996 SDLoc DL(Op);
10997 unsigned Depth = Op.getConstantOperandVal(0);
10998 SDValue ReturnAddress;
10999 if (Depth) {
11000 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
11001 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
11002 ReturnAddress = DAG.getLoad(
11003 VT, DL, DAG.getEntryNode(),
11004 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
11005 } else {
11006 // Return LR, which contains the return address. Mark it an implicit
11007 // live-in.
11008 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
11009 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
11010 }
11011
11012 // The XPACLRI instruction assembles to a hint-space instruction before
11013 // Armv8.3-A therefore this instruction can be safely used for any pre
11014 // Armv8.3-A architectures. On Armv8.3-A and onwards XPACI is available so use
11015 // that instead.
11016 SDNode *St;
11017 if (Subtarget->hasPAuth()) {
11018 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
11019 } else {
11020 // XPACLRI operates on LR therefore we must move the operand accordingly.
11021 SDValue Chain =
11022 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
11023 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
11024 }
11025 return SDValue(St, 0);
11026}
11027
11028/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
11029/// i64 values and take a 2 x i64 value to shift plus a shift amount.
11030SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
11031 SelectionDAG &DAG) const {
11032 SDValue Lo, Hi;
11033 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
11034 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
11035}
11036
11037bool AArch64TargetLowering::isOffsetFoldingLegal(
11038 const GlobalAddressSDNode *GA) const {
11039 // Offsets are folded in the DAG combine rather than here so that we can
11040 // intelligently choose an offset based on the uses.
11041 return false;
11042}
11043
11044bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
11045 bool OptForSize) const {
11046 bool IsLegal = false;
11047 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
11048 // 16-bit case when target has full fp16 support.
11049 // We encode bf16 bit patterns as if they were fp16. This results in very
11050 // strange looking assembly but should populate the register with appropriate
11051 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
11052 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
11053 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
11054 // FIXME: We should be able to handle f128 as well with a clever lowering.
11055 const APInt ImmInt = Imm.bitcastToAPInt();
11056 if (VT == MVT::f64)
11057 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
11058 else if (VT == MVT::f32)
11059 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
11060 else if (VT == MVT::f16 || VT == MVT::bf16)
11061 IsLegal =
11062 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
11063 Imm.isPosZero();
11064
11065 // If we cannot materialize the value in the immediate field of an fmov,
11066 // check if it can be encoded as the immediate operand of a logical instruction.
11067 // The immediate value will be created with either MOVZ, MOVN, or ORR.
11068 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
11069 // generate that fmov.
11070 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
11071 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
11072 // however the mov+fmov sequence is always better because of the reduced
11073 // cache pressure. The timings are still the same if you consider
11074 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
11075 // movw+movk is fused). So we limit up to 2 instructions at most.
11076 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
11077 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
11078 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
11079 IsLegal = Insn.size() <= Limit;
11080 }
11081
11082 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
11083 << " imm value: "; Imm.dump(););
11084 return IsLegal;
11085}
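// A small check of the bf16/fp16 bit-pattern claim in the comment above
// (decoder helpers are ours): 0x3FC0 read as bf16 is 1.5, and the same bits
// read as IEEE fp16 are 1.9375, so encoding bf16 constants as if they were
// fp16 still puts the intended bits in the register.
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static float bf16ToFloat(uint16_t Bits) {
  uint32_t W = uint32_t(Bits) << 16; // bf16 is the top half of a float
  float F;
  std::memcpy(&F, &W, sizeof(F));
  return F;
}

static float fp16ToFloat(uint16_t Bits) { // normal numbers only
  float Sign = (Bits & 0x8000) ? -1.0f : 1.0f;
  int Exp = (Bits >> 10) & 0x1F;
  int Mant = Bits & 0x3FF;
  return Sign * std::ldexp(1.0f + Mant / 1024.0f, Exp - 15);
}

int main() {
  assert(bf16ToFloat(0x3FC0) == 1.5f);
  assert(fp16ToFloat(0x3FC0) == 1.9375f);
  return 0;
}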
11086
11087//===----------------------------------------------------------------------===//
11088// AArch64 Optimization Hooks
11089//===----------------------------------------------------------------------===//
11090
11091static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
11092 SDValue Operand, SelectionDAG &DAG,
11093 int &ExtraSteps) {
11094 EVT VT = Operand.getValueType();
11095 if ((ST->hasNEON() &&
11096 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
11097 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
11098 VT == MVT::v4f32)) ||
11099 (ST->hasSVE() &&
11100 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
11101 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
11102 // For the reciprocal estimates, convergence is quadratic, so the number
11103 // of digits is doubled after each iteration. In ARMv8, the accuracy of
11104 // the initial estimate is 2^-8. Thus the number of extra steps to refine
11105 // the result for float (23 mantissa bits) is 2 and for double (52
11106 // mantissa bits) is 3.
11107 constexpr unsigned AccurateBits = 8;
11108 unsigned DesiredBits =
11109 APFloat::semanticsPrecision(DAG.EVTToAPFloatSemantics(VT));
11110 ExtraSteps = DesiredBits <= AccurateBits
11111 ? 0
11112 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
11113 }
11114
11115 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
11116 }
11117
11118 return SDValue();
11119}
11120
11121SDValue
11122AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
11123 const DenormalMode &Mode) const {
11124 SDLoc DL(Op);
11125 EVT VT = Op.getValueType();
11126 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
11127 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
11128 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
11129}
11130
11131SDValue
11132AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
11133 SelectionDAG &DAG) const {
11134 return Op;
11135}
11136
11137SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
11138 SelectionDAG &DAG, int Enabled,
11139 int &ExtraSteps,
11140 bool &UseOneConst,
11141 bool Reciprocal) const {
11142 if (Enabled == ReciprocalEstimate::Enabled ||
11143 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
11144 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
11145 DAG, ExtraSteps)) {
11146 SDLoc DL(Operand);
11147 EVT VT = Operand.getValueType();
11148
11149 SDNodeFlags Flags;
11150 Flags.setAllowReassociation(true);
11151
11152 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
11153 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
11154 for (int i = ExtraSteps; i > 0; --i) {
11155 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
11156 Flags);
11157 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
11158 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
11159 }
11160 if (!Reciprocal)
11161 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
11162
11163 ExtraSteps = 0;
11164 return Estimate;
11165 }
11166
11167 return SDValue();
11168}
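// A scalar model of the FRSQRTE/FRSQRTS refinement loop above (seed and step
// count are illustrative): each step computes E * 0.5 * (3 - X * E * E), and
// with a roughly 8-bit-accurate seed, 2 steps suffice for float and 3 for
// double, which is what the ExtraSteps computation in getEstimate produces.
#include <cassert>
#include <cmath>

static double rsqrtStep(double X, double E) {
  return E * 0.5 * (3.0 - X * E * E); // FRSQRTS supplies 0.5 * (3 - m * n)
}

int main() {
  double X = 2.0;
  double E = 0.70; // crude seed, about 8 correct bits
  for (int I = 0; I < 3; ++I)
    E = rsqrtStep(X, E);
  assert(std::fabs(E - 1.0 / std::sqrt(X)) < 1e-12);
  return 0;
}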
11169
11170SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
11171 SelectionDAG &DAG, int Enabled,
11172 int &ExtraSteps) const {
11173 if (Enabled == ReciprocalEstimate::Enabled)
11174 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
11175 DAG, ExtraSteps)) {
11176 SDLoc DL(Operand);
11177 EVT VT = Operand.getValueType();
11178
11179 SDNodeFlags Flags;
11180 Flags.setAllowReassociation(true);
11181
11182 // Newton reciprocal iteration: E * (2 - X * E)
11183 // AArch64 reciprocal iteration instruction: (2 - M * N)
11184 for (int i = ExtraSteps; i > 0; --i) {
11185 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
11186 Estimate, Flags);
11187 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
11188 }
11189
11190 ExtraSteps = 0;
11191 return Estimate;
11192 }
11193
11194 return SDValue();
11195}
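// The matching scalar model for the FRECPE/FRECPS loop above: each step
// computes E * (2 - X * E), squaring the relative error, so three steps from
// an 8-bit seed reach double precision (values below are illustrative).
#include <cassert>
#include <cmath>

static double recipStep(double X, double E) {
  return E * (2.0 - X * E); // FRECPS supplies (2 - m * n)
}

int main() {
  double X = 3.0;
  double E = 0.33; // seed with roughly 8 correct bits
  for (int I = 0; I < 3; ++I)
    E = recipStep(X, E);
  assert(std::fabs(E - 1.0 / X) < 1e-12);
  return 0;
}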
11196
11197//===----------------------------------------------------------------------===//
11198// AArch64 Inline Assembly Support
11199//===----------------------------------------------------------------------===//
11200
11201// Table of Constraints
11202// TODO: This is the current set of constraints supported by ARM for the
11203// compiler, not all of them may make sense.
11204//
11205// r - A general register
11206// w - An FP/SIMD register of some size in the range v0-v31
11207// x - An FP/SIMD register of some size in the range v0-v15
11208// I - Constant that can be used with an ADD instruction
11209// J - Constant that can be used with a SUB instruction
11210// K - Constant that can be used with a 32-bit logical instruction
11211// L - Constant that can be used with a 64-bit logical instruction
11212// M - Constant that can be used as a 32-bit MOV immediate
11213// N - Constant that can be used as a 64-bit MOV immediate
11214// Q - A memory reference with base register and no offset
11215// S - A symbolic address
11216// Y - Floating point constant zero
11217// Z - Integer constant zero
11218//
11219// Note that general register operands will be output using their 64-bit x
11220// register name, whatever the size of the variable, unless the asm operand
11221// is prefixed by the %w modifier. Floating-point and SIMD register operands
11222// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
11223// %q modifier.
11224const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
11225 // At this point, we have to lower this constraint to something else, so we
11226 // lower it to an "r" or "w". However, by doing this we will force the result
11227 // to be in register, while the X constraint is much more permissive.
11228 //
11229 // Although we are correct (we are free to emit anything, without
11230 // constraints), we might break use cases that would expect us to be more
11231 // efficient and emit something else.
11232 if (!Subtarget->hasFPARMv8())
11233 return "r";
11234
11235 if (ConstraintVT.isFloatingPoint())
11236 return "w";
11237
11238 if (ConstraintVT.isVector() &&
11239 (ConstraintVT.getSizeInBits() == 64 ||
11240 ConstraintVT.getSizeInBits() == 128))
11241 return "w";
11242
11243 return "r";
11244}
11245
11246enum class PredicateConstraint { Uph, Upl, Upa };
11247
11248static std::optional<PredicateConstraint>
11251 .Case("Uph", PredicateConstraint::Uph)
11252 .Case("Upl", PredicateConstraint::Upl)
11253 .Case("Upa", PredicateConstraint::Upa)
11254 .Default(std::nullopt);
11255}
11256
11257static const TargetRegisterClass *
11258getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
11259 if (VT != MVT::aarch64svcount &&
11260 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
11261 return nullptr;
11262
11263 switch (Constraint) {
11264 case PredicateConstraint::Uph:
11265 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
11266 : &AArch64::PPR_p8to15RegClass;
11267 case PredicateConstraint::Upl:
11268 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
11269 : &AArch64::PPR_3bRegClass;
11270 case PredicateConstraint::Upa:
11271 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
11272 : &AArch64::PPRRegClass;
11273 }
11274
11275 llvm_unreachable("Missing PredicateConstraint!");
11276}
11277
11278enum class ReducedGprConstraint { Uci, Ucj };
11279
11280static std::optional<ReducedGprConstraint>
11283 .Case("Uci", ReducedGprConstraint::Uci)
11284 .Case("Ucj", ReducedGprConstraint::Ucj)
11285 .Default(std::nullopt);
11286}
11287
11288static const TargetRegisterClass *
11289getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
11290 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
11291 return nullptr;
11292
11293 switch (Constraint) {
11294 case ReducedGprConstraint::Uci:
11295 return &AArch64::MatrixIndexGPR32_8_11RegClass;
11296 case ReducedGprConstraint::Ucj:
11297 return &AArch64::MatrixIndexGPR32_12_15RegClass;
11298 }
11299
11300 llvm_unreachable("Missing ReducedGprConstraint!");
11301}
11302
11303// The set of cc code supported is from
11304// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
11307 .Case("{@cchi}", AArch64CC::HI)
11308 .Case("{@cccs}", AArch64CC::HS)
11309 .Case("{@cclo}", AArch64CC::LO)
11310 .Case("{@ccls}", AArch64CC::LS)
11311 .Case("{@cccc}", AArch64CC::LO)
11312 .Case("{@cceq}", AArch64CC::EQ)
11313 .Case("{@ccgt}", AArch64CC::GT)
11314 .Case("{@ccge}", AArch64CC::GE)
11315 .Case("{@cclt}", AArch64CC::LT)
11316 .Case("{@ccle}", AArch64CC::LE)
11317 .Case("{@cchs}", AArch64CC::HS)
11318 .Case("{@ccne}", AArch64CC::NE)
11319 .Case("{@ccvc}", AArch64CC::VC)
11320 .Case("{@ccpl}", AArch64CC::PL)
11321 .Case("{@ccvs}", AArch64CC::VS)
11322 .Case("{@ccmi}", AArch64CC::MI)
11323 .Default(AArch64CC::Invalid);
11324 return Cond;
11325}
11326
11327/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
11328/// WZR, invert(<cond>)'.
11329static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
11330 SelectionDAG &DAG) {
11331 return DAG.getNode(
11332 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
11333 DAG.getConstant(0, DL, MVT::i32),
11334 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
11335}
11336
11337// Lower @cc flag output via getSETCC.
11338SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
11339 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
11340 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
11341 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
11342 if (Cond == AArch64CC::Invalid)
11343 return SDValue();
11344 // The output variable should be a scalar integer.
11345 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
11346 OpInfo.ConstraintVT.getSizeInBits() < 8)
11347 report_fatal_error("Flag output operand is of invalid type");
11348
11349 // Get NZCV register. Only update chain when copyfrom is glued.
11350 if (Glue.getNode()) {
11351 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
11352 Chain = Glue.getValue(1);
11353 } else
11354 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
11355 // Extract CC code.
11356 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
11357
11358 SDValue Result;
11359
11360 // Truncate or ZERO_EXTEND based on value types.
11361 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
11362 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
11363 else
11364 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
11365
11366 return Result;
11367}
11368
11369/// getConstraintType - Given a constraint letter, return the type of
11370/// constraint it is for this target.
11371AArch64TargetLowering::ConstraintType
11372AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
11373 if (Constraint.size() == 1) {
11374 switch (Constraint[0]) {
11375 default:
11376 break;
11377 case 'x':
11378 case 'w':
11379 case 'y':
11380 return C_RegisterClass;
11381 // An address with a single base register. Due to the way we
11382 // currently handle addresses it is the same as 'r'.
11383 case 'Q':
11384 return C_Memory;
11385 case 'I':
11386 case 'J':
11387 case 'K':
11388 case 'L':
11389 case 'M':
11390 case 'N':
11391 case 'Y':
11392 case 'Z':
11393 return C_Immediate;
11394 case 'z':
11395 case 'S': // A symbol or label reference with a constant offset
11396 return C_Other;
11397 }
11398 } else if (parsePredicateConstraint(Constraint))
11399 return C_RegisterClass;
11400 else if (parseReducedGprConstraint(Constraint))
11401 return C_RegisterClass;
11402 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
11403 return C_Other;
11404 return TargetLowering::getConstraintType(Constraint);
11405}
11406
11407/// Examine constraint type and operand type and determine a weight value.
11408/// This object must already have been set up with the operand type
11409/// and the current alternative constraint selected.
11410TargetLowering::ConstraintWeight
11411AArch64TargetLowering::getSingleConstraintMatchWeight(
11412 AsmOperandInfo &info, const char *constraint) const {
11413 ConstraintWeight weight = CW_Invalid;
11414 Value *CallOperandVal = info.CallOperandVal;
11415 // If we don't have a value, we can't do a match,
11416 // but allow it at the lowest weight.
11417 if (!CallOperandVal)
11418 return CW_Default;
11419 Type *type = CallOperandVal->getType();
11420 // Look at the constraint type.
11421 switch (*constraint) {
11422 default:
11423 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
11424 break;
11425 case 'x':
11426 case 'w':
11427 case 'y':
11428 if (type->isFloatingPointTy() || type->isVectorTy())
11429 weight = CW_Register;
11430 break;
11431 case 'z':
11432 weight = CW_Constant;
11433 break;
11434 case 'U':
11435 if (parsePredicateConstraint(constraint) ||
11436 parseReducedGprConstraint(constraint))
11437 weight = CW_Register;
11438 break;
11439 }
11440 return weight;
11441}
11442
11443std::pair<unsigned, const TargetRegisterClass *>
11444AArch64TargetLowering::getRegForInlineAsmConstraint(
11445 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
11446 if (Constraint.size() == 1) {
11447 switch (Constraint[0]) {
11448 case 'r':
11449 if (VT.isScalableVector())
11450 return std::make_pair(0U, nullptr);
11451 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
11452 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
11453 if (VT.getFixedSizeInBits() == 64)
11454 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
11455 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
11456 case 'w': {
11457 if (!Subtarget->hasFPARMv8())
11458 break;
11459 if (VT.isScalableVector()) {
11460 if (VT.getVectorElementType() != MVT::i1)
11461 return std::make_pair(0U, &AArch64::ZPRRegClass);
11462 return std::make_pair(0U, nullptr);
11463 }
11464 uint64_t VTSize = VT.getFixedSizeInBits();
11465 if (VTSize == 16)
11466 return std::make_pair(0U, &AArch64::FPR16RegClass);
11467 if (VTSize == 32)
11468 return std::make_pair(0U, &AArch64::FPR32RegClass);
11469 if (VTSize == 64)
11470 return std::make_pair(0U, &AArch64::FPR64RegClass);
11471 if (VTSize == 128)
11472 return std::make_pair(0U, &AArch64::FPR128RegClass);
11473 break;
11474 }
11475 // The instructions that this constraint is designed for can
11476 // only take 128-bit registers so just use that regclass.
11477 case 'x':
11478 if (!Subtarget->hasFPARMv8())
11479 break;
11480 if (VT.isScalableVector())
11481 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
11482 if (VT.getSizeInBits() == 128)
11483 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
11484 break;
11485 case 'y':
11486 if (!Subtarget->hasFPARMv8())
11487 break;
11488 if (VT.isScalableVector())
11489 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
11490 break;
11491 }
11492 } else {
11493 if (const auto PC = parsePredicateConstraint(Constraint))
11494 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
11495 return std::make_pair(0U, RegClass);
11496
11497 if (const auto RGC = parseReducedGprConstraint(Constraint))
11498 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
11499 return std::make_pair(0U, RegClass);
11500 }
11501 if (StringRef("{cc}").equals_insensitive(Constraint) ||
11502 parseConstraintCode(Constraint) != AArch64CC::Invalid)
11503 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
11504
11505 if (Constraint == "{za}") {
11506 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
11507 }
11508
11509 if (Constraint == "{zt0}") {
11510 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
11511 }
11512
11513 // Use the default implementation in TargetLowering to convert the register
11514 // constraint into a member of a register class.
11515 std::pair<unsigned, const TargetRegisterClass *> Res;
11516 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
11517
11518 // Not found as a standard register?
11519 if (!Res.second) {
11520 unsigned Size = Constraint.size();
11521 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
11522 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
11523 int RegNo;
11524 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
11525 if (!Failed && RegNo >= 0 && RegNo <= 31) {
11526 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
11527 // By default we'll emit v0-v31 for this unless there's a modifier where
11528 // we'll emit the correct register as well.
11529 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
11530 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
11531 Res.second = &AArch64::FPR64RegClass;
11532 } else {
11533 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
11534 Res.second = &AArch64::FPR128RegClass;
11535 }
11536 }
11537 }
11538 }
11539
11540 if (Res.second && !Subtarget->hasFPARMv8() &&
11541 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
11542 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
11543 return std::make_pair(0U, nullptr);
11544
11545 return Res;
11546}
11547
11548EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
11549 llvm::Type *Ty,
11550 bool AllowUnknown) const {
11551 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
11552 return EVT(MVT::i64x8);
11553
11554 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
11555}
11556
11557/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
11558/// vector. If it is invalid, don't add anything to Ops.
11559void AArch64TargetLowering::LowerAsmOperandForConstraint(
11560 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
11561 SelectionDAG &DAG) const {
11562 SDValue Result;
11563
11564 // Currently only support length 1 constraints.
11565 if (Constraint.size() != 1)
11566 return;
11567
11568 char ConstraintLetter = Constraint[0];
11569 switch (ConstraintLetter) {
11570 default:
11571 break;
11572
11573 // This set of constraints deal with valid constants for various instructions.
11574 // Validate and return a target constant for them if we can.
11575 case 'z': {
11576 // 'z' maps to xzr or wzr so it needs an input of 0.
11577 if (!isNullConstant(Op))
11578 return;
11579
11580 if (Op.getValueType() == MVT::i64)
11581 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
11582 else
11583 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
11584 break;
11585 }
11586 case 'S':
11587 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
11588 // supported for PIC while "s" isn't, making "s" less useful. We implement
11589 // "S" but not "s".
11590 TargetLowering::LowerAsmOperandForConstraint(Op, "s", Ops, DAG);
11591 break;
11592
11593 case 'I':
11594 case 'J':
11595 case 'K':
11596 case 'L':
11597 case 'M':
11598 case 'N':
11599 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
11600 if (!C)
11601 return;
11602
11603 // Grab the value and do some validation.
11604 uint64_t CVal = C->getZExtValue();
11605 switch (ConstraintLetter) {
11606 // The I constraint applies only to simple ADD or SUB immediate operands:
11607 // i.e. 0 to 4095 with optional shift by 12
11608 // The J constraint applies only to ADD or SUB immediates that would be
11609 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
11610 // instruction [or vice versa], in other words -1 to -4095 with optional
11611 // left shift by 12.
11612 case 'I':
11613 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
11614 break;
11615 return;
11616 case 'J': {
11617 uint64_t NVal = -C->getSExtValue();
11618 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
11619 CVal = C->getSExtValue();
11620 break;
11621 }
11622 return;
11623 }
11624 // The K and L constraints apply *only* to logical immediates, including
11625 // what used to be the MOVI alias for ORR (though the MOVI alias has now
11626 // been removed and MOV should be used). So these constraints have to
11627 // distinguish between bit patterns that are valid 32-bit or 64-bit
11628 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
11629 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
11630 // versa.
11631 case 'K':
11632 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11633 break;
11634 return;
11635 case 'L':
11636 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11637 break;
11638 return;
11639 // The M and N constraints are a superset of K and L respectively, for use
11640 // with the MOV (immediate) alias. As well as the logical immediates they
11641 // also match 32 or 64-bit immediates that can be loaded either using a
11642 // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
11643 // (M) or 64-bit 0x1234000000000000 (N) etc.
11644 // As a note some of this code is liberally stolen from the asm parser.
11645 case 'M': {
11646 if (!isUInt<32>(CVal))
11647 return;
11648 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11649 break;
11650 if ((CVal & 0xFFFF) == CVal)
11651 break;
11652 if ((CVal & 0xFFFF0000ULL) == CVal)
11653 break;
11654 uint64_t NCVal = ~(uint32_t)CVal;
11655 if ((NCVal & 0xFFFFULL) == NCVal)
11656 break;
11657 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11658 break;
11659 return;
11660 }
11661 case 'N': {
11662 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11663 break;
11664 if ((CVal & 0xFFFFULL) == CVal)
11665 break;
11666 if ((CVal & 0xFFFF0000ULL) == CVal)
11667 break;
11668 if ((CVal & 0xFFFF00000000ULL) == CVal)
11669 break;
11670 if ((CVal & 0xFFFF000000000000ULL) == CVal)
11671 break;
11672 uint64_t NCVal = ~CVal;
11673 if ((NCVal & 0xFFFFULL) == NCVal)
11674 break;
11675 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11676 break;
11677 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
11678 break;
11679 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
11680 break;
11681 return;
11682 }
11683 default:
11684 return;
11685 }
11686
11687 // All assembler immediates are 64-bit integers.
11688 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
11689 break;
11690 }
11691
11692 if (Result.getNode()) {
11693 Ops.push_back(Result);
11694 return;
11695 }
11696
11697 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
11698}
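// A standalone model of the 'I' validation above (helper name is ours): an
// ADD/SUB immediate is a 12-bit value, optionally shifted left by 12, which
// is what the isUInt<12> / isShiftedUInt<12, 12> pair checks.
#include <cassert>
#include <cstdint>

static bool isAddSubImm(uint64_t V) {
  return V < (1ULL << 12) || ((V & 0xFFF) == 0 && (V >> 12) < (1ULL << 12));
}

int main() {
  assert(isAddSubImm(4095));       // plain 12-bit immediate
  assert(isAddSubImm(0xABC000));   // 12-bit immediate shifted by 12
  assert(!isAddSubImm(0x1001000)); // 0x1001 does not fit in 12 bits
  return 0;
}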
11699
11700//===----------------------------------------------------------------------===//
11701// AArch64 Advanced SIMD Support
11702//===----------------------------------------------------------------------===//
11703
11704/// WidenVector - Given a value in the V64 register class, produce the
11705/// equivalent value in the V128 register class.
11707 EVT VT = V64Reg.getValueType();
11708 unsigned NarrowSize = VT.getVectorNumElements();
11709 MVT EltTy = VT.getVectorElementType().getSimpleVT();
11710 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
11711 SDLoc DL(V64Reg);
11712
11713 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
11714 V64Reg, DAG.getConstant(0, DL, MVT::i64));
11715}
11716
11717/// getExtFactor - Determine the adjustment factor for the position when
11718/// generating an "extract from vector registers" instruction.
11719static unsigned getExtFactor(SDValue &V) {
11720 EVT EltType = V.getValueType().getVectorElementType();
11721 return EltType.getSizeInBits() / 8;
11722}
11723
11724// Check if a vector is built from one vector via extracted elements of
11725// another together with an AND mask, ensuring that all elements fit
11726// within range. This can be reconstructed using AND and NEON's TBL1.
11727static SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
11728 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11729 SDLoc dl(Op);
11730 EVT VT = Op.getValueType();
11731 assert(!VT.isScalableVector() &&
11732 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11733
11734 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
11735 // directly to TBL1.
11736 if (VT != MVT::v16i8 && VT != MVT::v8i8)
11737 return SDValue();
11738
11739 unsigned NumElts = VT.getVectorNumElements();
11740 assert((NumElts == 8 || NumElts == 16) &&
11741 "Need to have exactly 8 or 16 elements in vector.");
11742
11743 SDValue SourceVec;
11744 SDValue MaskSourceVec;
11745 SmallVector<SDValue, 16> AndMaskConstants;
11746
11747 for (unsigned i = 0; i < NumElts; ++i) {
11748 SDValue V = Op.getOperand(i);
11749 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11750 return SDValue();
11751
11752 SDValue OperandSourceVec = V.getOperand(0);
11753 if (!SourceVec)
11754 SourceVec = OperandSourceVec;
11755 else if (SourceVec != OperandSourceVec)
11756 return SDValue();
11757
11758 // This only looks at shuffles with elements that are
11759 // a) truncated by a constant AND mask extracted from a mask vector, or
11760 // b) extracted directly from a mask vector.
11761 SDValue MaskSource = V.getOperand(1);
11762 if (MaskSource.getOpcode() == ISD::AND) {
11763 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
11764 return SDValue();
11765
11766 AndMaskConstants.push_back(MaskSource.getOperand(1));
11767 MaskSource = MaskSource->getOperand(0);
11768 } else if (!AndMaskConstants.empty()) {
11769 // Either all or no operands should have an AND mask.
11770 return SDValue();
11771 }
11772
11773 // An ANY_EXTEND may be inserted between the AND and the source vector
11774 // extraction. We don't care about that, so we can just skip it.
11775 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
11776 MaskSource = MaskSource.getOperand(0);
11777
11778 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11779 return SDValue();
11780
11781 SDValue MaskIdx = MaskSource.getOperand(1);
11782 if (!isa<ConstantSDNode>(MaskIdx) ||
11783 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
11784 return SDValue();
11785
11786 // We only apply this if all elements come from the same vector with the
11787 // same vector type.
11788 if (!MaskSourceVec) {
11789 MaskSourceVec = MaskSource->getOperand(0);
11790 if (MaskSourceVec.getValueType() != VT)
11791 return SDValue();
11792 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
11793 return SDValue();
11794 }
11795 }
11796
11797 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
11798 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
11799 // insert, we know that the index in the mask must be smaller than the number
11800 // of elements in the source, or we would have an out-of-bounds access.
11801 if (NumElts == 8)
11802 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
11803 DAG.getUNDEF(VT));
11804
11805 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
11806 if (!AndMaskConstants.empty())
11807 MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
11808 DAG.getBuildVector(VT, dl, AndMaskConstants));
11809
11810 return DAG.getNode(
11811 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11812 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
11813 MaskSourceVec);
11814}
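// A scalar model of the NEON TBL1 semantics the lowering above relies on
// (function and values are illustrative): each index byte selects a byte of
// the table and any out-of-range index yields 0, so the optional AND mask
// only has to keep indices inside the source vector.
#include <array>
#include <cassert>
#include <cstdint>

static std::array<uint8_t, 16> tbl1Model(const std::array<uint8_t, 16> &Table,
                                         const std::array<uint8_t, 16> &Index) {
  std::array<uint8_t, 16> Out{};
  for (size_t I = 0; I < Out.size(); ++I)
    Out[I] = Index[I] < Table.size() ? Table[Index[I]] : 0;
  return Out;
}

int main() {
  std::array<uint8_t, 16> Table{};
  for (uint8_t I = 0; I < 16; ++I)
    Table[I] = uint8_t(100 + I);
  std::array<uint8_t, 16> Index{}; // index 0 everywhere...
  Index[0] = 3;
  Index[1] = 200; // ...except one in-range and one out-of-range lane
  auto Out = tbl1Model(Table, Index);
  assert(Out[0] == 103 && Out[1] == 0 && Out[2] == 100);
  return 0;
}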
11815
11816// Gather data to see if the operation can be modelled as a
11817// shuffle in combination with VEXTs.
11818SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
11819 SelectionDAG &DAG) const {
11820 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11821 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
11822 SDLoc dl(Op);
11823 EVT VT = Op.getValueType();
11824 assert(!VT.isScalableVector() &&
11825 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11826 unsigned NumElts = VT.getVectorNumElements();
11827
11828 struct ShuffleSourceInfo {
11829 SDValue Vec;
11830 unsigned MinElt;
11831 unsigned MaxElt;
11832
11833 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
11834 // be compatible with the shuffle we intend to construct. As a result
11835 // ShuffleVec will be some sliding window into the original Vec.
11836 SDValue ShuffleVec;
11837
11838 // Code should guarantee that element i in Vec starts at element "WindowBase
11839 // + i * WindowScale in ShuffleVec".
11840 int WindowBase;
11841 int WindowScale;
11842
11843 ShuffleSourceInfo(SDValue Vec)
11844 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
11845 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
11846
11847 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
11848 };
11849
11850 // First gather all vectors used as an immediate source for this BUILD_VECTOR
11851 // node.
11852 SmallVector<ShuffleSourceInfo, 2> Sources;
11853 for (unsigned i = 0; i < NumElts; ++i) {
11854 SDValue V = Op.getOperand(i);
11855 if (V.isUndef())
11856 continue;
11857 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11858 !isa<ConstantSDNode>(V.getOperand(1)) ||
11859 V.getOperand(0).getValueType().isScalableVector()) {
11860 LLVM_DEBUG(
11861 dbgs() << "Reshuffle failed: "
11862 "a shuffle can only come from building a vector from "
11863 "various elements of other fixed-width vectors, provided "
11864 "their indices are constant\n");
11865 return SDValue();
11866 }
11867
11868 // Add this element source to the list if it's not already there.
11869 SDValue SourceVec = V.getOperand(0);
11870 auto Source = find(Sources, SourceVec);
11871 if (Source == Sources.end())
11872 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
11873
11874 // Update the minimum and maximum lane number seen.
11875 unsigned EltNo = V.getConstantOperandVal(1);
11876 Source->MinElt = std::min(Source->MinElt, EltNo);
11877 Source->MaxElt = std::max(Source->MaxElt, EltNo);
11878 }
11879
11880 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
11881 // better than moving to/from gpr registers for larger vectors.
11882 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
11883 // Construct a mask for the tbl. We may need to adjust the index for types
11884 // larger than i8.
11885 SmallVector<unsigned, 16> Mask;
11886 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
11887 for (unsigned I = 0; I < NumElts; ++I) {
11888 SDValue V = Op.getOperand(I);
11889 if (V.isUndef()) {
11890 for (unsigned OF = 0; OF < OutputFactor; OF++)
11891 Mask.push_back(-1);
11892 continue;
11893 }
11894 // Set the Mask lanes adjusted for the size of the input and output
11895 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
11896 // output element, adjusted in their positions per input and output types.
11897 unsigned Lane = V.getConstantOperandVal(1);
11898 for (unsigned S = 0; S < Sources.size(); S++) {
11899 if (V.getOperand(0) == Sources[S].Vec) {
11900 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
11901 unsigned InputBase = 16 * S + Lane * InputSize / 8;
11902 for (unsigned OF = 0; OF < OutputFactor; OF++)
11903 Mask.push_back(InputBase + OF);
11904 break;
11905 }
11906 }
11907 }
11908
11909 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
11910 // v16i8, and the TBLMask
11911 SmallVector<SDValue, 16> TBLOperands;
11912 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
11913 ? Intrinsic::aarch64_neon_tbl3
11914 : Intrinsic::aarch64_neon_tbl4,
11915 dl, MVT::i32));
11916 for (unsigned i = 0; i < Sources.size(); i++) {
11917 SDValue Src = Sources[i].Vec;
11918 EVT SrcVT = Src.getValueType();
11919 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
11920 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
11921 "Expected a legally typed vector");
11922 if (SrcVT.is64BitVector())
11923 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
11924 DAG.getUNDEF(MVT::v8i8));
11925 TBLOperands.push_back(Src);
11926 }
11927
11928 SmallVector<SDValue, 16> TBLMask;
11929 for (unsigned i = 0; i < Mask.size(); i++)
11930 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
11931 assert((Mask.size() == 8 || Mask.size() == 16) &&
11932 "Expected a v8i8 or v16i8 Mask");
11933 TBLOperands.push_back(
11934 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
11935
11936 SDValue Shuffle =
11937 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
11938 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
11939 return DAG.getBitcast(VT, Shuffle);
11940 }
11941
11942 if (Sources.size() > 2) {
11943 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
11944 << "sensible when at most two source vectors are "
11945 << "involved\n");
11946 return SDValue();
11947 }
11948
11949 // Find out the smallest element size among result and two sources, and use
11950 // it as element size to build the shuffle_vector.
11951 EVT SmallestEltTy = VT.getVectorElementType();
11952 for (auto &Source : Sources) {
11953 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
11954 if (SrcEltTy.bitsLT(SmallestEltTy)) {
11955 SmallestEltTy = SrcEltTy;
11956 }
11957 }
11958 unsigned ResMultiplier =
11959 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11960 uint64_t VTSize = VT.getFixedSizeInBits();
11961 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
11962 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
11963
11964 // If the source vector is too wide or too narrow, we may nevertheless be able
11965 // to construct a compatible shuffle either by concatenating it with UNDEF or
11966 // extracting a suitable range of elements.
11967 for (auto &Src : Sources) {
11968 EVT SrcVT = Src.ShuffleVec.getValueType();
11969
11970 TypeSize SrcVTSize = SrcVT.getSizeInBits();
11971 if (SrcVTSize == TypeSize::getFixed(VTSize))
11972 continue;
11973
11974 // This stage of the search produces a source with the same element type as
11975 // the original, but with a total width matching the BUILD_VECTOR output.
11976 EVT EltVT = SrcVT.getVectorElementType();
11977 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
11978 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
11979
11980 if (SrcVTSize.getFixedValue() < VTSize) {
11981 assert(2 * SrcVTSize == VTSize);
11982 // We can pad out the smaller vector for free, so if it's part of a
11983 // shuffle...
11984 Src.ShuffleVec =
11985 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
11986 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
11987 continue;
11988 }
11989
11990 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
11991 LLVM_DEBUG(
11992 dbgs() << "Reshuffle failed: result vector too small to extract\n");
11993 return SDValue();
11994 }
11995
11996 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
11997 LLVM_DEBUG(
11998 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
11999 return SDValue();
12000 }
12001
12002 if (Src.MinElt >= NumSrcElts) {
12003 // The extraction can just take the second half
12004 Src.ShuffleVec =
12005 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12006 DAG.getConstant(NumSrcElts, dl, MVT::i64));
12007 Src.WindowBase = -NumSrcElts;
12008 } else if (Src.MaxElt < NumSrcElts) {
12009 // The extraction can just take the first half
12010 Src.ShuffleVec =
12011 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12012 DAG.getConstant(0, dl, MVT::i64));
12013 } else {
12014 // An actual VEXT is needed
12015 SDValue VEXTSrc1 =
12016 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12017 DAG.getConstant(0, dl, MVT::i64));
12018 SDValue VEXTSrc2 =
12019 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12020 DAG.getConstant(NumSrcElts, dl, MVT::i64));
12021 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
12022
12023 if (!SrcVT.is64BitVector()) {
12024 LLVM_DEBUG(
12025 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
12026 "for SVE vectors.");
12027 return SDValue();
12028 }
12029
12030 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
12031 VEXTSrc2,
12032 DAG.getConstant(Imm, dl, MVT::i32));
12033 Src.WindowBase = -Src.MinElt;
12034 }
12035 }
12036
12037 // Another possible incompatibility occurs from the vector element types. We
12038 // can fix this by bitcasting the source vectors to the same type we intend
12039 // for the shuffle.
12040 for (auto &Src : Sources) {
12041 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
12042 if (SrcEltTy == SmallestEltTy)
12043 continue;
12044 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
12045 if (DAG.getDataLayout().isBigEndian()) {
12046 Src.ShuffleVec =
12047 DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
12048 } else {
12049 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
12050 }
12051 Src.WindowScale =
12052 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
12053 Src.WindowBase *= Src.WindowScale;
12054 }
12055
12056 // Final check before we try to actually produce a shuffle.
12057 LLVM_DEBUG(for (auto Src
12058 : Sources)
12059 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
12060
12061 // The stars all align, our next step is to produce the mask for the shuffle.
12062 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
12063 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
12064 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
12065 SDValue Entry = Op.getOperand(i);
12066 if (Entry.isUndef())
12067 continue;
12068
12069 auto Src = find(Sources, Entry.getOperand(0));
12070 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
12071
12072 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
12073 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
12074 // segment.
12075 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
12076 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
12077 VT.getScalarSizeInBits());
12078 int LanesDefined = BitsDefined / BitsPerShuffleLane;
12079
12080 // This source is expected to fill ResMultiplier lanes of the final shuffle,
12081 // starting at the appropriate offset.
12082 int *LaneMask = &Mask[i * ResMultiplier];
12083
12084 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
12085 ExtractBase += NumElts * (Src - Sources.begin());
12086 for (int j = 0; j < LanesDefined; ++j)
12087 LaneMask[j] = ExtractBase + j;
12088 }
12089
12090 // Final check before we try to produce nonsense...
12091 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
12092 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
12093 return SDValue();
12094 }
12095
12096 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
12097 for (unsigned i = 0; i < Sources.size(); ++i)
12098 ShuffleOps[i] = Sources[i].ShuffleVec;
12099
12100 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
12101 ShuffleOps[1], Mask);
12102 SDValue V;
12103 if (DAG.getDataLayout().isBigEndian()) {
12104 V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
12105 } else {
12106 V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
12107 }
12108
12109 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
12110 dbgs() << "Reshuffle, creating node: "; V.dump(););
12111
12112 return V;
12113}
12114
12115// check if an EXT instruction can handle the shuffle mask when the
12116// vector sources of the shuffle are the same.
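// For example, with VT = v8i8 the mask <3, 4, 5, 6, 7, 0, 1, 2> (possibly with
// some entries undef) is a rotation by three lanes, so Imm is set to 3 and the
// caller can emit a single EXT with both source operands equal.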
12117static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
12118 unsigned NumElts = VT.getVectorNumElements();
12119
12120 // Assume that the first shuffle index is not UNDEF. Fail if it is.
12121 if (M[0] < 0)
12122 return false;
12123
12124 Imm = M[0];
12125
12126 // If this is a VEXT shuffle, the immediate value is the index of the first
12127 // element. The other shuffle indices must be the successive elements after
12128 // the first one.
12129 unsigned ExpectedElt = Imm;
12130 for (unsigned i = 1; i < NumElts; ++i) {
12131 // Increment the expected index. If it wraps around, just follow it
12132 // back to index zero and keep going.
12133 ++ExpectedElt;
12134 if (ExpectedElt == NumElts)
12135 ExpectedElt = 0;
12136
12137 if (M[i] < 0)
12138 continue; // ignore UNDEF indices
12139 if (ExpectedElt != static_cast<unsigned>(M[i]))
12140 return false;
12141 }
12142
12143 return true;
12144}
12145
12146// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
12147// v4i32s. This is really a truncate, which we can construct out of (legal)
12148// concats and truncate nodes.
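// Roughly, the node built below is
//   concat(trunc(concat(trunc A, trunc B)), trunc(concat(trunc C, trunc D)))
// for the four source vectors A..D; as the comment further down notes, these
// trunc+concat pairs are what get selected as uzp1 instructions.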
12149static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
12150 if (V.getValueType() != MVT::v16i8)
12151 return SDValue();
12152 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
12153
12154 for (unsigned X = 0; X < 4; X++) {
12155 // Check the first item in each group is an extract from lane 0 of a v4i32
12156 // or v4i16.
12157 SDValue BaseExt = V.getOperand(X * 4);
12158 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12159 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
12160 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
12161 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
12162 BaseExt.getConstantOperandVal(1) != 0)
12163 return SDValue();
12164 SDValue Base = BaseExt.getOperand(0);
12165 // And check the other items are extracts from the same vector.
12166 for (unsigned Y = 1; Y < 4; Y++) {
12167 SDValue Ext = V.getOperand(X * 4 + Y);
12168 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12169 Ext.getOperand(0) != Base ||
12170 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
12171 Ext.getConstantOperandVal(1) != Y)
12172 return SDValue();
12173 }
12174 }
12175
12176 // Turn the buildvector into a series of truncates and concats, which will
12177 // become uzp1 instructions. Any v4i32s we found get truncated to v4i16, which
12178 // are concatenated together to produce 2 v8i16s. These are both truncated and
12179 // concatenated together.
12180 SDLoc DL(V);
12181 SDValue Trunc[4] = {
12182 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
12183 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
12184 for (SDValue &V : Trunc)
12185 if (V.getValueType() == MVT::v4i32)
12186 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
12187 SDValue Concat0 =
12188 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
12189 SDValue Concat1 =
12190 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
12191 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
12192 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
12193 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
12194}
12195
12196/// Check if a vector shuffle corresponds to a DUP instruction with a larger
12197/// element width than the vector lane type. If that is the case, the function
12198/// returns true and writes the value of the DUP instruction lane operand into
12199/// DupLaneOp.
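/// For example, the v8i16 mask <2, 3, 2, 3, 2, 3, 2, 3> with BlockSize == 32
/// repeats the 32-bit block made of lanes 2 and 3, so this returns true with
/// DupLaneOp == 1 (lane 1 of the vector viewed as v4i32).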
12200static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
12201 unsigned &DupLaneOp) {
12202 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
12203 "Only possible block sizes for wide DUP are: 16, 32, 64");
12204
12205 if (BlockSize <= VT.getScalarSizeInBits())
12206 return false;
12207 if (BlockSize % VT.getScalarSizeInBits() != 0)
12208 return false;
12209 if (VT.getSizeInBits() % BlockSize != 0)
12210 return false;
12211
12212 size_t SingleVecNumElements = VT.getVectorNumElements();
12213 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
12214 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
12215
12216 // We are looking for masks like
12217 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
12218 // might be replaced by 'undefined'. BlockElts will eventually contain
12219 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
12220 // for the above examples)
12221 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
12222 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
12223 for (size_t I = 0; I < NumEltsPerBlock; I++) {
12224 int Elt = M[BlockIndex * NumEltsPerBlock + I];
12225 if (Elt < 0)
12226 continue;
12227 // For now we don't support shuffles that use the second operand
12228 if ((unsigned)Elt >= SingleVecNumElements)
12229 return false;
12230 if (BlockElts[I] < 0)
12231 BlockElts[I] = Elt;
12232 else if (BlockElts[I] != Elt)
12233 return false;
12234 }
12235
12236 // We found a candidate block (possibly with some undefs). It must be a
12237 // sequence of consecutive integers starting with a value divisible by
12238 // NumEltsPerBlock with some values possibly replaced by undef-s.
12239
12240 // Find first non-undef element
12241 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
12242 assert(FirstRealEltIter != BlockElts.end() &&
12243 "Shuffle with all-undefs must have been caught by previous cases, "
12244 "e.g. isSplat()");
12245 if (FirstRealEltIter == BlockElts.end()) {
12246 DupLaneOp = 0;
12247 return true;
12248 }
12249
12250 // Index of FirstRealElt in BlockElts
12251 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
12252
12253 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
12254 return false;
12255 // BlockElts[0] must have the following value if it isn't undef:
12256 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
12257
12258 // Check the first element
12259 if (Elt0 % NumEltsPerBlock != 0)
12260 return false;
12261 // Check that the sequence indeed consists of consecutive integers (modulo
12262 // undefs)
12263 for (size_t I = 0; I < NumEltsPerBlock; I++)
12264 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
12265 return false;
12266
12267 DupLaneOp = Elt0 / NumEltsPerBlock;
12268 return true;
12269}
12270
12271// check if an EXT instruction can handle the shuffle mask when the
12272// vector sources of the shuffle are different.
12273static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
12274 unsigned &Imm) {
12275 // Look for the first non-undef element.
12276 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
12277
12278 // Benefit from APInt to handle overflow when calculating the expected element.
12279 unsigned NumElts = VT.getVectorNumElements();
12280 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
12281 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
12282 // The following shuffle indices must be the successive elements after the
12283 // first real element.
12284 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
12285 return Elt != ExpectedElt++ && Elt != -1;
12286 });
12287 if (FoundWrongElt)
12288 return false;
12289
12290 // The index of an EXT is the first element if it is not UNDEF.
12291 // Watch out for the beginning UNDEFs. The EXT index should be the expected
12292 // value of the first element. E.g.
12293 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
12294 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
12295 // ExpectedElt is the last mask index plus 1.
12296 Imm = ExpectedElt.getZExtValue();
12297
12298 // There are two different cases that require reversing the input vectors.
12299 // For example, for vector <4 x i32> we have the following cases:
12300 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
12301 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
12302 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
12303 // reversing the two input vectors.
12304 if (Imm < NumElts)
12305 ReverseEXT = true;
12306 else
12307 Imm -= NumElts;
12308
12309 return true;
12310}
12311
12312/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
12313/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
12314/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
12315static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
12316 unsigned NumElts = VT.getVectorNumElements();
12317 if (NumElts % 2 != 0)
12318 return false;
12319 WhichResult = (M[0] == 0 ? 0 : 1);
12320 unsigned Idx = WhichResult * NumElts / 2;
12321 for (unsigned i = 0; i != NumElts; i += 2) {
12322 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
12323 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
12324 return false;
12325 Idx += 1;
12326 }
12327
12328 return true;
12329}
12330
12331/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
12332/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
12333/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
12334static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
12335 unsigned Half = VT.getVectorNumElements() / 2;
12336 WhichResult = (M[0] == 0 ? 0 : 1);
12337 for (unsigned j = 0; j != 2; ++j) {
12338 unsigned Idx = WhichResult;
12339 for (unsigned i = 0; i != Half; ++i) {
12340 int MIdx = M[i + j * Half];
12341 if (MIdx >= 0 && (unsigned)MIdx != Idx)
12342 return false;
12343 Idx += 2;
12344 }
12345 }
12346
12347 return true;
12348}
12349
12350/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
12351/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
12352/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
12353static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
12354 unsigned NumElts = VT.getVectorNumElements();
12355 if (NumElts % 2 != 0)
12356 return false;
12357 WhichResult = (M[0] == 0 ? 0 : 1);
12358 for (unsigned i = 0; i < NumElts; i += 2) {
12359 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
12360 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
12361 return false;
12362 }
12363 return true;
12364}
12365
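// Check whether the mask is, apart from a single position, an identity copy of
// one of the two inputs; that odd position ("Anomaly") can then be filled with
// one INS (insert lane). For example, on 4-element inputs the mask <0, 1, 6, 3>
// matches with DstIsLeft == true and Anomaly == 2.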
12366static bool isINSMask(ArrayRef<int> M, int NumInputElements,
12367 bool &DstIsLeft, int &Anomaly) {
12368 if (M.size() != static_cast<size_t>(NumInputElements))
12369 return false;
12370
12371 int NumLHSMatch = 0, NumRHSMatch = 0;
12372 int LastLHSMismatch = -1, LastRHSMismatch = -1;
12373
12374 for (int i = 0; i < NumInputElements; ++i) {
12375 if (M[i] == -1) {
12376 ++NumLHSMatch;
12377 ++NumRHSMatch;
12378 continue;
12379 }
12380
12381 if (M[i] == i)
12382 ++NumLHSMatch;
12383 else
12384 LastLHSMismatch = i;
12385
12386 if (M[i] == i + NumInputElements)
12387 ++NumRHSMatch;
12388 else
12389 LastRHSMismatch = i;
12390 }
12391
12392 if (NumLHSMatch == NumInputElements - 1) {
12393 DstIsLeft = true;
12394 Anomaly = LastLHSMismatch;
12395 return true;
12396 } else if (NumRHSMatch == NumInputElements - 1) {
12397 DstIsLeft = false;
12398 Anomaly = LastRHSMismatch;
12399 return true;
12400 }
12401
12402 return false;
12403}
12404
12405static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
12406 if (VT.getSizeInBits() != 128)
12407 return false;
12408
12409 unsigned NumElts = VT.getVectorNumElements();
12410
12411 for (int I = 0, E = NumElts / 2; I != E; I++) {
12412 if (Mask[I] != I)
12413 return false;
12414 }
12415
12416 int Offset = NumElts / 2;
12417 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
12418 if (Mask[I] != I + SplitLHS * Offset)
12419 return false;
12420 }
12421
12422 return true;
12423}
12424
12425static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
12426 SDLoc DL(Op);
12427 EVT VT = Op.getValueType();
12428 SDValue V0 = Op.getOperand(0);
12429 SDValue V1 = Op.getOperand(1);
12430 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12431
12432 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
12433 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
12434 return SDValue();
12435
12436 bool SplitV0 = V0.getValueSizeInBits() == 128;
12437
12438 if (!isConcatMask(Mask, VT, SplitV0))
12439 return SDValue();
12440
12441 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
12442 if (SplitV0) {
12443 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
12444 DAG.getConstant(0, DL, MVT::i64));
12445 }
12446 if (V1.getValueSizeInBits() == 128) {
12447 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
12448 DAG.getConstant(0, DL, MVT::i64));
12449 }
12450 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
12451}
12452
12453/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
12454/// the specified operations to build the shuffle. ID is the perfect-shuffle
12455/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
12456/// table entry and LHS/RHS are the immediate inputs for this stage of the
12457/// shuffle.
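/// As decoded below, a PFEntry packs an opcode and two operand IDs: bits
/// [29:26] hold the operation (OP_*), bits [25:13] the LHS ID and bits [12:0]
/// the RHS ID, where each ID is itself a base-9 encoding of a 4-lane mask
/// (digit 8 meaning undef).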
12458static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
12459 SDValue V2, unsigned PFEntry, SDValue LHS,
12460 SDValue RHS, SelectionDAG &DAG,
12461 const SDLoc &dl) {
12462 unsigned OpNum = (PFEntry >> 26) & 0x0F;
12463 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
12464 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
12465
12466 enum {
12467 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
12468 OP_VREV,
12469 OP_VDUP0,
12470 OP_VDUP1,
12471 OP_VDUP2,
12472 OP_VDUP3,
12473 OP_VEXT1,
12474 OP_VEXT2,
12475 OP_VEXT3,
12476 OP_VUZPL, // VUZP, left result
12477 OP_VUZPR, // VUZP, right result
12478 OP_VZIPL, // VZIP, left result
12479 OP_VZIPR, // VZIP, right result
12480 OP_VTRNL, // VTRN, left result
12481 OP_VTRNR, // VTRN, right result
12482 OP_MOVLANE // Move lane. RHSID is the lane to move into
12483 };
12484
12485 if (OpNum == OP_COPY) {
12486 if (LHSID == (1 * 9 + 2) * 9 + 3)
12487 return LHS;
12488 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
12489 return RHS;
12490 }
12491
12492 if (OpNum == OP_MOVLANE) {
12493 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
12494 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
12495 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
12496 Elt = 3 - Elt;
12497 while (Elt > 0) {
12498 ID /= 9;
12499 Elt--;
12500 }
12501 return (ID % 9 == 8) ? -1 : ID % 9;
12502 };
12503
12504 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
12505 // get the lane to move from the PFID, which is always from the
12506 // original vectors (V1 or V2).
12507 SDValue OpLHS = GeneratePerfectShuffle(
12508 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
12509 EVT VT = OpLHS.getValueType();
12510 assert(RHSID < 8 && "Expected a lane index for RHSID!");
12511 unsigned ExtLane = 0;
12512 SDValue Input;
12513
12514 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
12515 // convert into a higher type.
12516 if (RHSID & 0x4) {
12517 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
12518 if (MaskElt == -1)
12519 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
12520 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12521 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
12522 Input = MaskElt < 2 ? V1 : V2;
12523 if (VT.getScalarSizeInBits() == 16) {
12524 Input = DAG.getBitcast(MVT::v2f32, Input);
12525 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
12526 } else {
12527 assert(VT.getScalarSizeInBits() == 32 &&
12528 "Expected 16 or 32 bit shuffle elemements");
12529 Input = DAG.getBitcast(MVT::v2f64, Input);
12530 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
12531 }
12532 } else {
12533 int MaskElt = getPFIDLane(ID, RHSID);
12534 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12535 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
12536 Input = MaskElt < 4 ? V1 : V2;
12537 // Be careful about creating illegal types. Use f16 instead of i16.
12538 if (VT == MVT::v4i16) {
12539 Input = DAG.getBitcast(MVT::v4f16, Input);
12540 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
12541 }
12542 }
12543 SDValue Ext = DAG.getNode(
12544 ISD::EXTRACT_VECTOR_ELT, dl, Input.getValueType().getVectorElementType(),
12545 Input, DAG.getVectorIdxConstant(ExtLane, dl));
12546 SDValue Ins =
12547 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
12548 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
12549 return DAG.getBitcast(VT, Ins);
12550 }
12551
12552 SDValue OpLHS, OpRHS;
12553 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
12554 RHS, DAG, dl);
12555 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
12556 RHS, DAG, dl);
12557 EVT VT = OpLHS.getValueType();
12558
12559 switch (OpNum) {
12560 default:
12561 llvm_unreachable("Unknown shuffle opcode!");
12562 case OP_VREV:
12563 // VREV divides the vector in half and swaps within the half.
12564 if (VT.getVectorElementType() == MVT::i32 ||
12565 VT.getVectorElementType() == MVT::f32)
12566 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
12567 // vrev <4 x i16> -> REV32
12568 if (VT.getVectorElementType() == MVT::i16 ||
12569 VT.getVectorElementType() == MVT::f16 ||
12570 VT.getVectorElementType() == MVT::bf16)
12571 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
12572 // vrev <4 x i8> -> REV16
12573 assert(VT.getVectorElementType() == MVT::i8);
12574 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
12575 case OP_VDUP0:
12576 case OP_VDUP1:
12577 case OP_VDUP2:
12578 case OP_VDUP3: {
12579 EVT EltTy = VT.getVectorElementType();
12580 unsigned Opcode;
12581 if (EltTy == MVT::i8)
12582 Opcode = AArch64ISD::DUPLANE8;
12583 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
12584 Opcode = AArch64ISD::DUPLANE16;
12585 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
12586 Opcode = AArch64ISD::DUPLANE32;
12587 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
12588 Opcode = AArch64ISD::DUPLANE64;
12589 else
12590 llvm_unreachable("Invalid vector element type?");
12591
12592 if (VT.getSizeInBits() == 64)
12593 OpLHS = WidenVector(OpLHS, DAG);
12594 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
12595 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
12596 }
12597 case OP_VEXT1:
12598 case OP_VEXT2:
12599 case OP_VEXT3: {
12600 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
12601 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
12602 DAG.getConstant(Imm, dl, MVT::i32));
12603 }
12604 case OP_VUZPL:
12605 return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
12606 case OP_VUZPR:
12607 return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
12608 case OP_VZIPL:
12609 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
12610 case OP_VZIPR:
12611 return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
12612 case OP_VTRNL:
12613 return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
12614 case OP_VTRNR:
12615 return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
12616 }
12617}
12618
12619static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
12620 SelectionDAG &DAG) {
12621 // Check to see if we can use the TBL instruction.
12622 SDValue V1 = Op.getOperand(0);
12623 SDValue V2 = Op.getOperand(1);
12624 SDLoc DL(Op);
12625
12626 EVT EltVT = Op.getValueType().getVectorElementType();
12627 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
12628
12629 bool Swap = false;
12630 if (V1.isUndef() || isZerosVector(V1.getNode())) {
12631 std::swap(V1, V2);
12632 Swap = true;
12633 }
12634
12635 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
12636 // out of range values with 0s. We do need to make sure that any out-of-range
12637 // values are really out-of-range for a v16i8 vector.
12638 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
12639 MVT IndexVT = MVT::v8i8;
12640 unsigned IndexLen = 8;
12641 if (Op.getValueSizeInBits() == 128) {
12642 IndexVT = MVT::v16i8;
12643 IndexLen = 16;
12644 }
12645
12646 SmallVector<SDValue, 8> TBLMask;
12647 for (int Val : ShuffleMask) {
12648 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
12649 unsigned Offset = Byte + Val * BytesPerElt;
12650 if (Swap)
12651 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
12652 if (IsUndefOrZero && Offset >= IndexLen)
12653 Offset = 255;
12654 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
12655 }
12656 }
12657
12658 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
12659 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
12660
12661 SDValue Shuffle;
12662 if (IsUndefOrZero) {
12663 if (IndexLen == 8)
12664 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
12665 Shuffle = DAG.getNode(
12666 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12667 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12668 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12669 } else {
12670 if (IndexLen == 8) {
12671 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
12672 Shuffle = DAG.getNode(
12673 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12674 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12675 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12676 } else {
12677 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
12678 // cannot currently represent the register constraints on the input
12679 // table registers.
12680 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
12681 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
12682 // IndexLen));
12683 Shuffle = DAG.getNode(
12684 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12685 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
12686 V2Cst,
12687 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12688 }
12689 }
12690 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
12691}
12692
12693static unsigned getDUPLANEOp(EVT EltType) {
12694 if (EltType == MVT::i8)
12695 return AArch64ISD::DUPLANE8;
12696 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
12697 return AArch64ISD::DUPLANE16;
12698 if (EltType == MVT::i32 || EltType == MVT::f32)
12699 return AArch64ISD::DUPLANE32;
12700 if (EltType == MVT::i64 || EltType == MVT::f64)
12701 return AArch64ISD::DUPLANE64;
12702
12703 llvm_unreachable("Invalid vector element type?");
12704}
12705
12706static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
12707 unsigned Opcode, SelectionDAG &DAG) {
12708 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
12709 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
12710 // Match: dup (bitcast (extract_subv X, C)), LaneC
12711 if (BitCast.getOpcode() != ISD::BITCAST ||
12712 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
12713 return false;
12714
12715 // The extract index must align in the destination type. That may not
12716 // happen if the bitcast is from narrow to wide type.
12717 SDValue Extract = BitCast.getOperand(0);
12718 unsigned ExtIdx = Extract.getConstantOperandVal(1);
12719 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
12720 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
12721 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
12722 if (ExtIdxInBits % CastedEltBitWidth != 0)
12723 return false;
12724
12725 // Can't handle cases where vector size is not 128-bit
12726 if (!Extract.getOperand(0).getValueType().is128BitVector())
12727 return false;
12728
12729 // Update the lane value by offsetting with the scaled extract index.
12730 LaneC += ExtIdxInBits / CastedEltBitWidth;
12731
12732 // Determine the casted vector type of the wide vector input.
12733 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
12734 // Examples:
12735 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
12736 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
12737 unsigned SrcVecNumElts =
12738 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
12739 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
12740 SrcVecNumElts);
12741 return true;
12742 };
12743 MVT CastVT;
12744 if (getScaledOffsetDup(V, Lane, CastVT)) {
12745 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
12746 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12747 V.getOperand(0).getValueType().is128BitVector()) {
12748 // The lane is incremented by the index of the extract.
12749 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
12750 Lane += V.getConstantOperandVal(1);
12751 V = V.getOperand(0);
12752 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
12753 // The lane is decremented if we are splatting from the 2nd operand.
12754 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
12755 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
12756 Lane -= Idx * VT.getVectorNumElements() / 2;
12757 V = WidenVector(V.getOperand(Idx), DAG);
12758 } else if (VT.getSizeInBits() == 64) {
12759 // Widen the operand to 128-bit register with undef.
12760 V = WidenVector(V, DAG);
12761 }
12762 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
12763}
12764
12765// Return true if we can get a new shuffle mask by checking the parameter mask
12766// array to test whether every two adjacent mask values are consecutive and
12767// start from an even number.
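// For example, the v4 mask <0, 1, 6, 7> pairs up into the v2 mask <0, 3>,
// while <1, 2, 5, 6> is rejected because its pairs do not start on even
// elements.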
12768static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
12769 SmallVectorImpl<int> &NewMask) {
12770 unsigned NumElts = VT.getVectorNumElements();
12771 if (NumElts % 2 != 0)
12772 return false;
12773
12774 NewMask.clear();
12775 for (unsigned i = 0; i < NumElts; i += 2) {
12776 int M0 = M[i];
12777 int M1 = M[i + 1];
12778
12779 // If both elements are undef, new mask is undef too.
12780 if (M0 == -1 && M1 == -1) {
12781 NewMask.push_back(-1);
12782 continue;
12783 }
12784
12785 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
12786 NewMask.push_back(M1 / 2);
12787 continue;
12788 }
12789
12790 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
12791 NewMask.push_back(M0 / 2);
12792 continue;
12793 }
12794
12795 NewMask.clear();
12796 return false;
12797 }
12798
12799 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
12800 return true;
12801}
12802
12803// Try to widen element type to get a new mask value for a better permutation
12804// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
12805// UZP1/2, TRN1/2, REV, INS, etc.
12806// For example:
12807// shufflevector <4 x i32> %a, <4 x i32> %b,
12808// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
12809// is equivalent to:
12810// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
12811// Finally, we can get:
12812// mov v0.d[0], v1.d[1]
12813static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
12814 SDLoc DL(Op);
12815 EVT VT = Op.getValueType();
12816 EVT ScalarVT = VT.getVectorElementType();
12817 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
12818 SDValue V0 = Op.getOperand(0);
12819 SDValue V1 = Op.getOperand(1);
12820 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12821
12822 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
12823 // We need to make sure the wider element type is legal. Thus, ElementSize
12824 // should not be larger than 32 bits, and the i1 type should also be excluded.
12825 if (ElementSize > 32 || ElementSize == 1)
12826 return SDValue();
12827
12828 SmallVector<int, 8> NewMask;
12829 if (isWideTypeMask(Mask, VT, NewMask)) {
12830 MVT NewEltVT = VT.isFloatingPoint()
12831 ? MVT::getFloatingPointVT(ElementSize * 2)
12832 : MVT::getIntegerVT(ElementSize * 2);
12833 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
12834 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
12835 V0 = DAG.getBitcast(NewVT, V0);
12836 V1 = DAG.getBitcast(NewVT, V1);
12837 return DAG.getBitcast(VT,
12838 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
12839 }
12840 }
12841
12842 return SDValue();
12843}
12844
12845// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
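// A tbl4 indexes into four 16-byte table registers (64 bytes in total). The two
// table operands of Tbl2 become registers three and four of the tbl4, so byte
// indices taken from Tbl2's mask are rebased by +32 below.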
12846static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
12847 ArrayRef<int> ShuffleMask,
12848 SelectionDAG &DAG) {
12849 SDValue Tbl1 = Op->getOperand(0);
12850 SDValue Tbl2 = Op->getOperand(1);
12851 SDLoc dl(Op);
12852 SDValue Tbl2ID =
12853 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
12854
12855 EVT VT = Op.getValueType();
12856 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12857 Tbl1->getOperand(0) != Tbl2ID ||
12858 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12859 Tbl2->getOperand(0) != Tbl2ID)
12860 return SDValue();
12861
12862 if (Tbl1->getValueType(0) != MVT::v16i8 ||
12863 Tbl2->getValueType(0) != MVT::v16i8)
12864 return SDValue();
12865
12866 SDValue Mask1 = Tbl1->getOperand(3);
12867 SDValue Mask2 = Tbl2->getOperand(3);
12868 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
12869 for (unsigned I = 0; I < 16; I++) {
12870 if (ShuffleMask[I] < 16)
12871 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
12872 else {
12873 auto *C =
12874 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
12875 if (!C)
12876 return SDValue();
12877 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
12878 }
12879 }
12880
12881 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
12882 SDValue ID =
12883 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
12884
12885 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
12886 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
12887 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
12888}
12889
12890// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
12891// but we don't have an appropriate instruction,
12892// so custom-lower it as ZIP1-with-zeros.
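// For example, zero_extend_vector_inreg from v8i8 to v4i16 becomes
// zip1(Src, zeroes): each source byte is interleaved with a zero byte, which in
// the little-endian lane layout reads back as the zero-extended i16 elements
// once the result is bitcast to v4i16.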
12893SDValue
12894AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
12895 SelectionDAG &DAG) const {
12896 SDLoc dl(Op);
12897 EVT VT = Op.getValueType();
12898 SDValue SrcOp = Op.getOperand(0);
12899 EVT SrcVT = SrcOp.getValueType();
12900 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
12901 "Unexpected extension factor.");
12902 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
12903 // FIXME: support multi-step zipping?
12904 if (Scale != 2)
12905 return SDValue();
12906 SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
12907 return DAG.getBitcast(VT,
12908 DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
12909}
12910
12911SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
12912 SelectionDAG &DAG) const {
12913 SDLoc dl(Op);
12914 EVT VT = Op.getValueType();
12915
12916 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
12917
12918 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12919 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
12920
12921 // Convert shuffles that are directly supported on NEON to target-specific
12922 // DAG nodes, instead of keeping them as shuffles and matching them again
12923 // during code selection. This is more efficient and avoids the possibility
12924 // of inconsistencies between legalization and selection.
12925 ArrayRef<int> ShuffleMask = SVN->getMask();
12926
12927 SDValue V1 = Op.getOperand(0);
12928 SDValue V2 = Op.getOperand(1);
12929
12930 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
12931 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
12932 "Unexpected VECTOR_SHUFFLE mask size!");
12933
12934 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
12935 return Res;
12936
12937 if (SVN->isSplat()) {
12938 int Lane = SVN->getSplatIndex();
12939 // If this is undef splat, generate it via "just" vdup, if possible.
12940 if (Lane == -1)
12941 Lane = 0;
12942
12943 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
12944 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
12945 V1.getOperand(0));
12946 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
12947 // constant. If so, we can just reference the lane's definition directly.
12948 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
12949 !isa<ConstantSDNode>(V1.getOperand(Lane)))
12950 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
12951
12952 // Otherwise, duplicate from the lane of the input vector.
12953 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
12954 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
12955 }
12956
12957 // Check if the mask matches a DUP for a wider element
12958 for (unsigned LaneSize : {64U, 32U, 16U}) {
12959 unsigned Lane = 0;
12960 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
12961 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
12962 : LaneSize == 32 ? AArch64ISD::DUPLANE32
12963 : AArch64ISD::DUPLANE16;
12964 // Cast V1 to an integer vector with required lane size
12965 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
12966 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
12967 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
12968 V1 = DAG.getBitcast(NewVecTy, V1);
12969 // Construct the DUP instruction
12970 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
12971 // Cast back to the original type
12972 return DAG.getBitcast(VT, V1);
12973 }
12974 }
12975
12976 unsigned NumElts = VT.getVectorNumElements();
12977 unsigned EltSize = VT.getScalarSizeInBits();
12978 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
12979 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
12980 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
12981 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
12982 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
12983 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
12984
12985 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
12986 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
12987 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
12988 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
12989 DAG.getConstant(8, dl, MVT::i32));
12990 }
12991
12992 bool ReverseEXT = false;
12993 unsigned Imm;
12994 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
12995 if (ReverseEXT)
12996 std::swap(V1, V2);
12997 Imm *= getExtFactor(V1);
12998 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
12999 DAG.getConstant(Imm, dl, MVT::i32));
13000 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
13001 Imm *= getExtFactor(V1);
13002 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
13003 DAG.getConstant(Imm, dl, MVT::i32));
13004 }
13005
13006 unsigned WhichResult;
13007 if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
13008 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
13009 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13010 }
13011 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
13012 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
13013 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13014 }
13015 if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
13016 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
13017 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13018 }
13019
13020 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13021 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
13022 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13023 }
13024 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13025 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
13026 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13027 }
13028 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13029 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
13030 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13031 }
13032
13033 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
13034 return Concat;
13035
13036 bool DstIsLeft;
13037 int Anomaly;
13038 int NumInputElements = V1.getValueType().getVectorNumElements();
13039 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
13040 SDValue DstVec = DstIsLeft ? V1 : V2;
13041 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
13042
13043 SDValue SrcVec = V1;
13044 int SrcLane = ShuffleMask[Anomaly];
13045 if (SrcLane >= NumInputElements) {
13046 SrcVec = V2;
13047 SrcLane -= NumElts;
13048 }
13049 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
13050
13051 EVT ScalarVT = VT.getVectorElementType();
13052
13053 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
13054 ScalarVT = MVT::i32;
13055
13056 return DAG.getNode(
13057 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
13058 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
13059 DstLaneV);
13060 }
13061
13062 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
13063 return NewSD;
13064
13065 // If the shuffle is not directly supported and it has 4 elements, use
13066 // the PerfectShuffle-generated table to synthesize it from other shuffles.
13067 if (NumElts == 4) {
13068 unsigned PFIndexes[4];
13069 for (unsigned i = 0; i != 4; ++i) {
13070 if (ShuffleMask[i] < 0)
13071 PFIndexes[i] = 8;
13072 else
13073 PFIndexes[i] = ShuffleMask[i];
13074 }
13075
13076 // Compute the index in the perfect shuffle table.
13077 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
13078 PFIndexes[2] * 9 + PFIndexes[3];
13079 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
13080 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
13081 dl);
13082 }
13083
13084 return GenerateTBL(Op, ShuffleMask, DAG);
13085}
13086
13087SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
13088 SelectionDAG &DAG) const {
13089 EVT VT = Op.getValueType();
13090
13091 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13092 return LowerToScalableOp(Op, DAG);
13093
13094 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
13095 "Unexpected vector type!");
13096
13097 // We can handle the constant cases during isel.
13098 if (isa<ConstantSDNode>(Op.getOperand(0)))
13099 return Op;
13100
13101 // There isn't a natural way to handle the general i1 case, so we use some
13102 // trickery with whilelo.
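// The i1 splat value is sign-extended to i64, so it becomes either 0 or all
// ones. whilelo(0, 0) then yields an all-false predicate, while
// whilelo(0, 0xffffffffffffffff) yields an all-true one, which is exactly
// splat(i1 x).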
13103 SDLoc DL(Op);
13104 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
13105 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
13106 DAG.getValueType(MVT::i1));
13107 SDValue ID =
13108 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
13109 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
13110 if (VT == MVT::nxv1i1)
13111 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
13112 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
13113 Zero, SplatVal),
13114 Zero);
13115 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
13116}
13117
13118SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
13119 SelectionDAG &DAG) const {
13120 SDLoc DL(Op);
13121
13122 EVT VT = Op.getValueType();
13123 if (!isTypeLegal(VT) || !VT.isScalableVector())
13124 return SDValue();
13125
13126 // Current lowering only supports the SVE-ACLE types.
13127 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
13128 return SDValue();
13129
13130 // The DUPQ operation is independent of the element type, so normalise to i64s.
13131 SDValue Idx128 = Op.getOperand(2);
13132
13133 // DUPQ can be used when idx is in range.
13134 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
13135 if (CIdx && (CIdx->getZExtValue() <= 3)) {
13136 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
13137 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
13138 }
13139
13140 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
13141
13142 // The ACLE says this must produce the same result as:
13143 // svtbl(data, svadd_x(svptrue_b64(),
13144 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
13145 // index * 2))
13146 SDValue One = DAG.getConstant(1, DL, MVT::i64);
13147 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
13148
13149 // create the vector 0,1,0,1,...
13150 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
13151 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
13152
13153 // create the vector idx64,idx64+1,idx64,idx64+1,...
13154 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
13155 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
13156 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
13157
13158 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
13159 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
13160 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
13161}
13162
13163
13164static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
13165 APInt &UndefBits) {
13166 EVT VT = BVN->getValueType(0);
13167 APInt SplatBits, SplatUndef;
13168 unsigned SplatBitSize;
13169 bool HasAnyUndefs;
13170 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
13171 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
13172
13173 for (unsigned i = 0; i < NumSplats; ++i) {
13174 CnstBits <<= SplatBitSize;
13175 UndefBits <<= SplatBitSize;
13176 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
13177 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
13178 }
13179
13180 return true;
13181 }
13182
13183 return false;
13184}
13185
13186// Try 64-bit splatted SIMD immediate.
13187static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13188 const APInt &Bits) {
13189 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
13190 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
13191 EVT VT = Op.getValueType();
13192 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
13193
13194 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
13195 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
13196
13197 SDLoc dl(Op);
13198 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
13199 DAG.getConstant(Value, dl, MVT::i32));
13200 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13201 }
13202 }
13203
13204 return SDValue();
13205}
13206
13207// Try 32-bit splatted SIMD immediate.
13208static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13209 const APInt &Bits,
13210 const SDValue *LHS = nullptr) {
13211 EVT VT = Op.getValueType();
13212 if (VT.isFixedLengthVector() &&
13213 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
13214 return SDValue();
13215
13216 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
13217 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
13218 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
13219 bool isAdvSIMDModImm = false;
13220 uint64_t Shift;
13221
13222 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
13223 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
13224 Shift = 0;
13225 }
13226 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
13227 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
13228 Shift = 8;
13229 }
13230 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
13231 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
13232 Shift = 16;
13233 }
13234 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
13235 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
13236 Shift = 24;
13237 }
13238
13239 if (isAdvSIMDModImm) {
13240 SDLoc dl(Op);
13241 SDValue Mov;
13242
13243 if (LHS)
13244 Mov = DAG.getNode(NewOp, dl, MovTy,
13245 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
13246 DAG.getConstant(Value, dl, MVT::i32),
13247 DAG.getConstant(Shift, dl, MVT::i32));
13248 else
13249 Mov = DAG.getNode(NewOp, dl, MovTy,
13250 DAG.getConstant(Value, dl, MVT::i32),
13251 DAG.getConstant(Shift, dl, MVT::i32));
13252
13253 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13254 }
13255 }
13256
13257 return SDValue();
13258}
13259
13260// Try 16-bit splatted SIMD immediate.
13261static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13262 const APInt &Bits,
13263 const SDValue *LHS = nullptr) {
13264 EVT VT = Op.getValueType();
13265 if (VT.isFixedLengthVector() &&
13266 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
13267 return SDValue();
13268
13269 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
13270 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
13271 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
13272 bool isAdvSIMDModImm = false;
13273 uint64_t Shift;
13274
13275 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
13276 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
13277 Shift = 0;
13278 }
13279 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
13280 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
13281 Shift = 8;
13282 }
13283
13284 if (isAdvSIMDModImm) {
13285 SDLoc dl(Op);
13286 SDValue Mov;
13287
13288 if (LHS)
13289 Mov = DAG.getNode(NewOp, dl, MovTy,
13290 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
13291 DAG.getConstant(Value, dl, MVT::i32),
13292 DAG.getConstant(Shift, dl, MVT::i32));
13293 else
13294 Mov = DAG.getNode(NewOp, dl, MovTy,
13295 DAG.getConstant(Value, dl, MVT::i32),
13296 DAG.getConstant(Shift, dl, MVT::i32));
13297
13298 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13299 }
13300 }
13301
13302 return SDValue();
13303}
13304
13305// Try 32-bit splatted SIMD immediate with shifted ones.
13306static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
13307 SelectionDAG &DAG, const APInt &Bits) {
13308 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
13309 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
13310 EVT VT = Op.getValueType();
13311 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
13312 bool isAdvSIMDModImm = false;
13313 uint64_t Shift;
13314
13315 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
13316 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
13317 Shift = 264;
13318 }
13319 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
13320 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
13321 Shift = 272;
13322 }
13323
13324 if (isAdvSIMDModImm) {
13325 SDLoc dl(Op);
13326 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
13327 DAG.getConstant(Value, dl, MVT::i32),
13328 DAG.getConstant(Shift, dl, MVT::i32));
13329 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13330 }
13331 }
13332
13333 return SDValue();
13334}
13335
13336// Try 8-bit splatted SIMD immediate.
13337static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13338 const APInt &Bits) {
13339 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
13340 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
13341 EVT VT = Op.getValueType();
13342 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
13343
13344 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
13345 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
13346
13347 SDLoc dl(Op);
13348 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
13349 DAG.getConstant(Value, dl, MVT::i32));
13350 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13351 }
13352 }
13353
13354 return SDValue();
13355}
13356
13357// Try FP splatted SIMD immediate.
13358static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13359 const APInt &Bits) {
13360 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
13361 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
13362 EVT VT = Op.getValueType();
13363 bool isWide = (VT.getSizeInBits() == 128);
13364 MVT MovTy;
13365 bool isAdvSIMDModImm = false;
13366
13367 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
13368 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
13369 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
13370 }
13371 else if (isWide &&
13372 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
13373 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
13374 MovTy = MVT::v2f64;
13375 }
13376
13377 if (isAdvSIMDModImm) {
13378 SDLoc dl(Op);
13379 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
13380 DAG.getConstant(Value, dl, MVT::i32));
13381 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13382 }
13383 }
13384
13385 return SDValue();
13386}
13387
13388// Specialized code to quickly find if PotentialBVec is a BuildVector that
13389// consists of only the same constant int value, returned in reference arg
13390// ConstVal
13391static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
13392 uint64_t &ConstVal) {
13393 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
13394 if (!Bvec)
13395 return false;
13396 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
13397 if (!FirstElt)
13398 return false;
13399 EVT VT = Bvec->getValueType(0);
13400 unsigned NumElts = VT.getVectorNumElements();
13401 for (unsigned i = 1; i < NumElts; ++i)
13402 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
13403 return false;
13404 ConstVal = FirstElt->getZExtValue();
13405 return true;
13406}
13407
13408static bool isAllInactivePredicate(SDValue N) {
13409 // Look through cast.
13410 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
13411 N = N.getOperand(0);
13412
13413 return ISD::isConstantSplatVectorAllZeros(N.getNode());
13414}
13415
13416static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
13417 unsigned NumElts = N.getValueType().getVectorMinNumElements();
13418
13419 // Look through cast.
13420 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
13421 N = N.getOperand(0);
13422 // When reinterpreting from a type with fewer elements the "new" elements
13423 // are not active, so bail if they're likely to be used.
13424 if (N.getValueType().getVectorMinNumElements() < NumElts)
13425 return false;
13426 }
13427
13428 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
13429 return true;
13430
13431 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
13432 // or smaller than the implicit element type represented by N.
13433 // NOTE: A larger element count implies a smaller element type.
13434 if (N.getOpcode() == AArch64ISD::PTRUE &&
13435 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
13436 return N.getValueType().getVectorMinNumElements() >= NumElts;
13437
13438 // If we're compiling for a specific vector-length, we can check if the
13439 // pattern's VL equals that of the scalable vector at runtime.
13440 if (N.getOpcode() == AArch64ISD::PTRUE) {
13441 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
13442 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
13443 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
13444 if (MaxSVESize && MinSVESize == MaxSVESize) {
13445 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
13446 unsigned PatNumElts =
13447 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
13448 return PatNumElts == (NumElts * VScale);
13449 }
13450 }
13451
13452 return false;
13453}
13454
13455// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
13456// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
13457// BUILD_VECTORs with constant element C1, C2 is a constant, and:
13458// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
13459// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
13460// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
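// For example, with 8-bit elements and C2 == 4,
//   (or (and X, splat(0x0f)), (shl Y, 4))
// becomes (VSLI X, Y, 4): the SLI shifts Y left by four and inserts it into X,
// preserving X's low four bits, which is exactly what the OR computed.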
13461static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
13462 EVT VT = N->getValueType(0);
13463
13464 if (!VT.isVector())
13465 return SDValue();
13466
13467 SDLoc DL(N);
13468
13469 SDValue And;
13470 SDValue Shift;
13471
13472 SDValue FirstOp = N->getOperand(0);
13473 unsigned FirstOpc = FirstOp.getOpcode();
13474 SDValue SecondOp = N->getOperand(1);
13475 unsigned SecondOpc = SecondOp.getOpcode();
13476
13477 // Is one of the operands an AND or a BICi? The AND may have been optimised to
13478 // a BICi in order to use an immediate instead of a register.
13479 // Is the other operand a shl or lshr? This will have been turned into:
13480 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
13481 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
13482 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
13483 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
13484 SecondOpc == AArch64ISD::SHL_PRED ||
13485 SecondOpc == AArch64ISD::SRL_PRED)) {
13486 And = FirstOp;
13487 Shift = SecondOp;
13488
13489 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
13490 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
13491 FirstOpc == AArch64ISD::SHL_PRED ||
13492 FirstOpc == AArch64ISD::SRL_PRED)) {
13493 And = SecondOp;
13494 Shift = FirstOp;
13495 } else
13496 return SDValue();
13497
13498 bool IsAnd = And.getOpcode() == ISD::AND;
13499 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
13500 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13501 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
13502 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13503
13504 // Is the shift amount constant and are all lanes active?
13505 uint64_t C2;
13506 if (ShiftHasPredOp) {
13507 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
13508 return SDValue();
13509 APInt C;
13510 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
13511 return SDValue();
13512 C2 = C.getZExtValue();
13513 } else if (ConstantSDNode *C2node =
13514 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
13515 C2 = C2node->getZExtValue();
13516 else
13517 return SDValue();
13518
13519 APInt C1AsAPInt;
13520 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
13521 if (IsAnd) {
13522 // Is the and mask vector all constant?
13523 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
13524 return SDValue();
13525 } else {
13526 // Reconstruct the corresponding AND immediate from the two BICi immediates.
13527 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
13528 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
13529 assert(C1nodeImm && C1nodeShift);
13530 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
13531 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
13532 }
13533
13534 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
13535 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
13536 // how much one can shift elements of a particular size?
13537 if (C2 > ElemSizeInBits)
13538 return SDValue();
13539
13540 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
13541 : APInt::getLowBitsSet(ElemSizeInBits, C2);
13542 if (C1AsAPInt != RequiredC1)
13543 return SDValue();
13544
13545 SDValue X = And.getOperand(0);
13546 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
13547 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
13548 : Shift.getOperand(1);
13549
13550 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
13551 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
13552
13553 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
13554 LLVM_DEBUG(N->dump(&DAG));
13555 LLVM_DEBUG(dbgs() << "into: \n");
13556 LLVM_DEBUG(ResultSLI->dump(&DAG));
13557
13558 ++NumShiftInserts;
13559 return ResultSLI;
13560}
13561
13562SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
13563 SelectionDAG &DAG) const {
13564 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13565 !Subtarget->isNeonAvailable()))
13566 return LowerToScalableOp(Op, DAG);
13567
13568 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
13569 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
13570 return Res;
13571
13572 EVT VT = Op.getValueType();
13573 if (VT.isScalableVector())
13574 return Op;
13575
13576 SDValue LHS = Op.getOperand(0);
13577 BuildVectorSDNode *BVN =
13578 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
13579 if (!BVN) {
13580 // OR commutes, so try swapping the operands.
13581 LHS = Op.getOperand(1);
13582 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
13583 }
13584 if (!BVN)
13585 return Op;
13586
13587 APInt DefBits(VT.getSizeInBits(), 0);
13588 APInt UndefBits(VT.getSizeInBits(), 0);
13589 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13590 SDValue NewOp;
13591
13592 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13593 DefBits, &LHS)) ||
13594 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13595 DefBits, &LHS)))
13596 return NewOp;
13597
13598 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13599 UndefBits, &LHS)) ||
13600 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13601 UndefBits, &LHS)))
13602 return NewOp;
13603 }
13604
13605 // We can always fall back to a non-immediate OR.
13606 return Op;
13607}
13608
13609// Normalize the operands of BUILD_VECTOR. The value of constant operands will
13610// be truncated to fit element width.
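// For example, in a v8i8 build_vector a constant i32 operand holding 257
// (0x101) is replaced by the i32 constant 1: only the low 8 bits of the lane
// survive.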
13611static SDValue NormalizeBuildVector(SDValue Op,
13612 SelectionDAG &DAG) {
13613 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13614 SDLoc dl(Op);
13615 EVT VT = Op.getValueType();
13616 EVT EltTy = VT.getVectorElementType();
13617
13618 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
13619 return Op;
13620
13621 SmallVector<SDValue, 16> Ops;
13622 for (SDValue Lane : Op->ops()) {
13623 // For integer vectors, type legalization would have promoted the
13624 // operands already. Otherwise, if Op is a floating-point splat
13625 // (with operands cast to integers), then the only possibilities
13626 // are constants and UNDEFs.
13627 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
13628 APInt LowBits(EltTy.getSizeInBits(),
13629 CstLane->getZExtValue());
13630 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
13631 } else if (Lane.getNode()->isUndef()) {
13632 Lane = DAG.getUNDEF(MVT::i32);
13633 } else {
13634 assert(Lane.getValueType() == MVT::i32 &&
13635 "Unexpected BUILD_VECTOR operand type");
13636 }
13637 Ops.push_back(Lane);
13638 }
13639 return DAG.getBuildVector(VT, dl, Ops);
13640}
13641
13642static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
13643 const AArch64Subtarget *ST) {
13644 EVT VT = Op.getValueType();
13645 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
13646 "Expected a legal NEON vector");
13647
13648 APInt DefBits(VT.getSizeInBits(), 0);
13649 APInt UndefBits(VT.getSizeInBits(), 0);
13650 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13651 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13652 auto TryMOVIWithBits = [&](APInt DefBits) {
13653 SDValue NewOp;
13654 if ((NewOp =
13655 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
13656 (NewOp =
13657 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13658 (NewOp =
13659 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
13660 (NewOp =
13661 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13662 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
13663 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
13664 return NewOp;
13665
13666 APInt NotDefBits = ~DefBits;
13667 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
13668 NotDefBits)) ||
13669 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
13670 NotDefBits)) ||
13671 (NewOp =
13672 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
13673 return NewOp;
13674 return SDValue();
13675 };
13676 if (SDValue R = TryMOVIWithBits(DefBits))
13677 return R;
13678 if (SDValue R = TryMOVIWithBits(UndefBits))
13679 return R;
13680
13681 // See if a fneg of the constant can be materialized with a MOVI, etc
13682 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
13683 // FNegate each sub-element of the constant
13684 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
13685 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
13686 .zext(VT.getSizeInBits());
13687 APInt NegBits(VT.getSizeInBits(), 0);
13688 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
13689 for (unsigned i = 0; i < NumElts; i++)
13690 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
13691 NegBits = DefBits ^ NegBits;
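 // NegBits is now the original constant with the sign bit of every FVT-sized
 // element flipped, i.e. the bit pattern of an fneg applied per element.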
13692
13693 // Try to create the new constants with MOVI, and if so generate a fneg
13694 // for it.
13695 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
13696 SDLoc DL(Op);
13697 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
13698 return DAG.getNode(
13699 AArch64ISD::NVCAST, DL, VT,
13700 DAG.getNode(ISD::FNEG, DL, VFVT,
13701 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
13702 }
13703 return SDValue();
13704 };
13705 SDValue R;
13706 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
13707 (R = TryWithFNeg(DefBits, MVT::f64)) ||
13708 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
13709 return R;
13710 }
13711
13712 return SDValue();
13713}
13714
13715SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
13716 SelectionDAG &DAG) const {
13717 EVT VT = Op.getValueType();
13718
13719 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
13720 if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
13721 SDLoc DL(Op);
13722 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
13723 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
13724 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
13725 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
13726 return convertFromScalableVector(DAG, Op.getValueType(), Seq);
13727 }
13728
13729 // Revert to common legalisation for all other variants.
13730 return SDValue();
13731 }
13732
13733 // Try to build a simple constant vector.
13734 Op = NormalizeBuildVector(Op, DAG);
13735 // This might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
13736 // abort.
13737 if (Op.getOpcode() != ISD::BUILD_VECTOR)
13738 return SDValue();
13739
13740 // Certain vector constants, used to express things like logical NOT and
13741 // arithmetic NEG, are passed through unmodified. This allows special
13742 // patterns for these operations to match, which will lower these constants
13743 // to whatever is proven necessary.
13744 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13745 if (BVN->isConstant()) {
13746 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
13747 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
13748 APInt Val(BitSize,
13749 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
13750 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
13751 return Op;
13752 }
13753 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
13754 if (Const->isZero() && !Const->isNegative())
13755 return Op;
13756 }
13757
13758 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
13759 return V;
13760
13761 // Scan through the operands to find some interesting properties we can
13762 // exploit:
13763 // 1) If only one value is used, we can use a DUP, or
13764 // 2) if only the low element is not undef, we can just insert that, or
13765 // 3) if only one constant value is used (w/ some non-constant lanes),
13766 // we can splat the constant value into the whole vector then fill
13767 // in the non-constant lanes.
13768 // 4) FIXME: If different constant values are used, but we can intelligently
13769 // select the values we'll be overwriting for the non-constant
13770 // lanes such that we can directly materialize the vector
13771 // some other way (MOVI, e.g.), we can be sneaky.
13772 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
13773 SDLoc dl(Op);
13774 unsigned NumElts = VT.getVectorNumElements();
13775 bool isOnlyLowElement = true;
13776 bool usesOnlyOneValue = true;
13777 bool usesOnlyOneConstantValue = true;
13778 bool isConstant = true;
13779 bool AllLanesExtractElt = true;
13780 unsigned NumConstantLanes = 0;
13781 unsigned NumDifferentLanes = 0;
13782 unsigned NumUndefLanes = 0;
13783 SDValue Value;
13784 SDValue ConstantValue;
13785 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
13786 unsigned ConsecutiveValCount = 0;
13787 SDValue PrevVal;
13788 for (unsigned i = 0; i < NumElts; ++i) {
13789 SDValue V = Op.getOperand(i);
13790 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13791 AllLanesExtractElt = false;
13792 if (V.isUndef()) {
13793 ++NumUndefLanes;
13794 continue;
13795 }
13796 if (i > 0)
13797 isOnlyLowElement = false;
13798 if (!isIntOrFPConstant(V))
13799 isConstant = false;
13800
13801 if (isIntOrFPConstant(V)) {
13802 ++NumConstantLanes;
13803 if (!ConstantValue.getNode())
13804 ConstantValue = V;
13805 else if (ConstantValue != V)
13806 usesOnlyOneConstantValue = false;
13807 }
13808
13809 if (!Value.getNode())
13810 Value = V;
13811 else if (V != Value) {
13812 usesOnlyOneValue = false;
13813 ++NumDifferentLanes;
13814 }
13815
13816 if (PrevVal != V) {
13817 ConsecutiveValCount = 0;
13818 PrevVal = V;
13819 }
13820
13821 // Keep each different value and its last consecutive count. For example,
13822 //
13823 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13824 // t24, t24, t24, t24, t24, t24, t24, t24
13825 // t23 = consecutive count 8
13826 // t24 = consecutive count 8
13827 // ------------------------------------------------------------------
13828 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
13829 // t24, t24, t24, t24, t24, t24, t24, t24
13830 // t23 = consecutive count 5
13831 // t24 = consecutive count 9
13832 DifferentValueMap[V] = ++ConsecutiveValCount;
13833 }
13834
13835 if (!Value.getNode()) {
13836 LLVM_DEBUG(
13837 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
13838 return DAG.getUNDEF(VT);
13839 }
13840
13841 // Convert BUILD_VECTOR where all elements but the lowest are undef into
13842 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
13843 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
13844 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
13845 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
13846 "SCALAR_TO_VECTOR node\n");
13847 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
13848 }
13849
13850 if (AllLanesExtractElt) {
13851 SDNode *Vector = nullptr;
13852 bool Even = false;
13853 bool Odd = false;
13854 // Check whether the extract elements match the Even pattern <0,2,4,...> or
13855 // the Odd pattern <1,3,5,...>.
13856 for (unsigned i = 0; i < NumElts; ++i) {
13857 SDValue V = Op.getOperand(i);
13858 const SDNode *N = V.getNode();
13859 if (!isa<ConstantSDNode>(N->getOperand(1))) {
13860 Even = false;
13861 Odd = false;
13862 break;
13863 }
13864 SDValue N0 = N->getOperand(0);
13865
13866 // All elements are extracted from the same vector.
13867 if (!Vector) {
13868 Vector = N0.getNode();
13869 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
13870 // BUILD_VECTOR.
13871 if (VT.getVectorElementType() !=
13872 N0.getValueType().getVectorElementType())
13873 break;
13874 } else if (Vector != N0.getNode()) {
13875 Odd = false;
13876 Even = false;
13877 break;
13878 }
13879
13880 // Extracted values are either at Even indices <0,2,4,...> or at Odd
13881 // indices <1,3,5,...>.
13882 uint64_t Val = N->getConstantOperandVal(1);
13883 if (Val == 2 * i) {
13884 Even = true;
13885 continue;
13886 }
13887 if (Val - 1 == 2 * i) {
13888 Odd = true;
13889 continue;
13890 }
13891
13892 // Something does not match: abort.
13893 Odd = false;
13894 Even = false;
13895 break;
13896 }
13897 if (Even || Odd) {
13898 SDValue LHS =
13899 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13900 DAG.getConstant(0, dl, MVT::i64));
13901 SDValue RHS =
13902 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13903 DAG.getConstant(NumElts, dl, MVT::i64));
13904
13905 if (Even && !Odd)
13906 return DAG.getNode(AArch64ISD::UZP1, dl, VT, LHS, RHS);
13907 if (Odd && !Even)
13908 return DAG.getNode(AArch64ISD::UZP2, dl, VT, LHS, RHS);
13909 }
13910 }
13911
13912 // Use DUP for non-constant splats. For f32 constant splats, reduce to
13913 // i32 and try again.
13914 if (usesOnlyOneValue) {
13915 if (!isConstant) {
13916 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13917 Value.getValueType() != VT) {
13918 LLVM_DEBUG(
13919 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
13920 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
13921 }
13922
13923 // This is actually a DUPLANExx operation, which keeps everything in vector registers.
13924
13925 SDValue Lane = Value.getOperand(1);
13926 Value = Value.getOperand(0);
13927 if (Value.getValueSizeInBits() == 64) {
13928 LLVM_DEBUG(
13929 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
13930 "widening it\n");
13931 Value = WidenVector(Value, DAG);
13932 }
13933
13934 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
13935 return DAG.getNode(Opcode, dl, VT, Value, Lane);
13936 }
13937
13938 if (VT.getVectorElementType().isFloatingPoint()) {
13939 SmallVector<SDValue, 8> Ops;
13940 EVT EltTy = VT.getVectorElementType();
13941 assert((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
13942 EltTy == MVT::f64) && "Unsupported floating-point vector type");
13943 LLVM_DEBUG(
13944 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
13945 "BITCASTS, and try again\n");
13946 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
13947 for (unsigned i = 0; i < NumElts; ++i)
13948 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
13949 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
13950 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
13951 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
13952 Val.dump(););
13953 Val = LowerBUILD_VECTOR(Val, DAG);
13954 if (Val.getNode())
13955 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
13956 }
13957 }
13958
13959 // If we need to insert a small number of different non-constant elements and
13960 // the vector width is sufficiently large, prefer using DUP with the common
13961 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
13962 // skip the constant lane handling below.
13963 bool PreferDUPAndInsert =
13964 !isConstant && NumDifferentLanes >= 1 &&
13965 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
13966 NumDifferentLanes >= NumConstantLanes;
13967
13968 // If only one distinct constant value was used (possibly for more than one lane),
13969 // start by splatting that value, then replace the non-constant lanes. This
13970 // is better than the default, which will perform a separate initialization
13971 // for each lane.
13972 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
13973 // Firstly, try to materialize the splat constant.
13974 SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue);
13975 unsigned BitSize = VT.getScalarSizeInBits();
13976 APInt ConstantValueAPInt(1, 0);
13977 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
13978 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
13979 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
13980 !ConstantValueAPInt.isAllOnes()) {
13981 Val = ConstantBuildVector(Val, DAG, Subtarget);
13982 if (!Val)
13983 // Otherwise, materialize the constant and splat it.
13984 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
13985 }
13986
13987 // Now insert the non-constant lanes.
13988 for (unsigned i = 0; i < NumElts; ++i) {
13989 SDValue V = Op.getOperand(i);
13990 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13991 if (!isIntOrFPConstant(V))
13992 // Note that type legalization likely mucked about with the VT of the
13993 // source operand, so we may have to convert it here before inserting.
13994 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
13995 }
13996 return Val;
13997 }
13998
13999 // This will generate a load from the constant pool.
14000 if (isConstant) {
14001 LLVM_DEBUG(
14002 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
14003 "expansion\n");
14004 return SDValue();
14005 }
14006
14007 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
14008 // v4i32s. This is really a truncate, which we can construct out of (legal)
14009 // concats and truncate nodes.
14010 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
14011 return M;
14012
14013 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
14014 if (NumElts >= 4) {
14015 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
14016 return Shuffle;
14017
14018 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
14019 return Shuffle;
14020 }
14021
14022 if (PreferDUPAndInsert) {
14023 // First, build a constant vector with the common element.
14024 SmallVector<SDValue, 8> Ops(NumElts, Value);
14025 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
14026 // Next, insert the elements that do not match the common value.
14027 for (unsigned I = 0; I < NumElts; ++I)
14028 if (Op.getOperand(I) != Value)
14029 NewVector =
14030 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
14031 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
14032
14033 return NewVector;
14034 }
14035
14036 // If vector consists of two different values, try to generate two DUPs and
14037 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
14038 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
14039 SmallVector<SDValue, 2> Vals;
14040 // Check whether each value's consecutive count is half the number of
14041 // vector elements. In this case, we can use CONCAT_VECTORS. For example,
14042 //
14043 // canUseVECTOR_CONCAT = true;
14044 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
14045 // t24, t24, t24, t24, t24, t24, t24, t24
14046 //
14047 // canUseVECTOR_CONCAT = false;
14048 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
14049 // t24, t24, t24, t24, t24, t24, t24, t24
14050 bool canUseVECTOR_CONCAT = true;
14051 for (auto Pair : DifferentValueMap) {
14052 // Check that each different value has a consecutive count of NumElts / 2.
14053 if (Pair.second != NumElts / 2)
14054 canUseVECTOR_CONCAT = false;
14055 Vals.push_back(Pair.first);
14056 }
14057
14058 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
14059 // CONCAT_VECTORs. For example,
14060 //
14061 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
14062 // t24, t24, t24, t24, t24, t24, t24, t24
14063 // ==>
14064 // t26: v8i8 = AArch64ISD::DUP t23
14065 // t28: v8i8 = AArch64ISD::DUP t24
14066 // t29: v16i8 = concat_vectors t26, t28
14067 if (canUseVECTOR_CONCAT) {
14068 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14069 if (isTypeLegal(SubVT) && SubVT.isVector() &&
14070 SubVT.getVectorNumElements() >= 2) {
14071 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
14072 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
14073 SDValue DUP1 =
14074 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
14075 SDValue DUP2 =
14076 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
14077 SDValue CONCAT_VECTORS =
14078 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
14079 return CONCAT_VECTORS;
14080 }
14081 }
14082
14083 // Let's try to generate VECTOR_SHUFFLE. For example,
14084 //
14085 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
14086 // ==>
14087 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
14088 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
14089 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
14090 if (NumElts >= 8) {
14091 SmallVector<int, 16> MaskVec;
14092 // Build mask for VECTOR_SHUFFLE.
14093 SDValue FirstLaneVal = Op.getOperand(0);
14094 for (unsigned i = 0; i < NumElts; ++i) {
14095 SDValue Val = Op.getOperand(i);
14096 if (FirstLaneVal == Val)
14097 MaskVec.push_back(i);
14098 else
14099 MaskVec.push_back(i + NumElts);
14100 }
14101
14102 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
14103 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
14104 SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1);
14105 SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2);
14106 SDValue VECTOR_SHUFFLE =
14107 DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec);
14108 return VECTOR_SHUFFLE;
14109 }
14110 }
14111
14112 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
14113 // know the default expansion would otherwise fall back on something even
14114 // worse. For a vector with one or two non-undef values, that's
14115 // scalar_to_vector for the elements followed by a shuffle (provided the
14116 // shuffle is valid for the target) and materialization element by element
14117 // on the stack followed by a load for everything else.
14118 if (!isConstant && !usesOnlyOneValue) {
14119 LLVM_DEBUG(
14120 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
14121 "of INSERT_VECTOR_ELT\n");
14122
14123 SDValue Vec = DAG.getUNDEF(VT);
14124 SDValue Op0 = Op.getOperand(0);
14125 unsigned i = 0;
14126
14127 // Use SCALAR_TO_VECTOR for lane zero to
14128 // a) Avoid a RMW dependency on the full vector register, and
14129 // b) Allow the register coalescer to fold away the copy if the
14130 // value is already in an S or D register, and we're forced to emit an
14131 // INSERT_SUBREG that we can't fold anywhere.
14132 //
14133 // We also allow types like i8 and i16 which are illegal scalar but legal
14134 // vector element types. After type-legalization the inserted value is
14135 // extended (i32) and it is safe to cast them to the vector type by ignoring
14136 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
14137 if (!Op0.isUndef()) {
14138 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
14139 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
14140 ++i;
14141 }
14142 LLVM_DEBUG(if (i < NumElts) dbgs()
14143 << "Creating nodes for the other vector elements:\n";);
14144 for (; i < NumElts; ++i) {
14145 SDValue V = Op.getOperand(i);
14146 if (V.isUndef())
14147 continue;
14148 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
14149 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
14150 }
14151 return Vec;
14152 }
14153
14154 LLVM_DEBUG(
14155 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
14156 "better alternative\n");
14157 return SDValue();
14158}
14159
14160SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
14161 SelectionDAG &DAG) const {
14162 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
14163 !Subtarget->isNeonAvailable()))
14164 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
14165
14166 assert(Op.getValueType().isScalableVector() &&
14167 isTypeLegal(Op.getValueType()) &&
14168 "Expected legal scalable vector type!");
14169
14170 if (isTypeLegal(Op.getOperand(0).getValueType())) {
14171 unsigned NumOperands = Op->getNumOperands();
14172 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
14173 "Unexpected number of operands in CONCAT_VECTORS");
14174
14175 if (NumOperands == 2)
14176 return Op;
14177
14178 // Concat each pair of subvectors and pack into the lower half of the array.
14179 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
14180 while (ConcatOps.size() > 1) {
14181 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
14182 SDValue V1 = ConcatOps[I];
14183 SDValue V2 = ConcatOps[I + 1];
14184 EVT SubVT = V1.getValueType();
14185 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
14186 ConcatOps[I / 2] =
14187 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
14188 }
14189 ConcatOps.resize(ConcatOps.size() / 2);
14190 }
14191 return ConcatOps[0];
14192 }
14193
14194 return SDValue();
14195}
14196
14197SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
14198 SelectionDAG &DAG) const {
14199 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
14200
14201 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
14202 !Subtarget->isNeonAvailable()))
14203 return LowerFixedLengthInsertVectorElt(Op, DAG);
14204
14205 EVT VT = Op.getOperand(0).getValueType();
14206
14207 if (VT.getScalarType() == MVT::i1) {
14208 EVT VectorVT = getPromotedVTForPredicate(VT);
14209 SDLoc DL(Op);
14210 SDValue ExtendedVector =
14211 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
14212 SDValue ExtendedValue =
14213 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
14214 VectorVT.getScalarType().getSizeInBits() < 32
14215 ? MVT::i32
14216 : VectorVT.getScalarType());
14217 ExtendedVector =
14218 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
14219 ExtendedValue, Op.getOperand(2));
14220 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
14221 }
14222
14223 // Check for non-constant or out of range lane.
14224 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
14225 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
14226 return SDValue();
14227
14228 return Op;
14229}
14230
14231SDValue
14232AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
14233 SelectionDAG &DAG) const {
14234 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
14235 EVT VT = Op.getOperand(0).getValueType();
14236
14237 if (VT.getScalarType() == MVT::i1) {
14238 // We can't directly extract from an SVE predicate; extend it first.
14239 // (This isn't the only possible lowering, but it's straightforward.)
14240 EVT VectorVT = getPromotedVTForPredicate(VT);
14241 SDLoc DL(Op);
14242 SDValue Extend =
14243 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
14244 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
14245 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
14246 Extend, Op.getOperand(1));
14247 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
14248 }
14249
14250 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14251 return LowerFixedLengthExtractVectorElt(Op, DAG);
14252
14253 // Check for non-constant or out of range lane.
14254 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
14255 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
14256 return SDValue();
14257
14258 // Insertion/extraction are legal for V128 types.
14259 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
14260 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
14261 VT == MVT::v8f16 || VT == MVT::v8bf16)
14262 return Op;
14263
14264 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
14265 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
14266 VT != MVT::v4bf16)
14267 return SDValue();
14268
14269 // For V64 types, we perform extraction by expanding the value
14270 // to a V128 type and performing the extraction on that.
14271 SDLoc DL(Op);
14272 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
14273 EVT WideTy = WideVec.getValueType();
14274
14275 EVT ExtrTy = WideTy.getVectorElementType();
14276 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
14277 ExtrTy = MVT::i32;
14278
14279 // For extractions, we just return the result directly.
14280 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
14281 Op.getOperand(1));
14282}
14283
14284SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
14285 SelectionDAG &DAG) const {
14286 EVT VT = Op.getValueType();
14287 assert(VT.isFixedLengthVector() &&
14288 "Only cases that extract a fixed length vector are supported!");
14289 EVT InVT = Op.getOperand(0).getValueType();
14290
14291 // If we don't have legal types yet, do nothing
14292 if (!isTypeLegal(InVT))
14293 return SDValue();
14294
14295 if (InVT.is128BitVector()) {
14296 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
14297 unsigned Idx = Op.getConstantOperandVal(1);
14298
14299 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
14300 if (Idx == 0)
14301 return Op;
14302
14303 // If this is extracting the upper 64-bits of a 128-bit vector, we match
14304 // that directly.
14305 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
14306 return Op;
14307 }
14308
14309 if (InVT.isScalableVector() ||
14310 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
14311 SDLoc DL(Op);
14312 SDValue Vec = Op.getOperand(0);
14313 SDValue Idx = Op.getOperand(1);
14314
14315 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
14316 if (PackedVT != InVT) {
14317 // Pack input into the bottom part of an SVE register and try again.
14318 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
14319 DAG.getUNDEF(PackedVT), Vec,
14320 DAG.getVectorIdxConstant(0, DL));
14321 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
14322 }
14323
14324 // This will get matched by custom code during ISelDAGToDAG.
14325 if (isNullConstant(Idx))
14326 return Op;
14327
14328 assert(InVT.isScalableVector() && "Unexpected vector type!");
14329 // Move requested subvector to the start of the vector and try again.
14330 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
14331 return convertFromScalableVector(DAG, VT, Splice);
14332 }
14333
14334 return SDValue();
14335}
14336
14337SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
14338 SelectionDAG &DAG) const {
14339 assert(Op.getValueType().isScalableVector() &&
14340 "Only expect to lower inserts into scalable vectors!");
14341
14342 EVT InVT = Op.getOperand(1).getValueType();
14343 unsigned Idx = Op.getConstantOperandVal(2);
14344
14345 SDValue Vec0 = Op.getOperand(0);
14346 SDValue Vec1 = Op.getOperand(1);
14347 SDLoc DL(Op);
14348 EVT VT = Op.getValueType();
14349
14350 if (InVT.isScalableVector()) {
14351 if (!isTypeLegal(VT))
14352 return SDValue();
14353
14354 // Break down insert_subvector into simpler parts.
14355 if (VT.getVectorElementType() == MVT::i1) {
14356 unsigned NumElts = VT.getVectorMinNumElements();
14357 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14358
14359 SDValue Lo, Hi;
14360 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
14361 DAG.getVectorIdxConstant(0, DL));
14362 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
14363 DAG.getVectorIdxConstant(NumElts / 2, DL));
14364 if (Idx < (NumElts / 2))
14365 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
14366 DAG.getVectorIdxConstant(Idx, DL));
14367 else
14368 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
14369 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
14370
14371 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
14372 }
14373
14374 // Ensure the subvector is half the size of the main vector.
14375 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
14376 return SDValue();
14377
14378 // Here "narrow" and "wide" refer to the vector element types. After "casting",
14379 // both vectors must have the same bit length, and so because the subvector
14380 // has fewer elements, those elements need to be bigger.
14381 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
14382 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
14383
14384 // NOP cast operands to the largest legal vector of the same element count.
14385 if (VT.isFloatingPoint()) {
14386 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
14387 Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
14388 } else {
14389 // Legal integer vectors are already their largest so Vec0 is fine as is.
14390 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
14391 }
14392
14393 // To replace the top/bottom half of vector V with vector SubV we widen the
14394 // preserved half of V, concatenate this to SubV (the order depending on the
14395 // half being replaced) and then narrow the result.
14396 SDValue Narrow;
14397 if (Idx == 0) {
14398 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
14399 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
14400 } else {
14401 assert(Idx == InVT.getVectorMinNumElements() &&
14402 "Invalid subvector index!");
14403 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
14404 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
14405 }
14406
14407 return getSVESafeBitCast(VT, Narrow, DAG);
14408 }
14409
14410 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
14411 // This will be matched by custom code during ISelDAGToDAG.
14412 if (Vec0.isUndef())
14413 return Op;
14414
14415 std::optional<unsigned> PredPattern =
14416 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
14417 auto PredTy = VT.changeVectorElementType(MVT::i1);
14418 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
14419 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
14420 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
14421 }
14422
14423 return SDValue();
14424}
14425
14426static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
14427 if (Op.getOpcode() != AArch64ISD::DUP &&
14428 Op.getOpcode() != ISD::SPLAT_VECTOR &&
14429 Op.getOpcode() != ISD::BUILD_VECTOR)
14430 return false;
14431
14432 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
14433 !isAllConstantBuildVector(Op, SplatVal))
14434 return false;
14435
14436 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
14437 !isa<ConstantSDNode>(Op->getOperand(0)))
14438 return false;
14439
14440 SplatVal = Op->getConstantOperandVal(0);
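 // Sub-64-bit splat values come back zero-extended, so reinterpret them as
 // signed 32-bit values to recognise negative powers of two below.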
14441 if (Op.getValueType().getVectorElementType() != MVT::i64)
14442 SplatVal = (int32_t)SplatVal;
14443
14444 Negated = false;
14445 if (isPowerOf2_64(SplatVal))
14446 return true;
14447
14448 Negated = true;
14449 if (isPowerOf2_64(-SplatVal)) {
14450 SplatVal = -SplatVal;
14451 return true;
14452 }
14453
14454 return false;
14455}
14456
14457SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
14458 EVT VT = Op.getValueType();
14459 SDLoc dl(Op);
14460
14461 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
14462 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
14463
14464 assert(VT.isScalableVector() && "Expected a scalable vector.");
14465
14466 bool Signed = Op.getOpcode() == ISD::SDIV;
14467 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
14468
14469 bool Negated;
14470 uint64_t SplatVal;
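 // A signed divide by a (possibly negated) power of two can use the SVE ASRD
 // instruction (SRAD_MERGE_OP1), negating the result when the divisor was
 // negative.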
14471 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
14472 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
14473 SDValue Res =
14474 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
14475 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
14476 if (Negated)
14477 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
14478
14479 return Res;
14480 }
14481
14482 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
14483 return LowerToPredicatedOp(Op, DAG, PredOpcode);
14484
14485 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
14486 // operations, and truncate the result.
14487 EVT WidenedVT;
14488 if (VT == MVT::nxv16i8)
14489 WidenedVT = MVT::nxv8i16;
14490 else if (VT == MVT::nxv8i16)
14491 WidenedVT = MVT::nxv4i32;
14492 else
14493 llvm_unreachable("Unexpected Custom DIV operation");
14494
14495 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
14496 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
14497 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
14498 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
14499 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
14500 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
14501 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
14502 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
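 // UZP1 on the original element type keeps the even-numbered (low) halves,
 // truncating each widened result and re-packing Lo and Hi into one vector.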
14503 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
14504}
14505
14506bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
14507 EVT VT, unsigned DefinedValues) const {
14508 if (!Subtarget->isNeonAvailable())
14509 return false;
14510 return DefinedValues < 3;
14511}
14512
14513bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
14514 // Currently no fixed length shuffles that require SVE are legal.
14515 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14516 return false;
14517
14518 if (VT.getVectorNumElements() == 4 &&
14519 (VT.is128BitVector() || VT.is64BitVector())) {
14520 unsigned Cost = getPerfectShuffleCost(M);
14521 if (Cost <= 1)
14522 return true;
14523 }
14524
14525 bool DummyBool;
14526 int DummyInt;
14527 unsigned DummyUnsigned;
14528
14529 unsigned EltSize = VT.getScalarSizeInBits();
14530 unsigned NumElts = VT.getVectorNumElements();
14531 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
14532 isREVMask(M, EltSize, NumElts, 64) ||
14533 isREVMask(M, EltSize, NumElts, 32) ||
14534 isREVMask(M, EltSize, NumElts, 16) ||
14535 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
14536 isTRNMask(M, NumElts, DummyUnsigned) ||
14537 isUZPMask(M, NumElts, DummyUnsigned) ||
14538 isZIPMask(M, NumElts, DummyUnsigned) ||
14539 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
14540 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
14541 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
14542 isINSMask(M, NumElts, DummyBool, DummyInt) ||
14543 isConcatMask(M, VT, VT.getSizeInBits() == 128));
14544}
14545
14546bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
14547 EVT VT) const {
14548 // Just delegate to the generic legality, clear masks aren't special.
14549 return isShuffleMaskLegal(M, VT);
14550}
14551
14552/// getVShiftImm - Check if this is a valid build_vector for the immediate
14553/// operand of a vector shift operation, where all the elements of the
14554/// build_vector must have the same constant integer value.
14555static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
14556 // Ignore bit_converts.
14557 while (Op.getOpcode() == ISD::BITCAST)
14558 Op = Op.getOperand(0);
14559 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
14560 APInt SplatBits, SplatUndef;
14561 unsigned SplatBitSize;
14562 bool HasAnyUndefs;
14563 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
14564 HasAnyUndefs, ElementBits) ||
14565 SplatBitSize > ElementBits)
14566 return false;
14567 Cnt = SplatBits.getSExtValue();
14568 return true;
14569}
14570
14571/// isVShiftLImm - Check if this is a valid build_vector for the immediate
14572/// operand of a vector shift left operation. That value must be in the range:
14573/// 0 <= Value < ElementBits for a left shift; or
14574/// 0 <= Value <= ElementBits for a long left shift.
14575static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
14576 assert(VT.isVector() && "vector shift count is not a vector type");
14577 int64_t ElementBits = VT.getScalarSizeInBits();
14578 if (!getVShiftImm(Op, ElementBits, Cnt))
14579 return false;
14580 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
14581}
14582
14583/// isVShiftRImm - Check if this is a valid build_vector for the immediate
14584/// operand of a vector shift right operation. The value must be in the range:
14585 /// 1 <= Value <= ElementBits for a right shift; or ElementBits/2 for a narrow right shift.
14586static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
14587 assert(VT.isVector() && "vector shift count is not a vector type");
14588 int64_t ElementBits = VT.getScalarSizeInBits();
14589 if (!getVShiftImm(Op, ElementBits, Cnt))
14590 return false;
14591 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
14592}
14593
14594SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
14595 SelectionDAG &DAG) const {
14596 EVT VT = Op.getValueType();
14597
14598 if (VT.getScalarType() == MVT::i1) {
14599 // Lower i1 truncate to `(x & 1) != 0`.
14600 SDLoc dl(Op);
14601 EVT OpVT = Op.getOperand(0).getValueType();
14602 SDValue Zero = DAG.getConstant(0, dl, OpVT);
14603 SDValue One = DAG.getConstant(1, dl, OpVT);
14604 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
14605 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
14606 }
14607
14608 if (!VT.isVector() || VT.isScalableVector())
14609 return SDValue();
14610
14611 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14612 !Subtarget->isNeonAvailable()))
14613 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
14614
14615 return SDValue();
14616}
14617
14618 // Check if we can lower this SRL to a rounding shift instruction. ResVT is
14619 // possibly a truncated type; it tells how many bits of the value are to be
14620// used.
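// The pattern matched below is (srl (add X, 1 << (ShiftValue - 1)), ShiftValue),
// i.e. a rounding shift right of X by ShiftValue.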
14621static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
14622 SelectionDAG &DAG,
14623 unsigned &ShiftValue,
14624 SDValue &RShOperand) {
14625 if (Shift->getOpcode() != ISD::SRL)
14626 return false;
14627
14628 EVT VT = Shift.getValueType();
14629 assert(VT.isScalableVT());
14630
14631 auto ShiftOp1 =
14632 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
14633 if (!ShiftOp1)
14634 return false;
14635
14636 ShiftValue = ShiftOp1->getZExtValue();
14637 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
14638 return false;
14639
14640 SDValue Add = Shift->getOperand(0);
14641 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
14642 return false;
14643
14644 assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
14645 "ResVT must be truncated or same type as the shift.");
14646 // Check if an overflow can lead to incorrect results.
14647 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
14648 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
14649 return false;
14650
14651 auto AddOp1 =
14652 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
14653 if (!AddOp1)
14654 return false;
14655 uint64_t AddValue = AddOp1->getZExtValue();
14656 if (AddValue != 1ULL << (ShiftValue - 1))
14657 return false;
14658
14659 RShOperand = Add->getOperand(0);
14660 return true;
14661}
14662
14663SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
14664 SelectionDAG &DAG) const {
14665 EVT VT = Op.getValueType();
14666 SDLoc DL(Op);
14667 int64_t Cnt;
14668
14669 if (!Op.getOperand(1).getValueType().isVector())
14670 return Op;
14671 unsigned EltSize = VT.getScalarSizeInBits();
14672
14673 switch (Op.getOpcode()) {
14674 case ISD::SHL:
14675 if (VT.isScalableVector() ||
14676 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14677 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
14678
14679 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
14680 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
14681 DAG.getConstant(Cnt, DL, MVT::i32));
14682 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14683 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
14684 MVT::i32),
14685 Op.getOperand(0), Op.getOperand(1));
14686 case ISD::SRA:
14687 case ISD::SRL:
14688 if (VT.isScalableVector() &&
14689 (Subtarget->hasSVE2() ||
14690 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
14691 SDValue RShOperand;
14692 unsigned ShiftValue;
14693 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
14694 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
14695 getPredicateForVector(DAG, DL, VT), RShOperand,
14696 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
14697 }
14698
14699 if (VT.isScalableVector() ||
14700 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
14701 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
14702 : AArch64ISD::SRL_PRED;
14703 return LowerToPredicatedOp(Op, DAG, Opc);
14704 }
14705
14706 // Right shift immediate
14707 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
14708 unsigned Opc =
14709 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
14710 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
14711 DAG.getConstant(Cnt, DL, MVT::i32), Op->getFlags());
14712 }
14713
14714 // Right shift register. Note, there is not a shift right register
14715 // instruction, but the shift left register instruction takes a signed
14716 // value, where negative numbers specify a right shift.
14717 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
14718 : Intrinsic::aarch64_neon_ushl;
14719 // negate the shift amount
14720 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
14721 Op.getOperand(1));
14722 SDValue NegShiftLeft =
14723 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14724 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
14725 NegShift);
14726 return NegShiftLeft;
14727 }
14728
14729 llvm_unreachable("unexpected shift opcode");
14730}
14731
14732static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
14733 AArch64CC::CondCode CC, bool NoNans, EVT VT,
14734 const SDLoc &dl, SelectionDAG &DAG) {
14735 EVT SrcVT = LHS.getValueType();
14736 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
14737 "function only supposed to emit natural comparisons");
14738
14739 APInt SplatValue;
14740 APInt SplatUndef;
14741 unsigned SplatBitSize = 0;
14742 bool HasAnyUndefs;
14743
14744 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
14745 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
14746 SplatBitSize, HasAnyUndefs);
14747
14748 bool IsZero = IsCnst && SplatValue == 0;
14749 bool IsOne =
14750 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
14751 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
14752
14753 if (SrcVT.getVectorElementType().isFloatingPoint()) {
14754 switch (CC) {
14755 default:
14756 return SDValue();
14757 case AArch64CC::NE: {
14758 SDValue Fcmeq;
14759 if (IsZero)
14760 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14761 else
14762 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14763 return DAG.getNOT(dl, Fcmeq, VT);
14764 }
14765 case AArch64CC::EQ:
14766 if (IsZero)
14767 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14768 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14769 case AArch64CC::GE:
14770 if (IsZero)
14771 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
14772 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
14773 case AArch64CC::GT:
14774 if (IsZero)
14775 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
14776 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
14777 case AArch64CC::LE:
14778 if (!NoNans)
14779 return SDValue();
14780 // If we ignore NaNs then we can use the LS implementation.
14781 [[fallthrough]];
14782 case AArch64CC::LS:
14783 if (IsZero)
14784 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
14785 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
14786 case AArch64CC::LT:
14787 if (!NoNans)
14788 return SDValue();
14789 // If we ignore NaNs then we can use the MI implementation.
14790 [[fallthrough]];
14791 case AArch64CC::MI:
14792 if (IsZero)
14793 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
14794 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
14795 }
14796 }
14797
14798 switch (CC) {
14799 default:
14800 return SDValue();
14801 case AArch64CC::NE: {
14802 SDValue Cmeq;
14803 if (IsZero)
14804 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14805 else
14806 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14807 return DAG.getNOT(dl, Cmeq, VT);
14808 }
14809 case AArch64CC::EQ:
14810 if (IsZero)
14811 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14812 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14813 case AArch64CC::GE:
14814 if (IsZero)
14815 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
14816 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
14817 case AArch64CC::GT:
14818 if (IsZero)
14819 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
14820 if (IsMinusOne)
14821 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS, RHS);
14822 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
14823 case AArch64CC::LE:
14824 if (IsZero)
14825 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14826 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
14827 case AArch64CC::LS:
14828 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
14829 case AArch64CC::LO:
14830 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
14831 case AArch64CC::LT:
14832 if (IsZero)
14833 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
14834 if (IsOne)
14835 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14836 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
14837 case AArch64CC::HI:
14838 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
14839 case AArch64CC::HS:
14840 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
14841 }
14842}
14843
14844SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
14845 SelectionDAG &DAG) const {
14846 if (Op.getValueType().isScalableVector())
14847 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
14848
14849 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14850 !Subtarget->isNeonAvailable()))
14851 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
14852
14853 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
14854 SDValue LHS = Op.getOperand(0);
14855 SDValue RHS = Op.getOperand(1);
14856 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
14857 SDLoc dl(Op);
14858
14859 if (LHS.getValueType().getVectorElementType().isInteger()) {
14860 assert(LHS.getValueType() == RHS.getValueType());
14861 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
14862 SDValue Cmp =
14863 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
14864 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14865 }
14866
14867 // Lower isnan(x) | isnan(never-nan) to x != x.
14868 // Lower !isnan(x) & !isnan(never-nan) to x == x.
14869 if (CC == ISD::SETUO || CC == ISD::SETO) {
14870 bool OneNaN = false;
14871 if (LHS == RHS) {
14872 OneNaN = true;
14873 } else if (DAG.isKnownNeverNaN(RHS)) {
14874 OneNaN = true;
14875 RHS = LHS;
14876 } else if (DAG.isKnownNeverNaN(LHS)) {
14877 OneNaN = true;
14878 LHS = RHS;
14879 }
14880 if (OneNaN) {
14881 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
14882 }
14883 }
14884
14885 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
14886
14887 // Make v4f16 (only) fcmp operations utilise vector instructions
14888 // v8f16 support will be a little more complicated
14889 if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
14890 LHS.getValueType().getVectorElementType() == MVT::bf16) {
14891 if (LHS.getValueType().getVectorNumElements() == 4) {
14892 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
14893 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
14894 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
14895 DAG.ReplaceAllUsesWith(Op, NewSetcc);
14896 CmpVT = MVT::v4i32;
14897 } else
14898 return SDValue();
14899 }
14900
14901 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
14902 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
14903 LHS.getValueType().getVectorElementType() != MVT::f128);
14904
14905 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
14906 // clean. Some of them require two branches to implement.
14907 AArch64CC::CondCode CC1, CC2;
14908 bool ShouldInvert;
14909 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
14910
14911 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
14912 SDValue Cmp =
14913 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
14914 if (!Cmp.getNode())
14915 return SDValue();
14916
14917 if (CC2 != AArch64CC::AL) {
14918 SDValue Cmp2 =
14919 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
14920 if (!Cmp2.getNode())
14921 return SDValue();
14922
14923 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
14924 }
14925
14926 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14927
14928 if (ShouldInvert)
14929 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
14930
14931 return Cmp;
14932}
14933
14934static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
14935 SelectionDAG &DAG) {
14936 SDValue VecOp = ScalarOp.getOperand(0);
14937 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
14938 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
14939 DAG.getConstant(0, DL, MVT::i64));
14940}
14941
14942static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
14943 SDLoc DL, SelectionDAG &DAG) {
14944 unsigned ScalarOpcode;
14945 switch (Opcode) {
14946 case ISD::VECREDUCE_AND:
14947 ScalarOpcode = ISD::AND;
14948 break;
14949 case ISD::VECREDUCE_OR:
14950 ScalarOpcode = ISD::OR;
14951 break;
14952 case ISD::VECREDUCE_XOR:
14953 ScalarOpcode = ISD::XOR;
14954 break;
14955 default:
14956 llvm_unreachable("Expected bitwise vector reduction");
14957 return SDValue();
14958 }
14959
14960 EVT VecVT = Vec.getValueType();
14961 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
14962 "Expected power-of-2 length vector");
14963
14964 EVT ElemVT = VecVT.getVectorElementType();
14965
14966 SDValue Result;
14967 unsigned NumElems = VecVT.getVectorNumElements();
14968
14969 // Special case for boolean reductions
14970 if (ElemVT == MVT::i1) {
14971 // Split large vectors into smaller ones
14972 if (NumElems > 16) {
14973 SDValue Lo, Hi;
14974 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14975 EVT HalfVT = Lo.getValueType();
14976 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
14977 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
14978 }
14979
14980 // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
14981 // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
14982 // this element size leads to the best codegen, since e.g. setcc results
14983 // might need to be truncated otherwise.
14984 EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
14985
14986 // any_ext doesn't work with umin/umax, so only use it for uadd.
14987 unsigned ExtendOp =
14988 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
14989 SDValue Extended = DAG.getNode(
14990 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
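 // With AND and OR the i1 lanes are sign-extended to 0/-1 and reduced with an
 // unsigned min/max; XOR is any-extended and reduced with an add whose low bit
 // holds the parity.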
14991 switch (ScalarOpcode) {
14992 case ISD::AND:
14993 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
14994 break;
14995 case ISD::OR:
14996 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
14997 break;
14998 case ISD::XOR:
14999 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
15000 break;
15001 default:
15002 llvm_unreachable("Unexpected Opcode");
15003 }
15004
15005 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
15006 } else {
15007 // Iteratively split the vector in half and combine using the bitwise
15008 // operation until it fits in a 64 bit register.
15009 while (VecVT.getSizeInBits() > 64) {
15010 SDValue Lo, Hi;
15011 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
15012 VecVT = Lo.getValueType();
15013 NumElems = VecVT.getVectorNumElements();
15014 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
15015 }
15016
15017 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
15018
15019 // Do the remaining work on a scalar since it allows the code generator to
15020 // combine the shift and bitwise operation into one instruction and since
15021 // integer instructions can have higher throughput than vector instructions.
15022 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
15023
15024 // Iteratively combine the lower and upper halves of the scalar using the
15025 // bitwise operation, halving the relevant region of the scalar in each
15026 // iteration, until the relevant region is just one element of the original
15027 // vector.
15028 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
15029 SDValue ShiftAmount =
15030 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
15031 SDValue Shifted =
15032 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
15033 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
15034 }
15035
15036 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
15037 }
15038
15039 return DAG.getAnyExtOrTrunc(Result, DL, VT);
15040}
15041
15042SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
15043 SelectionDAG &DAG) const {
15044 SDValue Src = Op.getOperand(0);
15045
15046 // Try to lower fixed length reductions to SVE.
15047 EVT SrcVT = Src.getValueType();
15048 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
15049 Op.getOpcode() == ISD::VECREDUCE_AND ||
15050 Op.getOpcode() == ISD::VECREDUCE_OR ||
15051 Op.getOpcode() == ISD::VECREDUCE_XOR ||
15052 Op.getOpcode() == ISD::VECREDUCE_FADD ||
15053 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
15054 SrcVT.getVectorElementType() == MVT::i64);
15055 if (SrcVT.isScalableVector() ||
15056 useSVEForFixedLengthVectorVT(
15057 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
15058
15059 if (SrcVT.getVectorElementType() == MVT::i1)
15060 return LowerPredReductionToSVE(Op, DAG);
15061
15062 switch (Op.getOpcode()) {
15063 case ISD::VECREDUCE_ADD:
15064 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
15065 case ISD::VECREDUCE_AND:
15066 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
15067 case ISD::VECREDUCE_OR:
15068 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
15069 case ISD::VECREDUCE_SMAX:
15070 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
15071 case ISD::VECREDUCE_SMIN:
15072 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
15073 case ISD::VECREDUCE_UMAX:
15074 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
15075 case ISD::VECREDUCE_UMIN:
15076 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
15077 case ISD::VECREDUCE_XOR:
15078 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
15079 case ISD::VECREDUCE_FADD:
15080 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
15081 case ISD::VECREDUCE_FMAX:
15082 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
15083 case ISD::VECREDUCE_FMIN:
15084 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
15085 case ISD::VECREDUCE_FMAXIMUM:
15086 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
15087 case ISD::VECREDUCE_FMINIMUM:
15088 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
15089 default:
15090 llvm_unreachable("Unhandled fixed length reduction");
15091 }
15092 }
15093
15094 // Lower NEON reductions.
15095 SDLoc dl(Op);
15096 switch (Op.getOpcode()) {
15097 case ISD::VECREDUCE_AND:
15098 case ISD::VECREDUCE_OR:
15099 case ISD::VECREDUCE_XOR:
15100 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
15101 Op.getValueType(), dl, DAG);
15102 case ISD::VECREDUCE_ADD:
15103 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
15104 case ISD::VECREDUCE_SMAX:
15105 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
15106 case ISD::VECREDUCE_SMIN:
15107 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
15108 case ISD::VECREDUCE_UMAX:
15109 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
15110 case ISD::VECREDUCE_UMIN:
15111 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
15112 default:
15113 llvm_unreachable("Unhandled reduction");
15114 }
15115}
15116
15117SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
15118 SelectionDAG &DAG) const {
15119 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15120 // No point replacing if we don't have the relevant instruction/libcall anyway
15121 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
15122 return SDValue();
15123
15124 // LSE has an atomic load-clear instruction, but not a load-and.
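 // Lower (atomic_load_and addr, x) into (atomic_load_clr addr, (xor x, -1)).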
15125 SDLoc dl(Op);
15126 MVT VT = Op.getSimpleValueType();
15127 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
15128 SDValue RHS = Op.getOperand(2);
15129 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
15130 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
15131 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
15132 Op.getOperand(0), Op.getOperand(1), RHS,
15133 AN->getMemOperand());
15134}
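// Illustrative sketch (editorial, restating the transform above): an
//   %old = atomicrmw and ptr %p, i64 %v
// is emitted as an ATOMIC_LOAD_CLR of (xor %v, -1), because LDCLR atomically
// computes `mem & ~operand`, and mem & ~(~%v) == mem & %v, so clearing the
// complemented mask bits reproduces the original AND.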
15135
15136SDValue
15137AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
15138 SelectionDAG &DAG) const {
15139
15140 SDLoc dl(Op);
15141 // Get the inputs.
15142 SDNode *Node = Op.getNode();
15143 SDValue Chain = Op.getOperand(0);
15144 SDValue Size = Op.getOperand(1);
15145 MaybeAlign Align =
15146 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
15147 EVT VT = Node->getValueType(0);
15148
15149 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
15150 "no-stack-arg-probe")) {
15151 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
15152 Chain = SP.getValue(1);
15153 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
15154 if (Align)
15155 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
15156 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
15157 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
15158 SDValue Ops[2] = {SP, Chain};
15159 return DAG.getMergeValues(Ops, dl);
15160 }
15161
15162 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
15163
15164 EVT PtrVT = getPointerTy(DAG.getDataLayout());
15165 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
15166 PtrVT, 0);
15167
15168 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
15169 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
15170 if (Subtarget->hasCustomCallingConv())
15171 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
15172
15173 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
15174 DAG.getConstant(4, dl, MVT::i64));
15175 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
15176 Chain =
15177 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
15178 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
15179 DAG.getRegisterMask(Mask), Chain.getValue(1));
15180 // To match the actual intent better, we should read the output from X15 here
15181 // again (instead of potentially spilling it to the stack), but rereading Size
15182 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
15183 // here.
15184
15185 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
15186 DAG.getConstant(4, dl, MVT::i64));
15187
15188 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
15189 Chain = SP.getValue(1);
15190 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
15191 if (Align)
15192 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
15193 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
15194 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
15195
15196 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
15197
15198 SDValue Ops[2] = {SP, Chain};
15199 return DAG.getMergeValues(Ops, dl);
15200}
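// Editorial note (hedged): the SRL-by-4 before the call and the SHL-by-4 after
// it reflect the Windows AArch64 stack-probe convention, where the requested
// allocation size is passed to the probe helper (typically __chkstk) in X15 in
// units of 16 bytes; the result is scaled back to a byte count before SP is
// adjusted.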
15201
15202SDValue
15203AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
15204 SelectionDAG &DAG) const {
15205 // Get the inputs.
15206 SDNode *Node = Op.getNode();
15207 SDValue Chain = Op.getOperand(0);
15208 SDValue Size = Op.getOperand(1);
15209
15210 MaybeAlign Align =
15211 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
15212 SDLoc dl(Op);
15213 EVT VT = Node->getValueType(0);
15214
15215 // Construct the new SP value in a GPR.
15216 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
15217 Chain = SP.getValue(1);
15218 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
15219 if (Align)
15220 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
15221 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
15222
15223 // Set the real SP to the new value with a probing loop.
15224 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
15225 SDValue Ops[2] = {SP, Chain};
15226 return DAG.getMergeValues(Ops, dl);
15227}
15228
15229SDValue
15230AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
15231 SelectionDAG &DAG) const {
15232 MachineFunction &MF = DAG.getMachineFunction();
15233
15234 if (Subtarget->isTargetWindows())
15235 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
15236 else if (hasInlineStackProbe(MF))
15237 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
15238 else
15239 return SDValue();
15240}
15241
15242SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
15243 unsigned NewOp) const {
15244 if (Subtarget->hasSVE2())
15245 return LowerToPredicatedOp(Op, DAG, NewOp);
15246
15247 // Default to expand.
15248 return SDValue();
15249}
15250
15251SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
15252 SelectionDAG &DAG) const {
15253 EVT VT = Op.getValueType();
15254 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
15255
15256 SDLoc DL(Op);
15257 APInt MulImm = Op.getConstantOperandAPInt(0);
15258 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
15259 VT);
15260}
15261
15262/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
15263template <unsigned NumVecs>
15264static bool
15265setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
15266 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
15267 Info.opc = ISD::INTRINSIC_VOID;
15268 // Retrieve EC from first vector argument.
15269 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
15270 ElementCount EC = VT.getVectorElementCount();
15271#ifndef NDEBUG
15272 // Check the assumption that all input vectors are the same type.
15273 for (unsigned I = 0; I < NumVecs; ++I)
15274 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
15275 "Invalid type.");
15276#endif
15277 // memVT is `NumVecs * VT`.
15278 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
15279 EC * NumVecs);
15280 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
15281 Info.offset = 0;
15282 Info.align.reset();
15283 Info.flags = MachineMemOperand::MOStore;
15284 return true;
15285}
15286
15287/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
15288/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
15289/// specified in the intrinsic calls.
15290bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
15291 const CallInst &I,
15292 MachineFunction &MF,
15293 unsigned Intrinsic) const {
15294 auto &DL = I.getDataLayout();
15295 switch (Intrinsic) {
15296 case Intrinsic::aarch64_sve_st2:
15297 return setInfoSVEStN<2>(*this, DL, Info, I);
15298 case Intrinsic::aarch64_sve_st3:
15299 return setInfoSVEStN<3>(*this, DL, Info, I);
15300 case Intrinsic::aarch64_sve_st4:
15301 return setInfoSVEStN<4>(*this, DL, Info, I);
15302 case Intrinsic::aarch64_neon_ld2:
15303 case Intrinsic::aarch64_neon_ld3:
15304 case Intrinsic::aarch64_neon_ld4:
15305 case Intrinsic::aarch64_neon_ld1x2:
15306 case Intrinsic::aarch64_neon_ld1x3:
15307 case Intrinsic::aarch64_neon_ld1x4: {
15308 Info.opc = ISD::INTRINSIC_W_CHAIN;
15309 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
15310 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
15311 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
15312 Info.offset = 0;
15313 Info.align.reset();
15314 // volatile loads with NEON intrinsics not supported
15315 Info.flags = MachineMemOperand::MOLoad;
15316 return true;
15317 }
15318 case Intrinsic::aarch64_neon_ld2lane:
15319 case Intrinsic::aarch64_neon_ld3lane:
15320 case Intrinsic::aarch64_neon_ld4lane:
15321 case Intrinsic::aarch64_neon_ld2r:
15322 case Intrinsic::aarch64_neon_ld3r:
15323 case Intrinsic::aarch64_neon_ld4r: {
15324 Info.opc = ISD::INTRINSIC_W_CHAIN;
15325 // ldx return struct with the same vec type
15326 Type *RetTy = I.getType();
15327 auto *StructTy = cast<StructType>(RetTy);
15328 unsigned NumElts = StructTy->getNumElements();
15329 Type *VecTy = StructTy->getElementType(0);
15330 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
15331 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
15332 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
15333 Info.offset = 0;
15334 Info.align.reset();
15335 // volatile loads with NEON intrinsics not supported
15336 Info.flags = MachineMemOperand::MOLoad;
15337 return true;
15338 }
15339 case Intrinsic::aarch64_neon_st2:
15340 case Intrinsic::aarch64_neon_st3:
15341 case Intrinsic::aarch64_neon_st4:
15342 case Intrinsic::aarch64_neon_st1x2:
15343 case Intrinsic::aarch64_neon_st1x3:
15344 case Intrinsic::aarch64_neon_st1x4: {
15345 Info.opc = ISD::INTRINSIC_VOID;
15346 unsigned NumElts = 0;
15347 for (const Value *Arg : I.args()) {
15348 Type *ArgTy = Arg->getType();
15349 if (!ArgTy->isVectorTy())
15350 break;
15351 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
15352 }
15353 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
15354 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
15355 Info.offset = 0;
15356 Info.align.reset();
15357 // volatile stores with NEON intrinsics not supported
15358 Info.flags = MachineMemOperand::MOStore;
15359 return true;
15360 }
15361 case Intrinsic::aarch64_neon_st2lane:
15362 case Intrinsic::aarch64_neon_st3lane:
15363 case Intrinsic::aarch64_neon_st4lane: {
15364 Info.opc = ISD::INTRINSIC_VOID;
15365 unsigned NumElts = 0;
15366 // all the vector type is same
15367 Type *VecTy = I.getArgOperand(0)->getType();
15368 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
15369
15370 for (const Value *Arg : I.args()) {
15371 Type *ArgTy = Arg->getType();
15372 if (!ArgTy->isVectorTy())
15373 break;
15374 NumElts += 1;
15375 }
15376
15377 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
15378 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
15379 Info.offset = 0;
15380 Info.align.reset();
15381 // volatile stores with NEON intrinsics not supported
15382 Info.flags = MachineMemOperand::MOStore;
15383 return true;
15384 }
15385 case Intrinsic::aarch64_ldaxr:
15386 case Intrinsic::aarch64_ldxr: {
15387 Type *ValTy = I.getParamElementType(0);
15388 Info.opc = ISD::INTRINSIC_W_CHAIN;
15389 Info.memVT = MVT::getVT(ValTy);
15390 Info.ptrVal = I.getArgOperand(0);
15391 Info.offset = 0;
15392 Info.align = DL.getABITypeAlign(ValTy);
15393 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
15394 return true;
15395 }
15396 case Intrinsic::aarch64_stlxr:
15397 case Intrinsic::aarch64_stxr: {
15398 Type *ValTy = I.getParamElementType(1);
15399 Info.opc = ISD::INTRINSIC_W_CHAIN;
15400 Info.memVT = MVT::getVT(ValTy);
15401 Info.ptrVal = I.getArgOperand(1);
15402 Info.offset = 0;
15403 Info.align = DL.getABITypeAlign(ValTy);
15404 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
15405 return true;
15406 }
15407 case Intrinsic::aarch64_ldaxp:
15408 case Intrinsic::aarch64_ldxp:
15409 Info.opc = ISD::INTRINSIC_W_CHAIN;
15410 Info.memVT = MVT::i128;
15411 Info.ptrVal = I.getArgOperand(0);
15412 Info.offset = 0;
15413 Info.align = Align(16);
15414 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
15415 return true;
15416 case Intrinsic::aarch64_stlxp:
15417 case Intrinsic::aarch64_stxp:
15418 Info.opc = ISD::INTRINSIC_W_CHAIN;
15419 Info.memVT = MVT::i128;
15420 Info.ptrVal = I.getArgOperand(2);
15421 Info.offset = 0;
15422 Info.align = Align(16);
15423 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
15424 return true;
15425 case Intrinsic::aarch64_sve_ldnt1: {
15426 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
15427 Info.opc = ISD::INTRINSIC_W_CHAIN;
15428 Info.memVT = MVT::getVT(I.getType());
15429 Info.ptrVal = I.getArgOperand(1);
15430 Info.offset = 0;
15431 Info.align = DL.getABITypeAlign(ElTy);
15432 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
15433 return true;
15434 }
15435 case Intrinsic::aarch64_sve_stnt1: {
15436 Type *ElTy =
15437 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
15438 Info.opc = ISD::INTRINSIC_W_CHAIN;
15439 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
15440 Info.ptrVal = I.getArgOperand(2);
15441 Info.offset = 0;
15442 Info.align = DL.getABITypeAlign(ElTy);
15443 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
15444 return true;
15445 }
15446 case Intrinsic::aarch64_mops_memset_tag: {
15447 Value *Dst = I.getArgOperand(0);
15448 Value *Val = I.getArgOperand(1);
15449 Info.opc = ISD::INTRINSIC_W_CHAIN;
15450 Info.memVT = MVT::getVT(Val->getType());
15451 Info.ptrVal = Dst;
15452 Info.offset = 0;
15453 Info.align = I.getParamAlign(0).valueOrOne();
15454 Info.flags = MachineMemOperand::MOStore;
15455 // The size of the memory being operated on is unknown at this point
15456 Info.size = MemoryLocation::UnknownSize;
15457 return true;
15458 }
15459 default:
15460 break;
15461 }
15462
15463 return false;
15464}
15465
15466bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
15467 ISD::LoadExtType ExtTy,
15468 EVT NewVT) const {
15469 // TODO: This may be worth removing. Check regression tests for diffs.
15470 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
15471 return false;
15472
15473 // If we're reducing the load width in order to avoid having to use an extra
15474 // instruction to do extension then it's probably a good idea.
15475 if (ExtTy != ISD::NON_EXTLOAD)
15476 return true;
15477 // Don't reduce load width if it would prevent us from combining a shift into
15478 // the offset.
15479 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
15480 assert(Mem);
15481 const SDValue &Base = Mem->getBasePtr();
15482 if (Base.getOpcode() == ISD::ADD &&
15483 Base.getOperand(1).getOpcode() == ISD::SHL &&
15484 Base.getOperand(1).hasOneUse() &&
15485 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
15486 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
15487 if (Mem->getMemoryVT().isScalableVector())
15488 return false;
15489 // The shift can be combined if it matches the size of the value being
15490 // loaded (and so reducing the width would make it not match).
15491 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
15492 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
15493 if (ShiftAmount == Log2_32(LoadBytes))
15494 return false;
15495 }
15496 // We have no reason to disallow reducing the load width, so allow it.
15497 return true;
15498}
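// Worked example (illustrative, not from the upstream source): for
//   %p = getelementptr i64, ptr %base, i64 %i   ; base + (%i << 3)
//   %v = load i64, ptr %p
// the shift amount 3 equals log2(8 bytes), so it folds into the scaled
// addressing mode, e.g. `ldr x0, [x1, x2, lsl #3]`; narrowing the load to i32
// would break that fold, which is why the hook above refuses to reduce the
// width in this case.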
15499
15500// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
15501bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
15502 EVT VT = Extend.getValueType();
15503 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
15504 SDValue Extract = Extend.getOperand(0);
15505 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
15506 Extract = Extract.getOperand(0);
15507 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
15508 EVT VecVT = Extract.getOperand(0).getValueType();
15509 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
15510 return false;
15511 }
15512 }
15513 return true;
15514}
15515
15516// Truncations from a 64-bit GPR to a 32-bit GPR are free.
15517bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
15518 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15519 return false;
15520 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
15521 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
15522 return NumBits1 > NumBits2;
15523}
15524bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
15525 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15526 return false;
15527 uint64_t NumBits1 = VT1.getFixedSizeInBits();
15528 uint64_t NumBits2 = VT2.getFixedSizeInBits();
15529 return NumBits1 > NumBits2;
15530}
15531
15532/// Check if it is profitable to hoist instruction in then/else to if.
15533/// Not profitable if I and its user can form an FMA instruction
15534/// because we prefer FMSUB/FMADD.
15535bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
15536 if (I->getOpcode() != Instruction::FMul)
15537 return true;
15538
15539 if (!I->hasOneUse())
15540 return true;
15541
15542 Instruction *User = I->user_back();
15543
15544 if (!(User->getOpcode() == Instruction::FSub ||
15545 User->getOpcode() == Instruction::FAdd))
15546 return true;
15547
15548 const TargetOptions &Options = getTargetMachine().Options;
15549 const Function *F = I->getFunction();
15550 const DataLayout &DL = F->getDataLayout();
15551 Type *Ty = User->getOperand(0)->getType();
15552
15553 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
15554 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
15555 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
15556 Options.UnsafeFPMath));
15557}
15558
15559// All 32-bit GPR operations implicitly zero the high-half of the corresponding
15560// 64-bit GPR.
15561bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
15562 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15563 return false;
15564 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
15565 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
15566 return NumBits1 == 32 && NumBits2 == 64;
15567}
15568bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
15569 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15570 return false;
15571 unsigned NumBits1 = VT1.getSizeInBits();
15572 unsigned NumBits2 = VT2.getSizeInBits();
15573 return NumBits1 == 32 && NumBits2 == 64;
15574}
15575
15576bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
15577 EVT VT1 = Val.getValueType();
15578 if (isZExtFree(VT1, VT2)) {
15579 return true;
15580 }
15581
15582 if (Val.getOpcode() != ISD::LOAD)
15583 return false;
15584
15585 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
15586 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
15587 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
15588 VT1.getSizeInBits() <= 32);
15589}
15590
15591bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
15592 if (isa<FPExtInst>(Ext))
15593 return false;
15594
15595 // Vector types are not free.
15596 if (Ext->getType()->isVectorTy())
15597 return false;
15598
15599 for (const Use &U : Ext->uses()) {
15600 // The extension is free if we can fold it with a left shift in an
15601 // addressing mode or an arithmetic operation: add, sub, and cmp.
15602
15603 // Is there a shift?
15604 const Instruction *Instr = cast<Instruction>(U.getUser());
15605
15606 // Is this a constant shift?
15607 switch (Instr->getOpcode()) {
15608 case Instruction::Shl:
15609 if (!isa<ConstantInt>(Instr->getOperand(1)))
15610 return false;
15611 break;
15612 case Instruction::GetElementPtr: {
15613 gep_type_iterator GTI = gep_type_begin(Instr);
15614 auto &DL = Ext->getDataLayout();
15615 std::advance(GTI, U.getOperandNo()-1);
15616 Type *IdxTy = GTI.getIndexedType();
15617 // This extension will end up with a shift because of the scaling factor.
15618 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
15619 // Get the shift amount based on the scaling factor:
15620 // log2(sizeof(IdxTy)) - log2(8).
15621 if (IdxTy->isScalableTy())
15622 return false;
15623 uint64_t ShiftAmt =
15624 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
15625 3;
15626 // Is the constant foldable in the shift of the addressing mode?
15627 // I.e., shift amount is between 1 and 4 inclusive.
15628 if (ShiftAmt == 0 || ShiftAmt > 4)
15629 return false;
15630 break;
15631 }
15632 case Instruction::Trunc:
15633 // Check if this is a noop.
15634 // trunc(sext ty1 to ty2) to ty1.
15635 if (Instr->getType() == Ext->getOperand(0)->getType())
15636 continue;
15637 [[fallthrough]];
15638 default:
15639 return false;
15640 }
15641
15642 // At this point we can use the bfm family, so this extension is free
15643 // for that use.
15644 }
15645 return true;
15646}
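// Illustrative example (editorial): a `sext i32 %i to i64` feeding a GEP over
// i32 elements has an implied shift of 2, which falls in the 1..4 range checked
// above and can be folded into the scaled-register addressing mode, e.g.
//   ldr w0, [x1, w2, sxtw #2]
// so that extension is considered free for that use.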
15647
15648static bool isSplatShuffle(Value *V) {
15649 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
15650 return all_equal(Shuf->getShuffleMask());
15651 return false;
15652}
15653
15654/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
15655/// or upper half of the vector elements.
15656static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
15657 bool AllowSplat = false) {
15658 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
15659 auto *FullTy = FullV->getType();
15660 auto *HalfTy = HalfV->getType();
15661 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
15662 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
15663 };
15664
15665 auto extractHalf = [](Value *FullV, Value *HalfV) {
15666 auto *FullVT = cast<FixedVectorType>(FullV->getType());
15667 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
15668 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
15669 };
15670
15671 ArrayRef<int> M1, M2;
15672 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
15673 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
15674 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
15675 return false;
15676
15677 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
15678 // it is not checked as an extract below.
15679 if (AllowSplat && isSplatShuffle(Op1))
15680 S1Op1 = nullptr;
15681 if (AllowSplat && isSplatShuffle(Op2))
15682 S2Op1 = nullptr;
15683
15684 // Check that the operands are half as wide as the result and we extract
15685 // half of the elements of the input vectors.
15686 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
15687 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
15688 return false;
15689
15690 // Check the mask extracts either the lower or upper half of vector
15691 // elements.
15692 int M1Start = 0;
15693 int M2Start = 0;
15694 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
15695 if ((S1Op1 &&
15696 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
15697 (S2Op1 &&
15698 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
15699 return false;
15700
15701 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
15702 (M2Start != 0 && M2Start != (NumElements / 2)))
15703 return false;
15704 if (S1Op1 && S2Op1 && M1Start != M2Start)
15705 return false;
15706
15707 return true;
15708}
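// Example (illustrative sketch): for two <8 x i16> inputs %x and %y,
//   %a = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <4, 5, 6, 7>
//   %b = shufflevector <8 x i16> %y, <8 x i16> undef, <4 x i32> <4, 5, 6, 7>
// both operands are half as wide as their sources and extract the same (upper)
// half, so the predicate above returns true, allowing a widening operation on
// them to be matched by the high-half instruction forms (e.g. umull2/saddl2).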
15709
15710/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
15711/// of the vector elements.
15712static bool areExtractExts(Value *Ext1, Value *Ext2) {
15713 auto areExtDoubled = [](Instruction *Ext) {
15714 return Ext->getType()->getScalarSizeInBits() ==
15715 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
15716 };
15717
15718 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
15719 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
15720 !areExtDoubled(cast<Instruction>(Ext1)) ||
15721 !areExtDoubled(cast<Instruction>(Ext2)))
15722 return false;
15723
15724 return true;
15725}
15726
15727/// Check if Op could be used with vmull_high_p64 intrinsic.
15728static bool isOperandOfVmullHighP64(Value *Op) {
15729 Value *VectorOperand = nullptr;
15730 ConstantInt *ElementIndex = nullptr;
15731 return match(Op, m_ExtractElt(m_Value(VectorOperand),
15732 m_ConstantInt(ElementIndex))) &&
15733 ElementIndex->getValue() == 1 &&
15734 isa<FixedVectorType>(VectorOperand->getType()) &&
15735 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
15736}
15737
15738/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
15739static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
15740 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
15741}
15742
15743static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
15744 // Restrict ourselves to the form CodeGenPrepare typically constructs.
15745 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
15746 if (!GEP || GEP->getNumOperands() != 2)
15747 return false;
15748
15749 Value *Base = GEP->getOperand(0);
15750 Value *Offsets = GEP->getOperand(1);
15751
15752 // We only care about scalar_base+vector_offsets.
15753 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
15754 return false;
15755
15756 // Sink extends that would allow us to use 32-bit offset vectors.
15757 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
15758 auto *OffsetsInst = cast<Instruction>(Offsets);
15759 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
15760 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
15761 Ops.push_back(&GEP->getOperandUse(1));
15762 }
15763
15764 // Sink the GEP.
15765 return true;
15766}
15767
15768/// We want to sink following cases:
15769/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
15770/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
15771static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
15772 if (match(Op, m_VScale()))
15773 return true;
15774 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
15775 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
15776 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
15777 return true;
15778 }
15779 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
15780 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
15781 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
15782 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
15783 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
15784 return true;
15785 }
15786 return false;
15787}
15788
15789/// Check if sinking \p I's operands to I's basic block is profitable, because
15790/// the operands can be folded into a target instruction, e.g.
15791/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
15792bool AArch64TargetLowering::shouldSinkOperands(
15793 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
15794 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
15795 switch (II->getIntrinsicID()) {
15796 case Intrinsic::aarch64_neon_smull:
15797 case Intrinsic::aarch64_neon_umull:
15798 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
15799 /*AllowSplat=*/true)) {
15800 Ops.push_back(&II->getOperandUse(0));
15801 Ops.push_back(&II->getOperandUse(1));
15802 return true;
15803 }
15804 [[fallthrough]];
15805
15806 case Intrinsic::fma:
15807 if (isa<VectorType>(I->getType()) &&
15808 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
15809 !Subtarget->hasFullFP16())
15810 return false;
15811 [[fallthrough]];
15812 case Intrinsic::aarch64_neon_sqdmull:
15813 case Intrinsic::aarch64_neon_sqdmulh:
15814 case Intrinsic::aarch64_neon_sqrdmulh:
15815 // Sink splats for index lane variants
15816 if (isSplatShuffle(II->getOperand(0)))
15817 Ops.push_back(&II->getOperandUse(0));
15818 if (isSplatShuffle(II->getOperand(1)))
15819 Ops.push_back(&II->getOperandUse(1));
15820 return !Ops.empty();
15821 case Intrinsic::aarch64_neon_fmlal:
15822 case Intrinsic::aarch64_neon_fmlal2:
15823 case Intrinsic::aarch64_neon_fmlsl:
15824 case Intrinsic::aarch64_neon_fmlsl2:
15825 // Sink splats for index lane variants
15826 if (isSplatShuffle(II->getOperand(1)))
15827 Ops.push_back(&II->getOperandUse(1));
15828 if (isSplatShuffle(II->getOperand(2)))
15829 Ops.push_back(&II->getOperandUse(2));
15830 return !Ops.empty();
15831 case Intrinsic::aarch64_sve_ptest_first:
15832 case Intrinsic::aarch64_sve_ptest_last:
15833 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
15834 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
15835 Ops.push_back(&II->getOperandUse(0));
15836 return !Ops.empty();
15837 case Intrinsic::aarch64_sme_write_horiz:
15838 case Intrinsic::aarch64_sme_write_vert:
15839 case Intrinsic::aarch64_sme_writeq_horiz:
15840 case Intrinsic::aarch64_sme_writeq_vert: {
15841 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
15842 if (!Idx || Idx->getOpcode() != Instruction::Add)
15843 return false;
15844 Ops.push_back(&II->getOperandUse(1));
15845 return true;
15846 }
15847 case Intrinsic::aarch64_sme_read_horiz:
15848 case Intrinsic::aarch64_sme_read_vert:
15849 case Intrinsic::aarch64_sme_readq_horiz:
15850 case Intrinsic::aarch64_sme_readq_vert:
15851 case Intrinsic::aarch64_sme_ld1b_vert:
15852 case Intrinsic::aarch64_sme_ld1h_vert:
15853 case Intrinsic::aarch64_sme_ld1w_vert:
15854 case Intrinsic::aarch64_sme_ld1d_vert:
15855 case Intrinsic::aarch64_sme_ld1q_vert:
15856 case Intrinsic::aarch64_sme_st1b_vert:
15857 case Intrinsic::aarch64_sme_st1h_vert:
15858 case Intrinsic::aarch64_sme_st1w_vert:
15859 case Intrinsic::aarch64_sme_st1d_vert:
15860 case Intrinsic::aarch64_sme_st1q_vert:
15861 case Intrinsic::aarch64_sme_ld1b_horiz:
15862 case Intrinsic::aarch64_sme_ld1h_horiz:
15863 case Intrinsic::aarch64_sme_ld1w_horiz:
15864 case Intrinsic::aarch64_sme_ld1d_horiz:
15865 case Intrinsic::aarch64_sme_ld1q_horiz:
15866 case Intrinsic::aarch64_sme_st1b_horiz:
15867 case Intrinsic::aarch64_sme_st1h_horiz:
15868 case Intrinsic::aarch64_sme_st1w_horiz:
15869 case Intrinsic::aarch64_sme_st1d_horiz:
15870 case Intrinsic::aarch64_sme_st1q_horiz: {
15871 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
15872 if (!Idx || Idx->getOpcode() != Instruction::Add)
15873 return false;
15874 Ops.push_back(&II->getOperandUse(3));
15875 return true;
15876 }
15877 case Intrinsic::aarch64_neon_pmull:
15878 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
15879 return false;
15880 Ops.push_back(&II->getOperandUse(0));
15881 Ops.push_back(&II->getOperandUse(1));
15882 return true;
15883 case Intrinsic::aarch64_neon_pmull64:
15884 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
15885 II->getArgOperand(1)))
15886 return false;
15887 Ops.push_back(&II->getArgOperandUse(0));
15888 Ops.push_back(&II->getArgOperandUse(1));
15889 return true;
15890 case Intrinsic::masked_gather:
15891 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
15892 return false;
15893 Ops.push_back(&II->getArgOperandUse(0));
15894 return true;
15895 case Intrinsic::masked_scatter:
15896 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
15897 return false;
15898 Ops.push_back(&II->getArgOperandUse(1));
15899 return true;
15900 default:
15901 return false;
15902 }
15903 }
15904
15905 // Sink vscales closer to uses for better isel
15906 switch (I->getOpcode()) {
15907 case Instruction::GetElementPtr:
15908 case Instruction::Add:
15909 case Instruction::Sub:
15910 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
15911 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
15912 Ops.push_back(&I->getOperandUse(Op));
15913 return true;
15914 }
15915 }
15916 break;
15917 default:
15918 break;
15919 }
15920
15921 if (!I->getType()->isVectorTy())
15922 return false;
15923
15924 switch (I->getOpcode()) {
15925 case Instruction::Sub:
15926 case Instruction::Add: {
15927 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
15928 return false;
15929
15930 // If the exts' operands extract either the lower or upper elements, we
15931 // can sink them too.
15932 auto Ext1 = cast<Instruction>(I->getOperand(0));
15933 auto Ext2 = cast<Instruction>(I->getOperand(1));
15934 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
15935 Ops.push_back(&Ext1->getOperandUse(0));
15936 Ops.push_back(&Ext2->getOperandUse(0));
15937 }
15938
15939 Ops.push_back(&I->getOperandUse(0));
15940 Ops.push_back(&I->getOperandUse(1));
15941
15942 return true;
15943 }
15944 case Instruction::Or: {
15945 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
15946 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
15947 if (Subtarget->hasNEON()) {
15948 Instruction *OtherAnd, *IA, *IB;
15949 Value *MaskValue;
15950 // MainAnd refers to And instruction that has 'Not' as one of its operands
15951 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
15952 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
15953 m_Instruction(IA)))))) {
15954 if (match(OtherAnd,
15955 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
15956 Instruction *MainAnd = I->getOperand(0) == OtherAnd
15957 ? cast<Instruction>(I->getOperand(1))
15958 : cast<Instruction>(I->getOperand(0));
15959
15960 // Both Ands should be in same basic block as Or
15961 if (I->getParent() != MainAnd->getParent() ||
15962 I->getParent() != OtherAnd->getParent())
15963 return false;
15964
15965 // Non-mask operands of both Ands should also be in same basic block
15966 if (I->getParent() != IA->getParent() ||
15967 I->getParent() != IB->getParent())
15968 return false;
15969
15970 Ops.push_back(&MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
15971 Ops.push_back(&I->getOperandUse(0));
15972 Ops.push_back(&I->getOperandUse(1));
15973
15974 return true;
15975 }
15976 }
15977 }
15978
15979 return false;
15980 }
15981 case Instruction::Mul: {
15982 int NumZExts = 0, NumSExts = 0;
15983 for (auto &Op : I->operands()) {
15984 // Make sure we are not already sinking this operand
15985 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
15986 continue;
15987
15988 if (match(&Op, m_SExt(m_Value()))) {
15989 NumSExts++;
15990 continue;
15991 } else if (match(&Op, m_ZExt(m_Value()))) {
15992 NumZExts++;
15993 continue;
15994 }
15995
15996 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
15997
15998 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
15999 // operand and the s/zext can help create indexed s/umull. This is
16000 // especially useful to prevent i64 mul being scalarized.
16001 if (Shuffle && isSplatShuffle(Shuffle) &&
16002 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
16003 Ops.push_back(&Shuffle->getOperandUse(0));
16004 Ops.push_back(&Op);
16005 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
16006 NumSExts++;
16007 else
16008 NumZExts++;
16009 continue;
16010 }
16011
16012 if (!Shuffle)
16013 continue;
16014
16015 Value *ShuffleOperand = Shuffle->getOperand(0);
16016 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
16017 if (!Insert)
16018 continue;
16019
16020 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
16021 if (!OperandInstr)
16022 continue;
16023
16024 ConstantInt *ElementConstant =
16025 dyn_cast<ConstantInt>(Insert->getOperand(2));
16026 // Check that the insertelement is inserting into element 0
16027 if (!ElementConstant || !ElementConstant->isZero())
16028 continue;
16029
16030 unsigned Opcode = OperandInstr->getOpcode();
16031 if (Opcode == Instruction::SExt)
16032 NumSExts++;
16033 else if (Opcode == Instruction::ZExt)
16034 NumZExts++;
16035 else {
16036 // If we find that the top bits are known 0, then we can sink and allow
16037 // the backend to generate a umull.
16038 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
16039 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
16040 const DataLayout &DL = I->getDataLayout();
16041 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
16042 continue;
16043 NumZExts++;
16044 }
16045
16046 Ops.push_back(&Shuffle->getOperandUse(0));
16047 Ops.push_back(&Op);
16048 }
16049
16050 // It is profitable to sink if we found two of the same type of extends.
16051 return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
16052 }
16053 default:
16054 return false;
16055 }
16056 return false;
16057}
16058
16059static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
16060 unsigned NumElts, bool IsLittleEndian,
16061 SmallVectorImpl<int> &Mask) {
16062 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
16063 return false;
16064
16065 assert(DstWidth % SrcWidth == 0 &&
16066 "TBL lowering is not supported for a conversion instruction with this "
16067 "source and destination element type.");
16068
16069 unsigned Factor = DstWidth / SrcWidth;
16070 unsigned MaskLen = NumElts * Factor;
16071
16072 Mask.clear();
16073 Mask.resize(MaskLen, NumElts);
16074
16075 unsigned SrcIndex = 0;
16076 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
16077 Mask[I] = SrcIndex++;
16078
16079 return true;
16080}
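// Worked example (illustrative): zero-extending <4 x i8> to <4 x i32> gives
// SrcWidth = 8, DstWidth = 32, Factor = 4, MaskLen = 16. The mask starts out as
// all NumElts (= 4, which selects the zero element the caller inserts into the
// second shuffle operand), and the little-endian loop fills every 4th slot with
// the next source index:
//   { 0, 4, 4, 4, 1, 4, 4, 4, 2, 4, 4, 4, 3, 4, 4, 4 }
// so after the bitcast each i32 lane holds one source byte plus zero bytes.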
16081
16082static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
16083 FixedVectorType *ZExtTy,
16084 FixedVectorType *DstTy,
16085 bool IsLittleEndian) {
16086 auto *SrcTy = cast<FixedVectorType>(Op->getType());
16087 unsigned NumElts = SrcTy->getNumElements();
16088 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16089 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16090
16091 SmallVector<int> Mask;
16092 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
16093 return nullptr;
16094
16095 auto *FirstEltZero = Builder.CreateInsertElement(
16096 PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
16097 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
16098 Result = Builder.CreateBitCast(Result, DstTy);
16099 if (DstTy != ZExtTy)
16100 Result = Builder.CreateZExt(Result, ZExtTy);
16101 return Result;
16102}
16103
16104static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
16105 IRBuilder<> Builder(TI);
16106 SmallVector<Value *> Parts;
16107 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
16108 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
16109 auto *DstTy = cast<FixedVectorType>(TI->getType());
16110 assert(SrcTy->getElementType()->isIntegerTy() &&
16111 "Non-integer type source vector element is not supported");
16112 assert(DstTy->getElementType()->isIntegerTy(8) &&
16113 "Unsupported destination vector element type");
16114 unsigned SrcElemTySz =
16115 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16116 unsigned DstElemTySz =
16117 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16118 assert((SrcElemTySz % DstElemTySz == 0) &&
16119 "Cannot lower truncate to tbl instructions for a source element size "
16120 "that is not divisible by the destination element size");
16121 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
16122 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
16123 "Unsupported source vector element type size");
16124 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
16125
16126 // Create a mask to choose every nth byte from the source vector table of
16127 // bytes to create the truncated destination vector, where 'n' is the truncate
16128 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
16129 // 0,8,16,..Y*8th bytes for the little-endian format
16130 SmallVector<Constant *, 16> MaskConst;
16131 for (int Itr = 0; Itr < 16; Itr++) {
16132 if (Itr < NumElements)
16133 MaskConst.push_back(Builder.getInt8(
16134 IsLittleEndian ? Itr * TruncFactor
16135 : Itr * TruncFactor + (TruncFactor - 1)));
16136 else
16137 MaskConst.push_back(Builder.getInt8(255));
16138 }
16139
16140 int MaxTblSz = 128 * 4;
16141 int MaxSrcSz = SrcElemTySz * NumElements;
16142 int ElemsPerTbl =
16143 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
16144 assert(ElemsPerTbl <= 16 &&
16145 "Maximum elements selected using TBL instruction cannot exceed 16!");
16146
16147 int ShuffleCount = 128 / SrcElemTySz;
16148 SmallVector<int> ShuffleLanes;
16149 for (int i = 0; i < ShuffleCount; ++i)
16150 ShuffleLanes.push_back(i);
16151
16152 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
16153 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
16154 // call TBL & save the result in a vector of TBL results for combining later.
16155 SmallVector<Value *> Results;
16156 while (ShuffleLanes.back() < NumElements) {
16157 Parts.push_back(Builder.CreateBitCast(
16158 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
16159
16160 if (Parts.size() == 4) {
16161 auto *F = Intrinsic::getDeclaration(TI->getModule(),
16162 Intrinsic::aarch64_neon_tbl4, VecTy);
16163 Parts.push_back(ConstantVector::get(MaskConst));
16164 Results.push_back(Builder.CreateCall(F, Parts));
16165 Parts.clear();
16166 }
16167
16168 for (int i = 0; i < ShuffleCount; ++i)
16169 ShuffleLanes[i] += ShuffleCount;
16170 }
16171
16172 assert((Parts.empty() || Results.empty()) &&
16173 "Lowering trunc for vectors requiring different TBL instructions is "
16174 "not supported!");
16175 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
16176 // registers
16177 if (!Parts.empty()) {
16178 Intrinsic::ID TblID;
16179 switch (Parts.size()) {
16180 case 1:
16181 TblID = Intrinsic::aarch64_neon_tbl1;
16182 break;
16183 case 2:
16184 TblID = Intrinsic::aarch64_neon_tbl2;
16185 break;
16186 case 3:
16187 TblID = Intrinsic::aarch64_neon_tbl3;
16188 break;
16189 }
16190
16191 auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy);
16192 Parts.push_back(ConstantVector::get(MaskConst));
16193 Results.push_back(Builder.CreateCall(F, Parts));
16194 }
16195
16196 // Extract the destination vector from TBL result(s) after combining them
16197 // where applicable. Currently, at most two TBLs are supported.
16198 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
16199 "more than 2 tbl instructions!");
16200 Value *FinalResult = Results[0];
16201 if (Results.size() == 1) {
16202 if (ElemsPerTbl < 16) {
16203 SmallVector<int> FinalMask(ElemsPerTbl);
16204 std::iota(FinalMask.begin(), FinalMask.end(), 0);
16205 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
16206 }
16207 } else {
16208 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
16209 if (ElemsPerTbl < 16) {
16210 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
16211 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
16212 } else {
16213 std::iota(FinalMask.begin(), FinalMask.end(), 0);
16214 }
16215 FinalResult =
16216 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
16217 }
16218
16219 TI->replaceAllUsesWith(FinalResult);
16220 TI->eraseFromParent();
16221}
16222
16223bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
16224 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
16225 // shuffle_vector instructions are serialized when targeting SVE,
16226 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
16227 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
16228 return false;
16229
16230 // Try to optimize conversions using tbl. This requires materializing constant
16231 // index vectors, which can increase code size and add loads. Skip the
16232 // transform unless the conversion is in a loop block guaranteed to execute
16233 // and we are not optimizing for size.
16234 Function *F = I->getParent()->getParent();
16235 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
16236 F->hasOptSize())
16237 return false;
16238
16239 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
16240 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
16241 if (!SrcTy || !DstTy)
16242 return false;
16243
16244 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
16245 // lowered to tbl instructions to insert the original i8 elements
16246 // into i8x lanes. This is enabled for cases where it is beneficial.
16247 auto *ZExt = dyn_cast<ZExtInst>(I);
16248 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
16249 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
16250 if (DstWidth % 8 != 0)
16251 return false;
16252
16253 auto *TruncDstType =
16254 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
16255 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
16256 // the remaining ZExt folded into the user, don't use tbl lowering.
16257 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
16258 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
16259 TargetTransformInfo::getCastContextHint(I),
16260 TargetTransformInfo::TCK_SizeAndLatency, I) == TargetTransformInfo::TCC_Basic) {
16261 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
16262 return false;
16263
16264 DstTy = TruncDstType;
16265 }
16266 IRBuilder<> Builder(ZExt);
16267 Value *Result = createTblShuffleForZExt(
16268 Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
16269 DstTy, Subtarget->isLittleEndian());
16270 if (!Result)
16271 return false;
16272 ZExt->replaceAllUsesWith(Result);
16273 ZExt->eraseFromParent();
16274 return true;
16275 }
16276
16277 auto *UIToFP = dyn_cast<UIToFPInst>(I);
16278 if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
16279 DstTy->getElementType()->isFloatTy()) {
16280 IRBuilder<> Builder(I);
16281 Value *ZExt = createTblShuffleForZExt(
16282 Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
16283 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
16284 if (!ZExt)
16285 return false;
16286 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
16287 I->replaceAllUsesWith(UI);
16288 I->eraseFromParent();
16289 return true;
16290 }
16291
16292 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
16293 // followed by a truncate lowered to using tbl.4.
16294 auto *FPToUI = dyn_cast<FPToUIInst>(I);
16295 if (FPToUI &&
16296 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
16297 SrcTy->getElementType()->isFloatTy() &&
16298 DstTy->getElementType()->isIntegerTy(8)) {
16299 IRBuilder<> Builder(I);
16300 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
16301 VectorType::getInteger(SrcTy));
16302 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
16303 I->replaceAllUsesWith(TruncI);
16304 I->eraseFromParent();
16305 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
16306 return true;
16307 }
16308
16309 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
16310 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
16311 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
16312 // registers
16313 auto *TI = dyn_cast<TruncInst>(I);
16314 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
16315 ((SrcTy->getElementType()->isIntegerTy(32) ||
16316 SrcTy->getElementType()->isIntegerTy(64)) &&
16317 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
16318 createTblForTrunc(TI, Subtarget->isLittleEndian());
16319 return true;
16320 }
16321
16322 return false;
16323}
16324
16325bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
16326 Align &RequiredAligment) const {
16327 if (!LoadedType.isSimple() ||
16328 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
16329 return false;
16330 // Cyclone supports unaligned accesses.
16331 RequiredAligment = Align(1);
16332 unsigned NumBits = LoadedType.getSizeInBits();
16333 return NumBits == 32 || NumBits == 64;
16334}
16335
16336/// A helper function for determining the number of interleaved accesses we
16337/// will generate when lowering accesses of the given type.
16338unsigned AArch64TargetLowering::getNumInterleavedAccesses(
16339 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
16340 unsigned VecSize = 128;
16341 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
16342 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
16343 if (UseScalable && isa<FixedVectorType>(VecTy))
16344 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
16345 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
16346}
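// Example (illustrative): a fixed <16 x i32> group is 512 bits, so with 128-bit
// NEON vectors the formula above yields (16 * 32 + 127) / 128 = 4 interleaved
// accesses; with SVE and a larger guaranteed vector size, VecSize grows and
// fewer accesses are needed.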
16347
16348MachineMemOperand::Flags
16349AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
16350 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
16351 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
16352 return MOStridedAccess;
16353 return MachineMemOperand::MONone;
16354}
16355
16356bool AArch64TargetLowering::isLegalInterleavedAccessType(
16357 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
16358 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
16359 auto EC = VecTy->getElementCount();
16360 unsigned MinElts = EC.getKnownMinValue();
16361
16362 UseScalable = false;
16363
16364 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
16365 (!Subtarget->useSVEForFixedLengthVectors() ||
16366 !getSVEPredPatternFromNumElements(MinElts)))
16367 return false;
16368
16369 if (isa<ScalableVectorType>(VecTy) &&
16370 !Subtarget->isSVEorStreamingSVEAvailable())
16371 return false;
16372
16373 // Ensure the number of vector elements is greater than 1.
16374 if (MinElts < 2)
16375 return false;
16376
16377 // Ensure the element type is legal.
16378 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
16379 return false;
16380
16381 if (EC.isScalable()) {
16382 UseScalable = true;
16383 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
16384 }
16385
16386 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
16387 if (Subtarget->useSVEForFixedLengthVectors()) {
16388 unsigned MinSVEVectorSize =
16389 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
16390 if (VecSize % MinSVEVectorSize == 0 ||
16391 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
16392 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
16393 UseScalable = true;
16394 return true;
16395 }
16396 }
16397
16398 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
16399 // 128 will be split into multiple interleaved accesses.
16400 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
16401}
16402
16403static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
16404 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
16405 return ScalableVectorType::get(VTy->getElementType(), 2);
16406
16407 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
16408 return ScalableVectorType::get(VTy->getElementType(), 4);
16409
16410 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
16411 return ScalableVectorType::get(VTy->getElementType(), 8);
16412
16413 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
16414 return ScalableVectorType::get(VTy->getElementType(), 8);
16415
16416 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
16417 return ScalableVectorType::get(VTy->getElementType(), 2);
16418
16419 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
16420 return ScalableVectorType::get(VTy->getElementType(), 4);
16421
16422 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
16423 return ScalableVectorType::get(VTy->getElementType(), 8);
16424
16425 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
16426 return ScalableVectorType::get(VTy->getElementType(), 16);
16427
16428 llvm_unreachable("Cannot handle input vector type");
16429}
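// Illustrative mapping (editorial note): the helper above picks the scalable
// container with the same element type and a 128-bit granule, e.g. a fixed
// <4 x float> is handled via <vscale x 4 x float>, and i8 vectors via
// <vscale x 16 x i8>.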
16430
16431static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
16432 bool Scalable, Type *LDVTy,
16433 Type *PtrTy) {
16434 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16435 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
16436 Intrinsic::aarch64_sve_ld3_sret,
16437 Intrinsic::aarch64_sve_ld4_sret};
16438 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
16439 Intrinsic::aarch64_neon_ld3,
16440 Intrinsic::aarch64_neon_ld4};
16441 if (Scalable)
16442 return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy});
16443
16444 return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy});
16445}
16446
16447static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
16448 bool Scalable, Type *STVTy,
16449 Type *PtrTy) {
16450 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16451 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
16452 Intrinsic::aarch64_sve_st3,
16453 Intrinsic::aarch64_sve_st4};
16454 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
16455 Intrinsic::aarch64_neon_st3,
16456 Intrinsic::aarch64_neon_st4};
16457 if (Scalable)
16458 return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy});
16459
16460 return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy});
16461}
16462
16463/// Lower an interleaved load into a ldN intrinsic.
16464///
16465/// E.g. Lower an interleaved load (Factor = 2):
16466/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
16467/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
16468/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
16469///
16470/// Into:
16471/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
16472/// %vec0 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 0
16473/// %vec1 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 1
16474bool AArch64TargetLowering::lowerInterleavedLoad(
16475 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
16476 ArrayRef<unsigned> Indices, unsigned Factor) const {
16477 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16478 "Invalid interleave factor");
16479 assert(!Shuffles.empty() && "Empty shufflevector input");
16480 assert(Shuffles.size() == Indices.size() &&
16481 "Unmatched number of shufflevectors and indices");
16482
16483 const DataLayout &DL = LI->getDataLayout();
16484
16485 VectorType *VTy = Shuffles[0]->getType();
16486
16487 // Skip if we do not have NEON and skip illegal vector types. We can
16488 // "legalize" wide vector types into multiple interleaved accesses as long as
16489 // the vector types are divisible by 128.
16490 bool UseScalable;
16491 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16492 return false;
16493
16494 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16495
16496 auto *FVTy = cast<FixedVectorType>(VTy);
16497
16498 // A pointer vector can not be the return type of the ldN intrinsics. Need to
16499 // load integer vectors first and then convert to pointer vectors.
16500 Type *EltTy = FVTy->getElementType();
16501 if (EltTy->isPointerTy())
16502 FVTy =
16503 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
16504
16505 // If we're going to generate more than one load, reset the sub-vector type
16506 // to something legal.
16507 FVTy = FixedVectorType::get(FVTy->getElementType(),
16508 FVTy->getNumElements() / NumLoads);
16509
16510 auto *LDVTy =
16511 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
16512
16513 IRBuilder<> Builder(LI);
16514
16515 // The base address of the load.
16516 Value *BaseAddr = LI->getPointerOperand();
16517
16518 Type *PtrTy = LI->getPointerOperandType();
16519 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
16520 LDVTy->getElementCount());
16521
16522 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
16523 UseScalable, LDVTy, PtrTy);
16524
16525 // Holds sub-vectors extracted from the load intrinsic return values. The
16526 // sub-vectors are associated with the shufflevector instructions they will
16527 // replace.
16528 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
16529
16530 Value *PTrue = nullptr;
16531 if (UseScalable) {
16532 std::optional<unsigned> PgPattern =
16533 getSVEPredPatternFromNumElements(FVTy->getNumElements());
16534 if (Subtarget->getMinSVEVectorSizeInBits() ==
16535 Subtarget->getMaxSVEVectorSizeInBits() &&
16536 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
16537 PgPattern = AArch64SVEPredPattern::all;
16538
16539 auto *PTruePat =
16540 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
16541 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16542 {PTruePat});
16543 }
16544
16545 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
16546
16547 // If we're generating more than one load, compute the base address of
16548 // subsequent loads as an offset from the previous.
16549 if (LoadCount > 0)
16550 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
16551 FVTy->getNumElements() * Factor);
16552
16553 CallInst *LdN;
16554 if (UseScalable)
16555 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
16556 else
16557 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16558
16559 // Extract and store the sub-vectors returned by the load intrinsic.
16560 for (unsigned i = 0; i < Shuffles.size(); i++) {
16561 ShuffleVectorInst *SVI = Shuffles[i];
16562 unsigned Index = Indices[i];
16563
16564 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
16565
16566 if (UseScalable)
16567 SubVec = Builder.CreateExtractVector(
16568 FVTy, SubVec,
16569 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
16570
16571 // Convert the integer vector to pointer vector if the element is pointer.
16572 if (EltTy->isPointerTy())
16573 SubVec = Builder.CreateIntToPtr(
16574 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
16575 FVTy->getNumElements()));
16576
16577 SubVecs[SVI].push_back(SubVec);
16578 }
16579 }
16580
16581 // Replace uses of the shufflevector instructions with the sub-vectors
16582 // returned by the load intrinsic. If a shufflevector instruction is
16583 // associated with more than one sub-vector, those sub-vectors will be
16584 // concatenated into a single wide vector.
16585 for (ShuffleVectorInst *SVI : Shuffles) {
16586 auto &SubVec = SubVecs[SVI];
16587 auto *WideVec =
16588 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
16589 SVI->replaceAllUsesWith(WideVec);
16590 }
16591
16592 return true;
16593}
16594
16595template <typename Iter>
16596bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
16597 int MaxLookupDist = 20;
16598 unsigned IdxWidth = DL.getIndexSizeInBits(0);
16599 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
16600 const Value *PtrA1 =
16601 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
16602
16603 while (++It != End) {
16604 if (It->isDebugOrPseudoInst())
16605 continue;
16606 if (MaxLookupDist-- == 0)
16607 break;
16608 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
16609 const Value *PtrB1 =
16610 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
16611 DL, OffsetB);
16612 if (PtrA1 == PtrB1 &&
16613 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
16614 .abs() == 16)
16615 return true;
16616 }
16617 }
16618
16619 return false;
16620}
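// Editorial summary (hedged): the scan above looks at most roughly 20
// instructions ahead for another store whose pointer resolves to the same base
// at a +/-16 byte offset, i.e. a store that could pair with this one (for
// example as an STP); the interleaved-store lowering below uses this to decide
// whether forming an st2 is worthwhile.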
16621
16622/// Lower an interleaved store into a stN intrinsic.
16623///
16624/// E.g. Lower an interleaved store (Factor = 3):
16625/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
16626/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
16627/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16628///
16629/// Into:
16630/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
16631/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
16632/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
16633/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16634///
16635/// Note that the new shufflevectors will be removed and we'll only generate one
16636/// st3 instruction in CodeGen.
16637///
16638/// Example for a more general valid mask (Factor 3). Lower:
16639/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
16640/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
16641/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16642///
16643/// Into:
16644/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
16645/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
16646/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
16647/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16648bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
16649 ShuffleVectorInst *SVI,
16650 unsigned Factor) const {
16651
16652 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16653 "Invalid interleave factor");
16654
16655 auto *VecTy = cast<FixedVectorType>(SVI->getType());
16656 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
16657
16658 unsigned LaneLen = VecTy->getNumElements() / Factor;
16659 Type *EltTy = VecTy->getElementType();
16660 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
16661
16662 const DataLayout &DL = SI->getDataLayout();
16663 bool UseScalable;
16664
16665 // Skip if we do not have NEON and skip illegal vector types. We can
16666 // "legalize" wide vector types into multiple interleaved accesses as long as
16667 // the vector types are divisible by 128.
16668 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
16669 return false;
16670
16671 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
16672
16673 Value *Op0 = SVI->getOperand(0);
16674 Value *Op1 = SVI->getOperand(1);
16675 IRBuilder<> Builder(SI);
16676
16677 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
16678 // vectors to integer vectors.
16679 if (EltTy->isPointerTy()) {
16680 Type *IntTy = DL.getIntPtrType(EltTy);
16681 unsigned NumOpElts =
16682 cast<FixedVectorType>(Op0->getType())->getNumElements();
16683
16684 // Convert to the corresponding integer vector.
16685 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
16686 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
16687 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
16688
16689 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
16690 }
16691
16692 // If we're going to generate more than one store, reset the lane length
16693 // and sub-vector type to something legal.
16694 LaneLen /= NumStores;
16695 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
16696
16697 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
16698 : SubVecTy;
16699
16700 // The base address of the store.
16701 Value *BaseAddr = SI->getPointerOperand();
16702
16703 auto Mask = SVI->getShuffleMask();
16704
16705 // Bail out if none of the mask indices are in range.
16706 // If the mask is `poison`, `Mask` may be a vector of -1s.
16707 // If all of them are `poison`, an out-of-bounds read would happen later.
16708 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16709 return false;
16710 }
16711 // A 64bit st2 which does not start at element 0 will involve adding extra
16712 // ext elements, making the st2 unprofitable, and if there is a nearby store
16713 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
16714 // zip;stp pair, which has higher throughput.
16715 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
16716 (Mask[0] != 0 ||
16717 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
16718 DL) ||
16719 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
16720 BaseAddr, DL)))
16721 return false;
16722
16723 Type *PtrTy = SI->getPointerOperandType();
16724 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
16725 STVTy->getElementCount());
16726
16727 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16728 UseScalable, STVTy, PtrTy);
16729
16730 Value *PTrue = nullptr;
16731 if (UseScalable) {
16732 std::optional<unsigned> PgPattern =
16733 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
16734 if (Subtarget->getMinSVEVectorSizeInBits() ==
16735 Subtarget->getMaxSVEVectorSizeInBits() &&
16736 Subtarget->getMinSVEVectorSizeInBits() ==
16737 DL.getTypeSizeInBits(SubVecTy))
16738 PgPattern = AArch64SVEPredPattern::all;
16739
16740 auto *PTruePat =
16741 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
16742 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16743 {PTruePat});
16744 }
16745
16746 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
16747
16748 SmallVector<Value *, 6> Ops;
16749
16750 // Split the shufflevector operands into sub vectors for the new stN call.
16751 for (unsigned i = 0; i < Factor; i++) {
16752 Value *Shuffle;
16753 unsigned IdxI = StoreCount * LaneLen * Factor + i;
16754 if (Mask[IdxI] >= 0) {
16755 Shuffle = Builder.CreateShuffleVector(
16756 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
16757 } else {
16758 unsigned StartMask = 0;
16759 for (unsigned j = 1; j < LaneLen; j++) {
16760 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
16761 if (Mask[IdxJ] >= 0) {
16762 StartMask = Mask[IdxJ] - j;
16763 break;
16764 }
16765 }
16766 // Note: Filling undef gaps with random elements is ok, since
16767 // those elements were being written anyway (with undefs).
16768 // In the case of all undefs we're defaulting to using elems from 0
16769 // Note: StartMask cannot be negative, it's checked in
16770 // isReInterleaveMask
16771 Shuffle = Builder.CreateShuffleVector(
16772 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
16773 }
16774
16775 if (UseScalable)
16776 Shuffle = Builder.CreateInsertVector(
16777 STVTy, UndefValue::get(STVTy), Shuffle,
16778 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
16779
16780 Ops.push_back(Shuffle);
16781 }
16782
16783 if (UseScalable)
16784 Ops.push_back(PTrue);
16785
16786 // If we're generating more than one store, compute the base address of
16787 // subsequent stores as an offset from the previous one.
16788 if (StoreCount > 0)
16789 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
16790 BaseAddr, LaneLen * Factor);
16791
16792 Ops.push_back(BaseAddr);
16793 Builder.CreateCall(StNFunc, Ops);
16794 }
16795 return true;
16796}
16797
16798bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
16799 IntrinsicInst *DI, LoadInst *LI) const {
16800 // Only deinterleave2 supported at present.
16801 if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
16802 return false;
16803
16804 // Only a factor of 2 supported at present.
16805 const unsigned Factor = 2;
16806
16807 VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
16808 const DataLayout &DL = DI->getDataLayout();
16809 bool UseScalable;
16810 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16811 return false;
16812
16813 // TODO: Add support for using SVE instructions with fixed types later, using
16814 // the code from lowerInterleavedLoad to obtain the correct container type.
16815 if (UseScalable && !VTy->isScalableTy())
16816 return false;
16817
16818 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16819
16820 VectorType *LdTy =
16821 VectorType::get(VTy->getElementType(),
16822 VTy->getElementCount().divideCoefficientBy(NumLoads));
16823
16824 Type *PtrTy = LI->getPointerOperandType();
16825 Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor,
16826 UseScalable, LdTy, PtrTy);
16827
16828 IRBuilder<> Builder(LI);
16829
16830 Value *Pred = nullptr;
16831 if (UseScalable)
16832 Pred =
16833 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
16834
16835 Value *BaseAddr = LI->getPointerOperand();
16836 Value *Result;
16837 if (NumLoads > 1) {
16838 Value *Left = PoisonValue::get(VTy);
16839 Value *Right = PoisonValue::get(VTy);
16840
16841 for (unsigned I = 0; I < NumLoads; ++I) {
16842 Value *Offset = Builder.getInt64(I * Factor);
16843
16844 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
16845 Value *LdN = nullptr;
16846 if (UseScalable)
16847 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
16848 else
16849 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
16850
16851 Value *Idx =
16852 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
16853 Left = Builder.CreateInsertVector(
16854 VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
16855 Right = Builder.CreateInsertVector(
16856 VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
16857 }
16858
16859 Result = PoisonValue::get(DI->getType());
16860 Result = Builder.CreateInsertValue(Result, Left, 0);
16861 Result = Builder.CreateInsertValue(Result, Right, 1);
16862 } else {
16863 if (UseScalable)
16864 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
16865 else
16866 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16867 }
16868
16869 DI->replaceAllUsesWith(Result);
16870 return true;
16871}
16872
16873bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
16874 IntrinsicInst *II, StoreInst *SI) const {
16875 // Only interleave2 supported at present.
16876 if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
16877 return false;
16878
16879 // Only a factor of 2 supported at present.
16880 const unsigned Factor = 2;
16881
16882 VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
16883 const DataLayout &DL = II->getDataLayout();
16884 bool UseScalable;
16885 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16886 return false;
16887
16888 // TODO: Add support for using SVE instructions with fixed types later, using
16889 // the code from lowerInterleavedStore to obtain the correct container type.
16890 if (UseScalable && !VTy->isScalableTy())
16891 return false;
16892
16893 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
16894
16895 VectorType *StTy =
16896 VectorType::get(VTy->getElementType(),
16897 VTy->getElementCount().divideCoefficientBy(NumStores));
16898
16899 Type *PtrTy = SI->getPointerOperandType();
16900 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16901 UseScalable, StTy, PtrTy);
16902
16903 IRBuilder<> Builder(SI);
16904
16905 Value *BaseAddr = SI->getPointerOperand();
16906 Value *Pred = nullptr;
16907
16908 if (UseScalable)
16909 Pred =
16910 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
16911
16912 Value *L = II->getOperand(0);
16913 Value *R = II->getOperand(1);
16914
16915 for (unsigned I = 0; I < NumStores; ++I) {
16916 Value *Address = BaseAddr;
16917 if (NumStores > 1) {
16918 Value *Offset = Builder.getInt64(I * Factor);
16919 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
16920
16921 Value *Idx =
16922 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
16923 L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
16924 R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
16925 }
16926
16927 if (UseScalable)
16928 Builder.CreateCall(StNFunc, {L, R, Pred, Address});
16929 else
16930 Builder.CreateCall(StNFunc, {L, R, Address});
16931 }
16932
16933 return true;
16934}
16935
16936EVT AArch64TargetLowering::getOptimalMemOpType(
16937 const MemOp &Op, const AttributeList &FuncAttributes) const {
16938 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16939 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16940 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16941 // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that it
16942 // would take one instruction to materialize the v2i64 zero and one store (with
16943 // a restrictive addressing mode), so just do i64 stores.
16944 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16945 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16946 if (Op.isAligned(AlignCheck))
16947 return true;
16948 unsigned Fast;
16949 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16950 MachineMemOperand::MONone, &Fast) &&
16951 Fast;
16952 };
16953
16954 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16955 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
16956 return MVT::v16i8;
16957 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16958 return MVT::f128;
16959 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16960 return MVT::i64;
16961 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16962 return MVT::i32;
16963 return MVT::Other;
16964}
16965
16966LLT AArch64TargetLowering::getOptimalMemOpLLT(
16967 const MemOp &Op, const AttributeList &FuncAttributes) const {
16968 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16969 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16970 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16971 // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that it
16972 // would take one instruction to materialize the v2i64 zero and one store (with
16973 // a restrictive addressing mode), so just do i64 stores.
16974 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16975 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16976 if (Op.isAligned(AlignCheck))
16977 return true;
16978 unsigned Fast;
16979 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16980 MachineMemOperand::MONone, &Fast) &&
16981 Fast;
16982 };
16983
16984 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16985 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
16986 return LLT::fixed_vector(2, 64);
16987 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16988 return LLT::scalar(128);
16989 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16990 return LLT::scalar(64);
16991 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16992 return LLT::scalar(32);
16993 return LLT();
16994}
16995
16996// 12-bit optionally shifted immediates are legal for adds.
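// For example, 0xfff and 0xfff000 are legal add immediates (imm12, optionally
// shifted left by 12), while 0x1001 is not, since it has bits set in both the
// unshifted and the shifted ranges.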
16997bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
16998 if (Immed == std::numeric_limits<int64_t>::min()) {
16999 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
17000 << ": avoid UB for INT64_MIN\n");
17001 return false;
17002 }
17003 // Same encoding for add/sub, just flip the sign.
17004 Immed = std::abs(Immed);
17005 bool IsLegal = ((Immed >> 12) == 0 ||
17006 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
17007 LLVM_DEBUG(dbgs() << "Is " << Immed
17008 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
17009 return IsLegal;
17010}
17011
17012bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
17013 // We will only emit addvl/inc* instructions for SVE2
17014 if (!Subtarget->hasSVE2())
17015 return false;
17016
17017 // addvl's immediates are in terms of the number of bytes in a register.
17018 // Since there are 16 in the base supported size (128bits), we need to
17019 // divide the immediate by that much to give us a useful immediate to
17020 // multiply by vscale. We can't have a remainder as a result of this.
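 // For example (illustrative), Imm == 32 can be covered by addvl #2, and
 // Imm == 8 can be covered by inch (which adds 8 bytes per 128-bit granule).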
17021 if (Imm % 16 == 0)
17022 return isInt<6>(Imm / 16);
17023
17024 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
17025 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
17026 // of addvl as a result, so only take h|w|d into account.
17027 // Dec[h|w|d] will cover subtractions.
17028 // Immediates are in the range [1,16], so we can't do a 2's complement check.
17029 // FIXME: Can we make use of other patterns to cover other immediates?
17030
17031 // inch|dech
17032 if (Imm % 8 == 0)
17033 return std::abs(Imm / 8) <= 16;
17034 // incw|decw
17035 if (Imm % 4 == 0)
17036 return std::abs(Imm / 4) <= 16;
17037 // incd|decd
17038 if (Imm % 2 == 0)
17039 return std::abs(Imm / 2) <= 16;
17040
17041 return false;
17042}
17043
17044// Return false to prevent folding
17045// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
17046// if the folding leads to worse code.
17047bool AArch64TargetLowering::isMulAddWithConstProfitable(
17048 SDValue AddNode, SDValue ConstNode) const {
17049 // Let the DAGCombiner decide for vector types and large types.
17050 const EVT VT = AddNode.getValueType();
17051 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
17052 return true;
17053
17054 // It is worse if c1 is a legal add immediate while c1*c2 is not, and c1*c2
17055 // has to be composed using at least two instructions.
17056 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
17057 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
17058 const int64_t C1 = C1Node->getSExtValue();
17059 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
17060 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
17061 return true;
17062 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
17063 // Adapt to the width of a register.
17064 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
17065 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
17066 if (Insn.size() > 1)
17067 return false;
17068
17069 // Default to true and let the DAGCombiner decide.
17070 return true;
17071}
17072
17073// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
17074// immediates is the same as for an add or a sub.
17075bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
17076 return isLegalAddImmediate(Immed);
17077}
17078
17079/// isLegalAddressingMode - Return true if the addressing mode represented
17080/// by AM is legal for this target, for a load/store of the specified type.
17081bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
17082 const AddrMode &AMode, Type *Ty,
17083 unsigned AS, Instruction *I) const {
17084 // AArch64 has five basic addressing modes:
17085 // reg
17086 // reg + 9-bit signed offset
17087 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
17088 // reg1 + reg2
17089 // reg + SIZE_IN_BYTES * reg
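 // For example, for a 64-bit load these correspond to forms like:
 // ldr x0, [x1] ; reg
 // ldur x0, [x1, #-255] ; reg + 9-bit signed offset
 // ldr x0, [x1, #32760] ; reg + 8 * 12-bit unsigned offset
 // ldr x0, [x1, x2] ; reg1 + reg2
 // ldr x0, [x1, x2, lsl #3] ; reg + 8 * reg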
17090
17091 // No global is ever allowed as a base.
17092 if (AMode.BaseGV)
17093 return false;
17094
17095 // No reg+reg+imm addressing.
17096 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
17097 return false;
17098
17099 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
17100 // `2*ScaledReg` into `BaseReg + ScaledReg`
17101 AddrMode AM = AMode;
17102 if (AM.Scale && !AM.HasBaseReg) {
17103 if (AM.Scale == 1) {
17104 AM.HasBaseReg = true;
17105 AM.Scale = 0;
17106 } else if (AM.Scale == 2) {
17107 AM.HasBaseReg = true;
17108 AM.Scale = 1;
17109 } else {
17110 return false;
17111 }
17112 }
17113
17114 // A base register is required in all addressing modes.
17115 if (!AM.HasBaseReg)
17116 return false;
17117
17118 if (Ty->isScalableTy()) {
17119 if (isa<ScalableVectorType>(Ty)) {
17120 // See if we have a foldable vscale-based offset, for vector types which
17121 // are either legal or smaller than the minimum; more work will be
17122 // required if we need to consider addressing for types which need
17123 // legalization by splitting.
17124 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
17125 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
17126 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
17127 isPowerOf2_64(VecNumBytes))
17128 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
17129
17130 uint64_t VecElemNumBytes =
17131 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
17132 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
17133 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
17134 }
17135
17136 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
17137 }
17138
17139 // No scalable offsets allowed for non-scalable types.
17140 if (AM.ScalableOffset)
17141 return false;
17142
17143 // check reg + imm case:
17144 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
17145 uint64_t NumBytes = 0;
17146 if (Ty->isSized()) {
17147 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
17148 NumBytes = NumBits / 8;
17149 if (!isPowerOf2_64(NumBits))
17150 NumBytes = 0;
17151 }
17152
17153 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
17154 AM.Scale);
17155}
17156
17157// Check whether the two offsets belong to the same imm24 range and share the
17158// same high 12 bits; if so, the high part can be covered by the offset of an add.
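// For example, MinOffset = 0x2008 and MaxOffset = 0x2ff8 share the high part
// 0x2000, which is itself a legal add immediate, so 0x2000 is returned and the
// remaining low 12 bits can be folded into each access.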
17159int64_t
17160AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
17161 int64_t MaxOffset) const {
17162 int64_t HighPart = MinOffset & ~0xfffULL;
17163 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
17164 // Rebase the value to an integer multiple of imm12.
17165 return HighPart;
17166 }
17167
17168 return 0;
17169}
17170
17171bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
17172 // Consider splitting large offset of struct or array.
17173 return true;
17174}
17175
17176bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
17177 const MachineFunction &MF, EVT VT) const {
17178 VT = VT.getScalarType();
17179
17180 if (!VT.isSimple())
17181 return false;
17182
17183 switch (VT.getSimpleVT().SimpleTy) {
17184 case MVT::f16:
17185 return Subtarget->hasFullFP16();
17186 case MVT::f32:
17187 case MVT::f64:
17188 return true;
17189 default:
17190 break;
17191 }
17192
17193 return false;
17194}
17195
17196bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
17197 Type *Ty) const {
17198 switch (Ty->getScalarType()->getTypeID()) {
17199 case Type::FloatTyID:
17200 case Type::DoubleTyID:
17201 return true;
17202 default:
17203 return false;
17204 }
17205}
17206
17207bool AArch64TargetLowering::generateFMAsInMachineCombiner(
17208 EVT VT, CodeGenOptLevel OptLevel) const {
17209 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
17210 !useSVEForFixedLengthVectorVT(VT);
17211}
17212
17213const MCPhysReg *
17214AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
17215 // LR is a callee-save register, but we must treat it as clobbered by any call
17216 // site. Hence we include LR in the scratch registers, which are in turn added
17217 // as implicit-defs for stackmaps and patchpoints.
17218 static const MCPhysReg ScratchRegs[] = {
17219 AArch64::X16, AArch64::X17, AArch64::LR, 0
17220 };
17221 return ScratchRegs;
17222}
17223
17224ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
17225 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
17226 return RCRegs;
17227}
17228
17229bool
17230AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
17231 CombineLevel Level) const {
17232 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
17233 N->getOpcode() == ISD::SRL) &&
17234 "Expected shift op");
17235
17236 SDValue ShiftLHS = N->getOperand(0);
17237 EVT VT = N->getValueType(0);
17238
17239 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
17240 // combine it with shift 'N' to let it be lowered to UBFX except:
17241 // ((x >> C) & mask) << C.
17242 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
17243 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
17244 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
17245 if (isMask_64(TruncMask)) {
17246 SDValue AndLHS = ShiftLHS.getOperand(0);
17247 if (AndLHS.getOpcode() == ISD::SRL) {
17248 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
17249 if (N->getOpcode() == ISD::SHL)
17250 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
17251 return SRLC->getZExtValue() == SHLC->getZExtValue();
17252 return false;
17253 }
17254 }
17255 }
17256 }
17257 return true;
17258}
17259
17260bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
17261 const SDNode *N) const {
17262 assert(N->getOpcode() == ISD::XOR &&
17263 (N->getOperand(0).getOpcode() == ISD::SHL ||
17264 N->getOperand(0).getOpcode() == ISD::SRL) &&
17265 "Expected XOR(SHIFT) pattern");
17266
17267 // Only commute if the entire NOT mask is a hidden shifted mask.
17268 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
17269 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
17270 if (XorC && ShiftC) {
17271 unsigned MaskIdx, MaskLen;
17272 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
17273 unsigned ShiftAmt = ShiftC->getZExtValue();
17274 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17275 if (N->getOperand(0).getOpcode() == ISD::SHL)
17276 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
17277 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
17278 }
17279 }
17280
17281 return false;
17282}
17283
17284bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
17285 const SDNode *N, CombineLevel Level) const {
17286 assert(((N->getOpcode() == ISD::SHL &&
17287 N->getOperand(0).getOpcode() == ISD::SRL) ||
17288 (N->getOpcode() == ISD::SRL &&
17289 N->getOperand(0).getOpcode() == ISD::SHL)) &&
17290 "Expected shift-shift mask");
17291 // Don't allow multiuse shift folding with the same shift amount.
17292 if (!N->getOperand(0)->hasOneUse())
17293 return false;
17294
17295 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
17296 EVT VT = N->getValueType(0);
17297 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
17298 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
17299 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
17300 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
17301 }
17302
17303 return true;
17304}
17305
17306bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
17307 unsigned BinOpcode, EVT VT) const {
17308 return VT.isScalableVector() && isTypeLegal(VT);
17309}
17310
17311bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
17312 Type *Ty) const {
17313 assert(Ty->isIntegerTy());
17314
17315 unsigned BitSize = Ty->getPrimitiveSizeInBits();
17316 if (BitSize == 0)
17317 return false;
17318
17319 int64_t Val = Imm.getSExtValue();
17320 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
17321 return true;
17322
17323 if ((int64_t)Val < 0)
17324 Val = ~Val;
17325 if (BitSize == 32)
17326 Val &= (1LL << 32) - 1;
17327
17328 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
17329 // MOVZ is free so return true for one or fewer MOVK.
17330 return Shift < 3;
17331}
17332
17333bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
17334 unsigned Index) const {
17335 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
17336 return false;
17337
17338 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
17339}
17340
17341/// Turn vector tests of the signbit in the form of:
17342/// xor (sra X, elt_size(X)-1), -1
17343/// into:
17344/// cmge X, X, #0
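/// e.g. for v4i32, xor (sra X, 31), -1 becomes cmge X, X, #0, producing
/// all-ones in exactly the lanes where X is non-negative.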
17345static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
17346 const AArch64Subtarget *Subtarget) {
17347 EVT VT = N->getValueType(0);
17348 if (!Subtarget->hasNEON() || !VT.isVector())
17349 return SDValue();
17350
17351 // There must be a shift right algebraic before the xor, and the xor must be a
17352 // 'not' operation.
17353 SDValue Shift = N->getOperand(0);
17354 SDValue Ones = N->getOperand(1);
17355 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
17356 !ISD::isConstantSplatVectorAllOnes(Ones.getNode()))
17357 return SDValue();
17358
17359 // The shift should be smearing the sign bit across each vector element.
17360 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
17361 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
17362 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
17363 return SDValue();
17364
17365 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
17366}
17367
17368// Given a vecreduce_add node, detect the below pattern and convert it to the
17369// node sequence with UABDL, [S|U]ADB and UADDLP.
17370//
17371// i32 vecreduce_add(
17372// v16i32 abs(
17373// v16i32 sub(
17374// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
17375// =================>
17376// i32 vecreduce_add(
17377// v4i32 UADDLP(
17378// v8i16 add(
17379// v8i16 zext(
17380// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
17381// v8i16 zext(
17382// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
17383static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
17384 SelectionDAG &DAG) {
17385 // Assumed i32 vecreduce_add
17386 if (N->getValueType(0) != MVT::i32)
17387 return SDValue();
17388
17389 SDValue VecReduceOp0 = N->getOperand(0);
17390 unsigned Opcode = VecReduceOp0.getOpcode();
17391 // Assumed v16i32 abs
17392 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
17393 return SDValue();
17394
17395 SDValue ABS = VecReduceOp0;
17396 // Assumed v16i32 sub
17397 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
17398 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
17399 return SDValue();
17400
17401 SDValue SUB = ABS->getOperand(0);
17402 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
17403 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
17404 // Assumed v16i32 type
17405 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
17406 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
17407 return SDValue();
17408
17409 // Assumed zext or sext
17410 bool IsZExt = false;
17411 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
17412 IsZExt = true;
17413 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
17414 IsZExt = false;
17415 } else
17416 return SDValue();
17417
17418 SDValue EXT0 = SUB->getOperand(0);
17419 SDValue EXT1 = SUB->getOperand(1);
17420 // Assumed zext's operand has v16i8 type
17421 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
17422 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
17423 return SDValue();
17424
17425 // Pattern is detected. Let's convert it to a sequence of nodes.
17426 SDLoc DL(N);
17427
17428 // First, create the node pattern of UABD/SABD.
17429 SDValue UABDHigh8Op0 =
17430 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
17431 DAG.getConstant(8, DL, MVT::i64));
17432 SDValue UABDHigh8Op1 =
17433 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
17434 DAG.getConstant(8, DL, MVT::i64));
17435 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
17436 UABDHigh8Op0, UABDHigh8Op1);
17437 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
17438
17439 // Second, create the node pattern of UABAL.
17440 SDValue UABDLo8Op0 =
17441 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
17442 DAG.getConstant(0, DL, MVT::i64));
17443 SDValue UABDLo8Op1 =
17444 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
17445 DAG.getConstant(0, DL, MVT::i64));
17446 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
17447 UABDLo8Op0, UABDLo8Op1);
17448 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
17449 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
17450
17451 // Third, create the node of UADDLP.
17452 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
17453
17454 // Fourth, create the node of VECREDUCE_ADD.
17455 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
17456}
17457
17458// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
17459// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
17460// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
17461// If we have vectors larger than v16i8 we extract v16i8 vectors,
17462// Follow the same steps above to get DOT instructions concatenate them
17463// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
17464static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
17465 const AArch64Subtarget *ST) {
17466 if (!ST->hasDotProd())
17468
17469 SDValue Op0 = N->getOperand(0);
17470 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
17471 Op0.getValueType().getVectorElementType() != MVT::i32)
17472 return SDValue();
17473
17474 unsigned ExtOpcode = Op0.getOpcode();
17475 SDValue A = Op0;
17476 SDValue B;
17477 if (ExtOpcode == ISD::MUL) {
17478 A = Op0.getOperand(0);
17479 B = Op0.getOperand(1);
17480 if (A.getOpcode() != B.getOpcode() ||
17481 A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
17482 return SDValue();
17483 ExtOpcode = A.getOpcode();
17484 }
17485 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
17486 return SDValue();
17487
17488 EVT Op0VT = A.getOperand(0).getValueType();
17489 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
17490 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
17491 if (!IsValidElementCount || !IsValidSize)
17492 return SDValue();
17493
17494 SDLoc DL(Op0);
17495 // For non-mla reductions B can be set to 1. For MLA we take the operand of
17496 // the extend B.
17497 if (!B)
17498 B = DAG.getConstant(1, DL, Op0VT);
17499 else
17500 B = B.getOperand(0);
17501
17502 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
17503 unsigned NumOfVecReduce;
17504 EVT TargetType;
17505 if (IsMultipleOf16) {
17506 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
17507 TargetType = MVT::v4i32;
17508 } else {
17509 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
17510 TargetType = MVT::v2i32;
17511 }
17512 auto DotOpcode =
17513 (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
17514 // Handle the case where we need to generate only one Dot operation.
17515 if (NumOfVecReduce == 1) {
17516 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
17517 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
17518 A.getOperand(0), B);
17519 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17520 }
17521 // Generate Dot instructions that are multiple of 16.
17522 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
17523 SmallVector<SDValue, 4> SDotVec16;
17524 unsigned I = 0;
17525 for (; I < VecReduce16Num; I += 1) {
17526 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
17527 SDValue Op0 =
17528 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
17529 DAG.getConstant(I * 16, DL, MVT::i64));
17530 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
17531 DAG.getConstant(I * 16, DL, MVT::i64));
17532 SDValue Dot =
17533 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
17534 SDotVec16.push_back(Dot);
17535 }
17536 // Concatenate dot operations.
17537 EVT SDot16EVT =
17538 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
17539 SDValue ConcatSDot16 =
17540 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
17541 SDValue VecReduceAdd16 =
17542 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
17543 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
17544 if (VecReduce8Num == 0)
17545 return VecReduceAdd16;
17546
17547 // Generate the remainder Dot operation that is multiple of 8.
17548 SmallVector<SDValue, 4> SDotVec8;
17549 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
17550 SDValue Vec8Op0 =
17551 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
17552 DAG.getConstant(I * 16, DL, MVT::i64));
17553 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
17554 DAG.getConstant(I * 16, DL, MVT::i64));
17555 SDValue Dot =
17556 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
17557 SDValue VecReduceAdd8 =
17558 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17559 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
17560 VecReduceAdd8);
17561}
17562
17563// Given an (integer) vecreduce, we know the order of the inputs does not
17564// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
17565// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
17566// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
17567static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
17568 auto DetectAddExtract = [&](SDValue A) {
17569 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
17570 // UADDLP(x) if found.
17571 assert(A.getOpcode() == ISD::ADD);
17572 EVT VT = A.getValueType();
17573 SDValue Op0 = A.getOperand(0);
17574 SDValue Op1 = A.getOperand(1);
17575 if (Op0.getOpcode() != Op1.getOpcode() ||
17576 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
17577 Op0.getOpcode() != ISD::SIGN_EXTEND))
17578 return SDValue();
17579 SDValue Ext0 = Op0.getOperand(0);
17580 SDValue Ext1 = Op1.getOperand(0);
17581 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17582 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17583 Ext0.getOperand(0) != Ext1.getOperand(0))
17584 return SDValue();
17585 // Check that the type is twice the add types, and the extract are from
17586 // upper/lower parts of the same source.
17587 if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
17588 VT.getVectorNumElements() * 2)
17589 return SDValue();
17590 if ((Ext0.getConstantOperandVal(1) != 0 ||
17591 Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
17592 (Ext1.getConstantOperandVal(1) != 0 ||
17593 Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
17594 return SDValue();
17595 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
17596 : AArch64ISD::SADDLP;
17597 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
17598 };
17599
17600 if (SDValue R = DetectAddExtract(A))
17601 return R;
17602
17603 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
17604 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
17605 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17606 A.getOperand(1));
17607 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
17608 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
17609 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17610 A.getOperand(0));
17611 return SDValue();
17612}
17613
17614// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
17615// UADDLV(concat), where the concat represents the 64-bit zext sources.
17616static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
17617 // Look for add(zext(64-bit source), zext(64-bit source)), returning
17618 // UADDLV(concat(zext, zext)) if found.
17619 assert(A.getOpcode() == ISD::ADD);
17620 EVT VT = A.getValueType();
17621 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17622 return SDValue();
17623 SDValue Op0 = A.getOperand(0);
17624 SDValue Op1 = A.getOperand(1);
17625 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
17626 return SDValue();
17627 SDValue Ext0 = Op0.getOperand(0);
17628 SDValue Ext1 = Op1.getOperand(0);
17629 EVT ExtVT0 = Ext0.getValueType();
17630 EVT ExtVT1 = Ext1.getValueType();
17631 // Check zext VTs are the same and 64-bit length.
17632 if (ExtVT0 != ExtVT1 ||
17633 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
17634 return SDValue();
17635 // Get VT for concat of zext sources.
17636 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
17637 SDValue Concat =
17638 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
17639
17640 switch (VT.getSimpleVT().SimpleTy) {
17641 case MVT::v2i64:
17642 case MVT::v4i32:
17643 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
17644 case MVT::v8i16: {
17645 SDValue Uaddlv =
17646 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
17647 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
17648 }
17649 default:
17650 llvm_unreachable("Unhandled vector type");
17651 }
17652}
17653
17654static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
17655 SDValue A = N->getOperand(0);
17656 if (A.getOpcode() == ISD::ADD) {
17657 if (SDValue R = performUADDVAddCombine(A, DAG))
17658 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
17659 else if (SDValue R = performUADDVZextCombine(A, DAG))
17660 return R;
17661 }
17662 return SDValue();
17663}
17664
17665static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
17666 TargetLowering::DAGCombinerInfo &DCI,
17667 const AArch64Subtarget *Subtarget) {
17668 if (DCI.isBeforeLegalizeOps())
17669 return SDValue();
17670
17671 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
17672}
17673
17674SDValue
17675AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
17676 SelectionDAG &DAG,
17677 SmallVectorImpl<SDNode *> &Created) const {
17678 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17679 if (isIntDivCheap(N->getValueType(0), Attr))
17680 return SDValue(N,0); // Lower SDIV as SDIV
17681
17682 EVT VT = N->getValueType(0);
17683
17684 // For scalable and fixed types, mark them as cheap so we can handle it much
17685 // later. This allows us to handle larger than legal types.
17686 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17687 return SDValue(N, 0);
17688
17689 // fold (sdiv X, pow2)
17690 if ((VT != MVT::i32 && VT != MVT::i64) ||
17691 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17692 return SDValue();
17693
17694 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
17695}
17696
17697SDValue
17698AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
17699 SelectionDAG &DAG,
17700 SmallVectorImpl<SDNode *> &Created) const {
17701 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17702 if (isIntDivCheap(N->getValueType(0), Attr))
17703 return SDValue(N, 0); // Lower SREM as SREM
17704
17705 EVT VT = N->getValueType(0);
17706
17707 // For scalable and fixed types, mark them as cheap so we can handle it much
17708 // later. This allows us to handle larger than legal types.
17709 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17710 return SDValue(N, 0);
17711
17712 // fold (srem X, pow2)
17713 if ((VT != MVT::i32 && VT != MVT::i64) ||
17714 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17715 return SDValue();
17716
17717 unsigned Lg2 = Divisor.countr_zero();
17718 if (Lg2 == 0)
17719 return SDValue();
17720
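 // For Lg2 > 1 the expansion built below computes, roughly:
 // negs tmp, x
 // and pos, x, #(2^Lg2 - 1)
 // and neg, tmp, #(2^Lg2 - 1)
 // csneg res, pos, neg, mi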
17721 SDLoc DL(N);
17722 SDValue N0 = N->getOperand(0);
17723 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
17724 SDValue Zero = DAG.getConstant(0, DL, VT);
17725 SDValue CCVal, CSNeg;
17726 if (Lg2 == 1) {
17727 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
17728 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17729 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
17730
17731 Created.push_back(Cmp.getNode());
17732 Created.push_back(And.getNode());
17733 } else {
17734 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
17735 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17736
17737 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
17738 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17739 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
17740 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
17741 Negs.getValue(1));
17742
17743 Created.push_back(Negs.getNode());
17744 Created.push_back(AndPos.getNode());
17745 Created.push_back(AndNeg.getNode());
17746 }
17747
17748 return CSNeg;
17749}
17750
17751static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
17752 switch(getIntrinsicID(S.getNode())) {
17753 default:
17754 break;
17755 case Intrinsic::aarch64_sve_cntb:
17756 return 8;
17757 case Intrinsic::aarch64_sve_cnth:
17758 return 16;
17759 case Intrinsic::aarch64_sve_cntw:
17760 return 32;
17761 case Intrinsic::aarch64_sve_cntd:
17762 return 64;
17763 }
17764 return {};
17765}
17766
17767/// Calculates what the pre-extend type is, based on the extension
17768/// operation node provided by \p Extend.
17769///
17770/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
17771/// pre-extend type is pulled directly from the operand, while other extend
17772/// operations need a bit more inspection to get this information.
17773///
17774/// \param Extend The SDNode from the DAG that represents the extend operation
17775///
17776/// \returns The type representing the \p Extend source type, or \p MVT::Other
17777/// if no valid type can be determined
17778static EVT calculatePreExtendType(SDValue Extend) {
17779 switch (Extend.getOpcode()) {
17780 case ISD::SIGN_EXTEND:
17781 case ISD::ZERO_EXTEND:
17782 return Extend.getOperand(0).getValueType();
17783 case ISD::AssertSext:
17784 case ISD::AssertZext:
17785 case ISD::SIGN_EXTEND_INREG: {
17786 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
17787 if (!TypeNode)
17788 return MVT::Other;
17789 return TypeNode->getVT();
17790 }
17791 case ISD::AND: {
17792 ConstantSDNode *Constant =
17793 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
17794 if (!Constant)
17795 return MVT::Other;
17796
17797 uint32_t Mask = Constant->getZExtValue();
17798
17799 if (Mask == UCHAR_MAX)
17800 return MVT::i8;
17801 else if (Mask == USHRT_MAX)
17802 return MVT::i16;
17803 else if (Mask == UINT_MAX)
17804 return MVT::i32;
17805
17806 return MVT::Other;
17807 }
17808 default:
17809 return MVT::Other;
17810 }
17811}
17812
17813/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
17814/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
17815/// SExt/ZExt rather than the scalar SExt/ZExt
17816static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
17817 EVT VT = BV.getValueType();
17818 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
17819 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
17820 return SDValue();
17821
17822 // Use the first item in the buildvector/shuffle to get the size of the
17823 // extend, and make sure it looks valid.
17824 SDValue Extend = BV->getOperand(0);
17825 unsigned ExtendOpcode = Extend.getOpcode();
17826 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
17827 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
17828 ExtendOpcode == ISD::AssertSext;
17829 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
17830 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
17831 return SDValue();
17832 // Shuffle inputs are vector, limit to SIGN_EXTEND and ZERO_EXTEND to ensure
17833 // calculatePreExtendType will work without issue.
17834 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
17835 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
17836 return SDValue();
17837
17838 // Restrict valid pre-extend data type
17839 EVT PreExtendType = calculatePreExtendType(Extend);
17840 if (PreExtendType == MVT::Other ||
17841 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
17842 return SDValue();
17843
17844 // Make sure all other operands are equally extended
17845 for (SDValue Op : drop_begin(BV->ops())) {
17846 if (Op.isUndef())
17847 continue;
17848 unsigned Opc = Op.getOpcode();
17849 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
17850 Opc == ISD::AssertSext;
17851 if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
17852 return SDValue();
17853 }
17854
17855 SDValue NBV;
17856 SDLoc DL(BV);
17857 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
17858 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
17859 EVT PreExtendLegalType =
17860 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
17861 SmallVector<SDValue, 8> NewOps;
17862 for (SDValue Op : BV->ops())
17863 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
17864 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
17865 PreExtendLegalType));
17866 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
17867 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
17868 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
17869 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
17870 BV.getOperand(1).isUndef()
17871 ? DAG.getUNDEF(PreExtendVT)
17872 : BV.getOperand(1).getOperand(0),
17873 cast<ShuffleVectorSDNode>(BV)->getMask());
17874 }
17875 return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
17876}
17877
17878/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
17879/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
17880static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
17881 // If the value type isn't a vector, none of the operands are going to be dups
17882 EVT VT = Mul->getValueType(0);
17883 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17884 return SDValue();
17885
17886 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
17887 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
17888
17889 // Neither operands have been changed, don't make any further changes
17890 if (!Op0 && !Op1)
17891 return SDValue();
17892
17893 SDLoc DL(Mul);
17894 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
17895 Op1 ? Op1 : Mul->getOperand(1));
17896}
17897
17898// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
17899// Same for other types with equivalent constants.
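// The And extracts the sign bit of each 16-bit half into bits 0 and 16 of the
// 32-bit lane, and the multiply by 0xffff smears each of those bits across its
// half, so a half becomes 0xffff exactly when it was negative. That is what
// cmlt #0 computes on the v8i16 reinterpretation of the same data.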
17900static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
17901 EVT VT = N->getValueType(0);
17902 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
17903 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
17904 return SDValue();
17905 if (N->getOperand(0).getOpcode() != ISD::AND ||
17906 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
17907 return SDValue();
17908
17909 SDValue And = N->getOperand(0);
17910 SDValue Srl = And.getOperand(0);
17911
17912 APInt V1, V2, V3;
17913 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
17914 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
17915 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
17916 return SDValue();
17917
17918 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
17919 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
17920 V3 != (HalfSize - 1))
17921 return SDValue();
17922
17923 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
17924 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
17925 VT.getVectorElementCount() * 2);
17926
17927 SDLoc DL(N);
17928 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
17929 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
17930 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
17931}
17932
17933// Transform vector add(zext i8 to i32, zext i8 to i32)
17934// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
17935// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
17936// extends.
17937static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
17938 EVT VT = N->getValueType(0);
17939 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
17940 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
17941 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
17942 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
17943 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
17944 N->getOperand(0).getOperand(0).getValueType() !=
17945 N->getOperand(1).getOperand(0).getValueType())
17946 return SDValue();
17947
17948 if (N->getOpcode() == ISD::MUL &&
17949 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
17950 return SDValue();
17951
17952 SDValue N0 = N->getOperand(0).getOperand(0);
17953 SDValue N1 = N->getOperand(1).getOperand(0);
17954 EVT InVT = N0.getValueType();
17955
17956 EVT S1 = InVT.getScalarType();
17957 EVT S2 = VT.getScalarType();
17958 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
17959 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
17960 SDLoc DL(N);
17961 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
17964 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
17965 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
17966 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
17967 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
17968 : (unsigned)ISD::SIGN_EXTEND,
17969 DL, VT, NewOp);
17970 }
17971 return SDValue();
17972}
17973
17974static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
17975 TargetLowering::DAGCombinerInfo &DCI,
17976 const AArch64Subtarget *Subtarget) {
17977
17978 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
17979 return Ext;
17980 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
17981 return Ext;
17982 if (SDValue Ext = performVectorExtCombine(N, DAG))
17983 return Ext;
17984
17985 if (DCI.isBeforeLegalizeOps())
17986 return SDValue();
17987
17988 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
17989 // and in MachineCombiner pass, add+mul will be combined into madd.
17990 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
17991 SDLoc DL(N);
17992 EVT VT = N->getValueType(0);
17993 SDValue N0 = N->getOperand(0);
17994 SDValue N1 = N->getOperand(1);
17995 SDValue MulOper;
17996 unsigned AddSubOpc;
17997
17998 auto IsAddSubWith1 = [&](SDValue V) -> bool {
17999 AddSubOpc = V->getOpcode();
18000 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
18001 SDValue Opnd = V->getOperand(1);
18002 MulOper = V->getOperand(0);
18003 if (AddSubOpc == ISD::SUB)
18004 std::swap(Opnd, MulOper);
18005 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
18006 return C->isOne();
18007 }
18008 return false;
18009 };
18010
18011 if (IsAddSubWith1(N0)) {
18012 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
18013 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
18014 }
18015
18016 if (IsAddSubWith1(N1)) {
18017 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
18018 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
18019 }
18020
18021 // The below optimizations require a constant RHS.
18022 if (!isa<ConstantSDNode>(N1))
18023 return SDValue();
18024
18025 ConstantSDNode *C = cast<ConstantSDNode>(N1);
18026 const APInt &ConstValue = C->getAPIntValue();
18027
18028 // Allow the scaling to be folded into the `cnt` instruction by preventing
18029 // the scaling from being obscured here. This makes it easier to pattern match.
18030 if (IsSVECntIntrinsic(N0) ||
18031 (N0->getOpcode() == ISD::TRUNCATE &&
18032 (IsSVECntIntrinsic(N0->getOperand(0)))))
18033 if (ConstValue.sge(1) && ConstValue.sle(16))
18034 return SDValue();
18035
18036 // Multiplication of a power of two plus/minus one can be done more
18037 // cheaply as shift+add/sub. For now, this is true unilaterally. If
18038 // future CPUs have a cheaper MADD instruction, this may need to be
18039 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
18040 // 64-bit is 5 cycles, so this is always a win.
18041 // More aggressively, some multiplications N0 * C can be lowered to
18042 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
18043 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
18044 // TODO: lower more cases.
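 // For example: x*3 => (x<<1) + x, x*7 => (x<<3) - x, x*12 => ((x<<1) + x) << 2,
 // and (with ALULSLFast) x*45 => MV = (x<<2) + x; (MV<<3) + MV.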
18045
18046 // TrailingZeroes is used to test if the mul can be lowered to
18047 // shift+add+shift.
18048 unsigned TrailingZeroes = ConstValue.countr_zero();
18049 if (TrailingZeroes) {
18050 // Conservatively do not lower to shift+add+shift if the mul might be
18051 // folded into smul or umul.
18052 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
18053 isZeroExtended(N0, DAG)))
18054 return SDValue();
18055 // Conservatively do not lower to shift+add+shift if the mul might be
18056 // folded into madd or msub.
18057 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
18058 N->use_begin()->getOpcode() == ISD::SUB))
18059 return SDValue();
18060 }
18061 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
18062 // and shift+add+shift.
18063 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
18064 unsigned ShiftAmt;
18065
18066 auto Shl = [&](SDValue N0, unsigned N1) {
18067 if (!N0.getNode())
18068 return SDValue();
18069 // If shift causes overflow, ignore this combine.
18070 if (N1 >= N0.getValueSizeInBits())
18071 return SDValue();
18072 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
18073 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
18074 };
18075 auto Add = [&](SDValue N0, SDValue N1) {
18076 if (!N0.getNode() || !N1.getNode())
18077 return SDValue();
18078 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
18079 };
18080 auto Sub = [&](SDValue N0, SDValue N1) {
18081 if (!N0.getNode() || !N1.getNode())
18082 return SDValue();
18083 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
18084 };
18085 auto Negate = [&](SDValue N) {
18086 if (!N0.getNode())
18087 return SDValue();
18088 SDValue Zero = DAG.getConstant(0, DL, VT);
18089 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
18090 };
18091
18092 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), e.g.:
18093 // C = 45 is equal to (1+4)*(1+8), we don't decompose it into (1+2)*(16-1) as
18094 // the (2^N - 1) can't be executed via a single instruction.
18095 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
18096 unsigned BitWidth = C.getBitWidth();
18097 for (unsigned i = 1; i < BitWidth / 2; i++) {
18098 APInt Rem;
18099 APInt X(BitWidth, (1 << i) + 1);
18100 APInt::sdivrem(C, X, N, Rem);
18101 APInt NVMinus1 = N - 1;
18102 if (Rem == 0 && NVMinus1.isPowerOf2()) {
18103 M = X;
18104 return true;
18105 }
18106 }
18107 return false;
18108 };
18109
18110 // Can the const C be decomposed into (2^M + 1) * 2^N + 1, e.g.:
18111 // C = 11 is equal to (1+4)*2+1, we don't decompose it into (1+2)*4-1 as
18112 // the (2^N - 1) can't be executed via a single instruction.
18113 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
18114 APInt CVMinus1 = C - 1;
18115 if (CVMinus1.isNegative())
18116 return false;
18117 unsigned TrailingZeroes = CVMinus1.countr_zero();
18118 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
18119 if (SCVMinus1.isPowerOf2()) {
18120 unsigned BitWidth = SCVMinus1.getBitWidth();
18121 M = APInt(BitWidth, SCVMinus1.logBase2());
18122 N = APInt(BitWidth, TrailingZeroes);
18123 return true;
18124 }
18125 return false;
18126 };
18127
18128 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
18129 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
18130 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
18131 APInt CVMinus1 = C - 1;
18132 if (CVMinus1.isNegative())
18133 return false;
18134 unsigned TrailingZeroes = CVMinus1.countr_zero();
18135 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
18136 if (CVPlus1.isPowerOf2()) {
18137 unsigned BitWidth = CVPlus1.getBitWidth();
18138 M = APInt(BitWidth, CVPlus1.logBase2());
18139 N = APInt(BitWidth, TrailingZeroes);
18140 return true;
18141 }
18142 return false;
18143 };
18144
18145 if (ConstValue.isNonNegative()) {
18146 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
18147 // (mul x, 2^N - 1) => (sub (shl x, N), x)
18148 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
18149 // (mul x, (2^M + 1) * (2^N + 1))
18150 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
18151 // (mul x, (2^M + 1) * 2^N + 1)
18152 // => MV = (add (shl x, M), x); (add (shl MV, N), x)
18153 // (mul x, 1 - (1 - 2^M) * 2^N)
18154 // => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
18155 APInt SCVMinus1 = ShiftedConstValue - 1;
18156 APInt SCVPlus1 = ShiftedConstValue + 1;
18157 APInt CVPlus1 = ConstValue + 1;
18158 APInt CVM, CVN;
18159 if (SCVMinus1.isPowerOf2()) {
18160 ShiftAmt = SCVMinus1.logBase2();
18161 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
18162 } else if (CVPlus1.isPowerOf2()) {
18163 ShiftAmt = CVPlus1.logBase2();
18164 return Sub(Shl(N0, ShiftAmt), N0);
18165 } else if (SCVPlus1.isPowerOf2()) {
18166 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
18167 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
18168 }
18169 if (Subtarget->hasALULSLFast() &&
18170 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
18171 APInt CVMMinus1 = CVM - 1;
18172 APInt CVNMinus1 = CVN - 1;
18173 unsigned ShiftM1 = CVMMinus1.logBase2();
18174 unsigned ShiftN1 = CVNMinus1.logBase2();
18175 // ALULSLFast implies that shifts of <= 4 places are fast
18176 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
18177 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
18178 return Add(Shl(MVal, ShiftN1), MVal);
18179 }
18180 }
18181 if (Subtarget->hasALULSLFast() &&
18182 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
18183 unsigned ShiftM = CVM.getZExtValue();
18184 unsigned ShiftN = CVN.getZExtValue();
18185 // ALULSLFast implies that shifts of <= 4 places are fast
18186 if (ShiftM <= 4 && ShiftN <= 4) {
18187 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
18188 return Add(Shl(MVal, CVN.getZExtValue()), N0);
18189 }
18190 }
18191
18192 if (Subtarget->hasALULSLFast() &&
18193 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
18194 unsigned ShiftM = CVM.getZExtValue();
18195 unsigned ShiftN = CVN.getZExtValue();
18196 // ALULSLFast implies that shifts of <= 4 places are fast
18197 if (ShiftM <= 4 && ShiftN <= 4) {
18198 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
18199 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
18200 }
18201 }
18202 } else {
18203 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
18204 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
18205 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
18206 APInt SCVPlus1 = -ShiftedConstValue + 1;
18207 APInt CVNegPlus1 = -ConstValue + 1;
18208 APInt CVNegMinus1 = -ConstValue - 1;
18209 if (CVNegPlus1.isPowerOf2()) {
18210 ShiftAmt = CVNegPlus1.logBase2();
18211 return Sub(N0, Shl(N0, ShiftAmt));
18212 } else if (CVNegMinus1.isPowerOf2()) {
18213 ShiftAmt = CVNegMinus1.logBase2();
18214 return Negate(Add(Shl(N0, ShiftAmt), N0));
18215 } else if (SCVPlus1.isPowerOf2()) {
18216 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
18217 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
18218 }
18219 }
18220
18221 return SDValue();
18222}
18223
18224static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
18225 SelectionDAG &DAG) {
18226 // Take advantage of vector comparisons producing 0 or -1 in each lane to
18227 // optimize away operation when it's from a constant.
18228 //
18229 // The general transformation is:
18230 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
18231 // AND(VECTOR_CMP(x,y), constant2)
18232 // constant2 = UNARYOP(constant)
18233
18234 // Early exit if this isn't a vector operation, the operand of the
18235 // unary operation isn't a bitwise AND, or if the sizes of the operations
18236 // aren't the same.
18237 EVT VT = N->getValueType(0);
18238 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
18239 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
18240 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
18241 return SDValue();
18242
18243 // Now check that the other operand of the AND is a constant. We could
18244 // make the transformation for non-constant splats as well, but it's unclear
18245 // that would be a benefit as it would not eliminate any operations, just
18246 // perform one more step in scalar code before moving to the vector unit.
18247 if (BuildVectorSDNode *BV =
18248 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
18249 // Bail out if the vector isn't a constant.
18250 if (!BV->isConstant())
18251 return SDValue();
18252
18253 // Everything checks out. Build up the new and improved node.
18254 SDLoc DL(N);
18255 EVT IntVT = BV->getValueType(0);
18256 // Create a new constant of the appropriate type for the transformed
18257 // DAG.
18258 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
18259 // The AND node needs bitcasts to/from an integer vector type around it.
18260 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
18261 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
18262 N->getOperand(0)->getOperand(0), MaskConst);
18263 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
18264 return Res;
18265 }
18266
18267 return SDValue();
18268}
18269
18271 const AArch64Subtarget *Subtarget) {
18272 // First try to optimize away the conversion when it's conditionally from
18273 // a constant. Vectors only.
18275 return Res;
18276
18277 EVT VT = N->getValueType(0);
18278 if (VT != MVT::f32 && VT != MVT::f64)
18279 return SDValue();
18280
18281 // Only optimize when the source and destination types have the same width.
18282 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
18283 return SDValue();
18284
18285 // If the result of an integer load is only used by an integer-to-float
18286 // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
18287 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
18288 SDValue N0 = N->getOperand(0);
18289 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
18290 N0.hasOneUse() &&
18291 // Do not change the width of a volatile load.
18292 !cast<LoadSDNode>(N0)->isVolatile()) {
18293 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
18294 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
18295 LN0->getPointerInfo(), LN0->getAlign(),
18296 LN0->getMemOperand()->getFlags());
18297
18298 // Make sure successors of the original load stay after it by updating them
18299 // to use the new Chain.
18300 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
18301
18302 unsigned Opcode =
18304 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
18305 }
18306
18307 return SDValue();
18308}
18309
18310/// Fold a floating-point multiply by power of two into floating-point to
18311/// fixed-point conversion.
18314 const AArch64Subtarget *Subtarget) {
18315 if (!Subtarget->isNeonAvailable())
18316 return SDValue();
18317
18318 if (!N->getValueType(0).isSimple())
18319 return SDValue();
18320
18321 SDValue Op = N->getOperand(0);
18322 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
18323 return SDValue();
18324
18325 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
18326 return SDValue();
18327
18328 SDValue ConstVec = Op->getOperand(1);
18329 if (!isa<BuildVectorSDNode>(ConstVec))
18330 return SDValue();
18331
18332 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
18333 uint32_t FloatBits = FloatTy.getSizeInBits();
18334 if (FloatBits != 32 && FloatBits != 64 &&
18335 (FloatBits != 16 || !Subtarget->hasFullFP16()))
18336 return SDValue();
18337
18338 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
18339 uint32_t IntBits = IntTy.getSizeInBits();
18340 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
18341 return SDValue();
18342
18343 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
18344 if (IntBits > FloatBits)
18345 return SDValue();
18346
18347 BitVector UndefElements;
18348 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
18349 int32_t Bits = IntBits == 64 ? 64 : 32;
18350 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
18351 if (C == -1 || C == 0 || C > Bits)
18352 return SDValue();
18353
18354 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
18355 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
18356 return SDValue();
18357
18358 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
18359 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
18360 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
18361 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
18362 return SDValue();
18363 }
18364
18365 SDLoc DL(N);
18366 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
18367 N->getOpcode() == ISD::FP_TO_SINT_SAT);
18368 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
18369 : Intrinsic::aarch64_neon_vcvtfp2fxu;
18370 SDValue FixConv =
18372 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
18373 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
18374 // We can handle smaller integers by generating an extra trunc.
18375 if (IntBits < FloatBits)
18376 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
18377
18378 return FixConv;
18379}
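// Illustrative sketch (not from the original file): multiplying by 2^C before
// a float-to-int conversion is exactly what a convert-to-fixed-point with C
// fractional bits computes, which is why the FMUL can be folded into the
// vcvtfp2fxs/vcvtfp2fxu intrinsics above. The helper below is a hypothetical
// scalar model that ignores the rounding and saturation details of the real
// instruction.
static int32_t toFixedPoint(float X, unsigned C) {
  return static_cast<int32_t>(X * static_cast<float>(1u << C));
}
// toFixedPoint(1.5f, 4) == 24, i.e. 1.5 in a fixed-point format with four
// fractional bits.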
18380
18381static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18382 const AArch64TargetLowering &TLI) {
18383 EVT VT = N->getValueType(0);
18384 SelectionDAG &DAG = DCI.DAG;
18385 SDLoc DL(N);
18386 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
18387
18388 if (!VT.isVector())
18389 return SDValue();
18390
18391 if (VT.isScalableVector() && !Subtarget.hasSVE2())
18392 return SDValue();
18393
18394 if (VT.isFixedLengthVector() &&
18395 (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT)))
18396 return SDValue();
18397
18398 SDValue N0 = N->getOperand(0);
18399 if (N0.getOpcode() != ISD::AND)
18400 return SDValue();
18401
18402 SDValue N1 = N->getOperand(1);
18403 if (N1.getOpcode() != ISD::AND)
18404 return SDValue();
18405
18406 // InstCombine does (not (neg a)) => (add a -1).
18407 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
18408 // Loop over all combinations of AND operands.
18409 for (int i = 1; i >= 0; --i) {
18410 for (int j = 1; j >= 0; --j) {
18411 SDValue O0 = N0->getOperand(i);
18412 SDValue O1 = N1->getOperand(j);
18413 SDValue Sub, Add, SubSibling, AddSibling;
18414
18415 // Find a SUB and an ADD operand, one from each AND.
18416 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
18417 Sub = O0;
18418 Add = O1;
18419 SubSibling = N0->getOperand(1 - i);
18420 AddSibling = N1->getOperand(1 - j);
18421 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
18422 Add = O0;
18423 Sub = O1;
18424 AddSibling = N0->getOperand(1 - i);
18425 SubSibling = N1->getOperand(1 - j);
18426 } else
18427 continue;
18428
18430 continue;
18431
18432 // The all-ones constant is always the right-hand operand of the ADD.
18433 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
18434 continue;
18435
18436 if (Sub.getOperand(1) != Add.getOperand(0))
18437 continue;
18438
18439 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
18440 }
18441 }
18442
18443 // (or (and a b) (and (not a) c)) => (bsl a b c)
18444 // We only have to look for constant vectors here since the general, variable
18445 // case can be handled in TableGen.
18446 unsigned Bits = VT.getScalarSizeInBits();
18447 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
18448 for (int i = 1; i >= 0; --i)
18449 for (int j = 1; j >= 0; --j) {
18450 APInt Val1, Val2;
18451
18452 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
18454 (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
18455 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
18456 N0->getOperand(1 - i), N1->getOperand(1 - j));
18457 }
18458 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
18459 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
18460 if (!BVN0 || !BVN1)
18461 continue;
18462
18463 bool FoundMatch = true;
18464 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
18465 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
18466 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
18467 if (!CN0 || !CN1 ||
18468 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
18469 FoundMatch = false;
18470 break;
18471 }
18472 }
18473 if (FoundMatch)
18474 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
18475 N0->getOperand(1 - i), N1->getOperand(1 - j));
18476 }
18477
18478 return SDValue();
18479}
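// Illustrative sketch (not from the original file): the BSP/BSL pattern
// recognised above is the classic bitwise-select identity, shown here on
// scalars. Mask, B and C are stand-ins for the vector operands.
static uint64_t bitwiseSelect(uint64_t Mask, uint64_t B, uint64_t C) {
  // Takes bits from B where Mask is 1 and from C where Mask is 0, which is
  // what (or (and Mask, B), (and (not Mask), C)) computes.
  return (Mask & B) | (~Mask & C);
}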
18480
18481// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
18482// convert to csel(ccmp(.., cc0)), depending on cc1:
18483
18484// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18485// =>
18486// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
18487//
18488// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18489// =>
18490// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
18491static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
18492 EVT VT = N->getValueType(0);
18493 SDValue CSel0 = N->getOperand(0);
18494 SDValue CSel1 = N->getOperand(1);
18495
18496 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
18497 CSel1.getOpcode() != AArch64ISD::CSEL)
18498 return SDValue();
18499
18500 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
18501 return SDValue();
18502
18503 if (!isNullConstant(CSel0.getOperand(0)) ||
18504 !isOneConstant(CSel0.getOperand(1)) ||
18505 !isNullConstant(CSel1.getOperand(0)) ||
18506 !isOneConstant(CSel1.getOperand(1)))
18507 return SDValue();
18508
18509 SDValue Cmp0 = CSel0.getOperand(3);
18510 SDValue Cmp1 = CSel1.getOperand(3);
18513 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
18514 return SDValue();
18515 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
18516 Cmp0.getOpcode() == AArch64ISD::SUBS) {
18517 std::swap(Cmp0, Cmp1);
18518 std::swap(CC0, CC1);
18519 }
18520
18521 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
18522 return SDValue();
18523
18524 SDLoc DL(N);
18525 SDValue CCmp, Condition;
18526 unsigned NZCV;
18527
18528 if (N->getOpcode() == ISD::AND) {
18530 Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
18532 } else {
18534 Condition = DAG.getConstant(CC0, DL, MVT_CC);
18536 }
18537
18538 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
18539
18540 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
18541 if (Op1 && Op1->getAPIntValue().isNegative() &&
18542 Op1->getAPIntValue().sgt(-32)) {
18543 // CCMP accepts constants in the range [0, 31],
18544 // so if Op1 is a constant in the range [-31, -1], we
18545 // can select CCMN instead to avoid the extra mov.
18546 SDValue AbsOp1 =
18547 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
18548 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
18549 NZCVOp, Condition, Cmp0);
18550 } else {
18551 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
18552 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
18553 }
18554 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
18555 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
18556 CCmp);
18557}
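// Illustrative sketch (not from the original file): a source-level shape that
// tends to reach this combine. The logical AND (or OR) of two integer
// comparisons is usually selected as a CMP followed by a conditional compare
// (CCMP/CCMN) and a single CSET, rather than two CSETs plus an AND; the exact
// codegen depends on the surrounding code and optimisation level.
static bool bothConditionsHold(int A, int B) {
  return A > 0 && B < 16; // cmp + ccmp + cset candidate
}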
18558
18560 const AArch64Subtarget *Subtarget,
18561 const AArch64TargetLowering &TLI) {
18562 SelectionDAG &DAG = DCI.DAG;
18563 EVT VT = N->getValueType(0);
18564
18565 if (SDValue R = performANDORCSELCombine(N, DAG))
18566 return R;
18567
18568 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18569 return SDValue();
18570
18571 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
18572 return Res;
18573
18574 return SDValue();
18575}
18576
18578 if (!MemVT.getVectorElementType().isSimple())
18579 return false;
18580
18581 uint64_t MaskForTy = 0ull;
18582 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
18583 case MVT::i8:
18584 MaskForTy = 0xffull;
18585 break;
18586 case MVT::i16:
18587 MaskForTy = 0xffffull;
18588 break;
18589 case MVT::i32:
18590 MaskForTy = 0xffffffffull;
18591 break;
18592 default:
18593 return false;
18594 break;
18595 }
18596
18597 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
18598 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
18599 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
18600
18601 return false;
18602}
18603
18605 SDValue LeafOp = SDValue(N, 0);
18606 SDValue Op = N->getOperand(0);
18607 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
18608 LeafOp.getValueType() != Op.getValueType())
18609 Op = Op->getOperand(0);
18610 if (LeafOp.getValueType() == Op.getValueType())
18611 return Op;
18612 return SDValue();
18613}
18614
18615static SDValue performSVEAndCombine(SDNode *N,
18616 TargetLowering::DAGCombinerInfo &DCI) {
18617 SelectionDAG &DAG = DCI.DAG;
18618 SDValue Src = N->getOperand(0);
18619 unsigned Opc = Src->getOpcode();
18620
18621 // Zero/any extend of an unsigned unpack
18622 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
18623 SDValue UnpkOp = Src->getOperand(0);
18624 SDValue Dup = N->getOperand(1);
18625
18626 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
18627 return SDValue();
18628
18629 SDLoc DL(N);
18630 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
18631 if (!C)
18632 return SDValue();
18633
18634 uint64_t ExtVal = C->getZExtValue();
18635
18636 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
18637 return ((ExtVal == 0xFF && VT == MVT::i8) ||
18638 (ExtVal == 0xFFFF && VT == MVT::i16) ||
18639 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
18640 };
18641
18642 // If the mask is fully covered by the unpack, we don't need to push
18643 // a new AND onto the operand
18644 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
18645 if (MaskAndTypeMatch(EltTy))
18646 return Src;
18647
18648 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
18649 // to see if the mask is all-ones of size MemTy.
18650 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
18651 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
18652 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
18653 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
18654 if (MaskAndTypeMatch(EltTy))
18655 return Src;
18656 }
18657
18658 // Truncate to prevent a DUP with an over-wide constant.
18659 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
18660
18661 // Otherwise, make sure we propagate the AND to the operand
18662 // of the unpack
18663 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
18664 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
18665
18666 SDValue And = DAG.getNode(ISD::AND, DL,
18667 UnpkOp->getValueType(0), UnpkOp, Dup);
18668
18669 return DAG.getNode(Opc, DL, N->getValueType(0), And);
18670 }
18671
18672 if (DCI.isBeforeLegalizeOps())
18673 return SDValue();
18674
18675 // If either side of the AND is an all-active predicate then the AND is
18676 // redundant and we can just return the other operand.
18677 if (isAllActivePredicate(DAG, N->getOperand(0)))
18678 return N->getOperand(1);
18679 if (isAllActivePredicate(DAG, N->getOperand(1)))
18680 return N->getOperand(0);
18681
18683 return SDValue();
18684
18685 SDValue Mask = N->getOperand(1);
18686
18687 if (!Src.hasOneUse())
18688 return SDValue();
18689
18690 EVT MemVT;
18691
18692 // SVE load instructions perform an implicit zero-extend, which makes them
18693 // perfect candidates for combining.
18694 switch (Opc) {
18698 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
18699 break;
18715 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
18716 break;
18717 default:
18718 return SDValue();
18719 }
18720
18721 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
18722 return Src;
18723
18724 return SDValue();
18725}
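// Illustrative sketch (not from the original file): the redundancy exploited
// above, shown on scalars. A value produced by a zero-extending narrow load
// already has its high bits clear, so ANDing it with the all-ones mask of the
// memory type changes nothing.
static uint32_t maskAfterZeroExtendingLoad(const uint8_t *P) {
  uint32_t Loaded = *P;  // zero-extending i8 load
  return Loaded & 0xFFu; // the AND is a no-op
}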
18726
18727// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
18728static SDValue performANDSETCCCombine(SDNode *N,
18729 TargetLowering::DAGCombinerInfo &DCI) {
18730
18731 // This function performs an optimization on a specific pattern involving
18732 // an AND operation and SETCC (Set Condition Code) node.
18733
18734 SDValue SetCC = N->getOperand(0);
18735 EVT VT = N->getValueType(0);
18736 SelectionDAG &DAG = DCI.DAG;
18737
18738 // If the current node (N) is used by any SELECT instruction, return an
18739 // empty SDValue and skip the optimization, as applying it in that case
18740 // could produce incorrect results.
18741 for (auto U : N->uses())
18742 if (U->getOpcode() == ISD::SELECT)
18743 return SDValue();
18744
18745 // Check if the operand is a SETCC node with floating-point comparison
18746 if (SetCC.getOpcode() == ISD::SETCC &&
18747 SetCC.getOperand(0).getValueType() == MVT::f32) {
18748
18749 SDValue Cmp;
18751
18752 // Check if the DAG is after legalization and if we can emit the conjunction
18753 if (!DCI.isBeforeLegalize() &&
18754 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
18755
18757
18758 SDLoc DL(N);
18759 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
18760 DAG.getConstant(0, DL, VT),
18761 DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
18762 }
18763 }
18764 return SDValue();
18765}
18766
18769 SelectionDAG &DAG = DCI.DAG;
18770 SDValue LHS = N->getOperand(0);
18771 SDValue RHS = N->getOperand(1);
18772 EVT VT = N->getValueType(0);
18773
18774 if (SDValue R = performANDORCSELCombine(N, DAG))
18775 return R;
18776
18777 if (SDValue R = performANDSETCCCombine(N, DCI))
18778 return R;
18779
18780 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18781 return SDValue();
18782
18783 if (VT.isScalableVector())
18784 return performSVEAndCombine(N, DCI);
18785
18786 // The combining code below works only for NEON vectors. In particular, it
18787 // does not work for SVE when dealing with vectors wider than 128 bits.
18788 if (!VT.is64BitVector() && !VT.is128BitVector())
18789 return SDValue();
18790
18791 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
18792 if (!BVN)
18793 return SDValue();
18794
18795 // AND does not accept an immediate, so check if we can use a BIC immediate
18796 // instruction instead. We do this here instead of using a (and x, (mvni imm))
18797 // pattern in isel, because some immediates may be lowered to the preferred
18798 // (and x, (movi imm)) form, even though an mvni representation also exists.
18799 APInt DefBits(VT.getSizeInBits(), 0);
18800 APInt UndefBits(VT.getSizeInBits(), 0);
18801 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
18802 SDValue NewOp;
18803
18804 // Any bits known to already be 0 need not be cleared again, which can help
18805 // reduce the size of the immediate to one supported by the instruction.
18806 KnownBits Known = DAG.computeKnownBits(LHS);
18807 APInt ZeroSplat(VT.getSizeInBits(), 0);
18808 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
18809 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
18810 << (Known.Zero.getBitWidth() * I);
18811
18812 DefBits = ~(DefBits | ZeroSplat);
18813 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18814 DefBits, &LHS)) ||
18815 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18816 DefBits, &LHS)))
18817 return NewOp;
18818
18819 UndefBits = ~(UndefBits | ZeroSplat);
18820 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18821 UndefBits, &LHS)) ||
18822 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18823 UndefBits, &LHS)))
18824 return NewOp;
18825 }
18826
18827 return SDValue();
18828}
18829
18832 SelectionDAG &DAG = DCI.DAG;
18833 SDValue LHS = N->getOperand(0);
18834 SDValue RHS = N->getOperand(1);
18835 EVT VT = N->getValueType(0);
18836 SDLoc DL(N);
18837
18838 if (!N->getFlags().hasAllowReassociation())
18839 return SDValue();
18840
18841 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
18842 auto ReassocComplex = [&](SDValue A, SDValue B) {
18843 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
18844 return SDValue();
18845 unsigned Opc = A.getConstantOperandVal(0);
18846 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
18847 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
18848 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
18849 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
18850 return SDValue();
18851 SDValue VCMLA = DAG.getNode(
18852 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
18853 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
18854 A.getOperand(2), A.getOperand(3));
18855 VCMLA->setFlags(A->getFlags());
18856 return VCMLA;
18857 };
18858 if (SDValue R = ReassocComplex(LHS, RHS))
18859 return R;
18860 if (SDValue R = ReassocComplex(RHS, LHS))
18861 return R;
18862
18863 return SDValue();
18864}
18865
18866static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
18867 switch (Opcode) {
18868 case ISD::STRICT_FADD:
18869 case ISD::FADD:
18870 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
18871 case ISD::ADD:
18872 return VT == MVT::i64;
18873 default:
18874 return false;
18875 }
18876}
18877
18878static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
18880
18882 if ((N.getOpcode() == ISD::SETCC) ||
18883 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
18884 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
18885 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
18886 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
18887 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
18888 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
18889 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
18890 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
18891 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
18892 // get_active_lane_mask is lowered to a whilelo instruction.
18893 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
18894 return true;
18895
18896 return false;
18897}
18898
18899// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
18900// ... into: "ptrue p, all" + PTEST
18901static SDValue
18902performFirstTrueTestVectorCombine(SDNode *N,
18903 TargetLowering::DAGCombinerInfo &DCI,
18904 const AArch64Subtarget *Subtarget) {
18905 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18906 // Make sure PTEST can be legalised with illegal types.
18907 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18908 return SDValue();
18909
18910 SDValue N0 = N->getOperand(0);
18911 EVT VT = N0.getValueType();
18912
18913 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
18914 !isNullConstant(N->getOperand(1)))
18915 return SDValue();
18916
18917 // Restrict the DAG combine to only cases where we're extracting from a
18918 // flag-setting operation.
18919 if (!isPredicateCCSettingOp(N0))
18920 return SDValue();
18921
18922 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
18923 SelectionDAG &DAG = DCI.DAG;
18924 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
18925 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
18926}
18927
18928// Materialize : Idx = (add (mul vscale, NumEls), -1)
18929// i1 = extract_vector_elt t37, Constant:i64<Idx>
18930// ... into: "ptrue p, all" + PTEST
18931static SDValue
18933performLastTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18934 const AArch64Subtarget *Subtarget) {
18935 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18936 // Make sure PTEST can be legalised with illegal types.
18937 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18938 return SDValue();
18939
18940 SDValue N0 = N->getOperand(0);
18941 EVT OpVT = N0.getValueType();
18942
18943 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
18944 return SDValue();
18945
18946 // Idx == (add (mul vscale, NumEls), -1)
18947 SDValue Idx = N->getOperand(1);
18948 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
18949 return SDValue();
18950
18951 SDValue VS = Idx.getOperand(0);
18952 if (VS.getOpcode() != ISD::VSCALE)
18953 return SDValue();
18954
18955 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
18956 if (VS.getConstantOperandVal(0) != NumEls)
18957 return SDValue();
18958
18959 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
18960 SelectionDAG &DAG = DCI.DAG;
18961 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
18962 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
18963}
18964
18965static SDValue
18967 const AArch64Subtarget *Subtarget) {
18968 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18969 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
18970 return Res;
18971 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
18972 return Res;
18973
18974 SelectionDAG &DAG = DCI.DAG;
18975 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18976
18977 EVT VT = N->getValueType(0);
18978 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
18979 bool IsStrict = N0->isStrictFPOpcode();
18980
18981 // extract(dup x) -> x
18982 if (N0.getOpcode() == AArch64ISD::DUP)
18983 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
18984 : N0.getOperand(0);
18985
18986 // Rewrite for pairwise fadd pattern
18987 // (f32 (extract_vector_elt
18988 // (fadd (vXf32 Other)
18989 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
18990 // ->
18991 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
18992 // (extract_vector_elt (vXf32 Other) 1))
18993 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
18994 // we can only do this when it's used only by the extract_vector_elt.
18995 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
18996 (!IsStrict || N0.hasOneUse())) {
18997 SDLoc DL(N0);
18998 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
18999 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
19000
19001 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
19002 SDValue Other = N00;
19003
19004 // And handle the commutative case.
19005 if (!Shuffle) {
19006 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
19007 Other = N01;
19008 }
19009
19010 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
19011 Other == Shuffle->getOperand(0)) {
19012 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
19013 DAG.getConstant(0, DL, MVT::i64));
19014 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
19015 DAG.getConstant(1, DL, MVT::i64));
19016 if (!IsStrict)
19017 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
19018
19019 // For strict_fadd we need uses of the final extract_vector to be replaced
19020 // with the strict_fadd, but we also need uses of the chain output of the
19021 // original strict_fadd to use the chain output of the new strict_fadd as
19022 // otherwise it may not be deleted.
19023 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
19024 {VT, MVT::Other},
19025 {N0->getOperand(0), Extract1, Extract2});
19026 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
19027 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
19028 return SDValue(N, 0);
19029 }
19030 }
19031
19032 return SDValue();
19033}
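// Illustrative sketch (not from the original file): the pairwise-FADD rewrite
// above relies on the scalar identity shown here. Extracting lane 0 of
// fadd(v, shuffle(v, <1, ...>)) is simply v[0] + v[1], which is the pattern
// the scalar FADDP instruction implements.
static float pairwiseAddLowLanes(const float V[2]) {
  return V[0] + V[1];
}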
19034
19037 SelectionDAG &DAG) {
19038 SDLoc dl(N);
19039 EVT VT = N->getValueType(0);
19040 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
19041 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
19042
19043 if (VT.isScalableVector())
19044 return SDValue();
19045
19046 // Optimize concat_vectors of truncated vectors, where the intermediate
19047 // type is illegal, to avoid said illegality, e.g.,
19048 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
19049 // (v2i16 (truncate (v2i64)))))
19050 // ->
19051 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
19052 // (v4i32 (bitcast (v2i64))),
19053 // <0, 2, 4, 6>)))
19054 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
19055 // on both input and result type, so we might generate worse code.
19056 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
19057 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
19058 N1Opc == ISD::TRUNCATE) {
19059 SDValue N00 = N0->getOperand(0);
19060 SDValue N10 = N1->getOperand(0);
19061 EVT N00VT = N00.getValueType();
19062
19063 if (N00VT == N10.getValueType() &&
19064 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
19065 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
19066 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
19068 for (size_t i = 0; i < Mask.size(); ++i)
19069 Mask[i] = i * 2;
19070 return DAG.getNode(ISD::TRUNCATE, dl, VT,
19071 DAG.getVectorShuffle(
19072 MidVT, dl,
19073 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
19074 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
19075 }
19076 }
19077
19078 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
19079 N->getOperand(0).getValueType() == MVT::v2i16 ||
19080 N->getOperand(0).getValueType() == MVT::v2i8) {
19081 EVT SrcVT = N->getOperand(0).getValueType();
19082 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
19083 // loads to prevent having to go through the v4i8 load legalization that
19084 // needs to extend each element into a larger type.
19085 if (N->getNumOperands() % 2 == 0 &&
19086 all_of(N->op_values(), [SrcVT](SDValue V) {
19087 if (V.getValueType() != SrcVT)
19088 return false;
19089 if (V.isUndef())
19090 return true;
19091 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
19092 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
19093 LD->getExtensionType() == ISD::NON_EXTLOAD;
19094 })) {
19095 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
19096 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
19098
19099 for (unsigned i = 0; i < N->getNumOperands(); i++) {
19100 SDValue V = N->getOperand(i);
19101 if (V.isUndef())
19102 Ops.push_back(DAG.getUNDEF(FVT));
19103 else {
19104 LoadSDNode *LD = cast<LoadSDNode>(V);
19105 SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
19106 LD->getBasePtr(), LD->getMemOperand());
19107 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
19108 Ops.push_back(NewLoad);
19109 }
19110 }
19111 return DAG.getBitcast(N->getValueType(0),
19112 DAG.getBuildVector(NVT, dl, Ops));
19113 }
19114 }
19115
19116 // Canonicalise concat_vectors to replace concatenations of truncated nots
19117 // with nots of concatenated truncates. This in some cases allows for multiple
19118 // redundant negations to be eliminated.
19119 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
19120 // (v4i16 (truncate (not (v4i32)))))
19121 // ->
19122 // (not (concat_vectors (v4i16 (truncate (v4i32))),
19123 // (v4i16 (truncate (v4i32)))))
19124 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
19125 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
19126 N->isOnlyUserOf(N1.getNode())) {
19127 auto isBitwiseVectorNegate = [](SDValue V) {
19128 return V->getOpcode() == ISD::XOR &&
19129 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
19130 };
19131 SDValue N00 = N0->getOperand(0);
19132 SDValue N10 = N1->getOperand(0);
19133 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
19134 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
19135 return DAG.getNOT(
19136 dl,
19137 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
19138 DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
19139 N00->getOperand(0)),
19140 DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
19141 N10->getOperand(0))),
19142 VT);
19143 }
19144 }
19145
19146 // Wait till after everything is legalized to try this. That way we have
19147 // legal vector types and such.
19148 if (DCI.isBeforeLegalizeOps())
19149 return SDValue();
19150
19151 // Optimise concat_vectors of two identical binops with a 128-bit destination
19152 // size, combine into a binop of two concats of the source vectors, e.g.:
19153 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
19154 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
19155 DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() &&
19156 N1->hasOneUse()) {
19157 SDValue N00 = N0->getOperand(0);
19158 SDValue N01 = N0->getOperand(1);
19159 SDValue N10 = N1->getOperand(0);
19160 SDValue N11 = N1->getOperand(1);
19161
19162 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
19163 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N00, N10);
19164 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N01, N11);
19165 return DAG.getNode(N0Opc, dl, VT, Concat0, Concat1);
19166 }
19167 }
19168
19169 auto IsRSHRN = [](SDValue Shr) {
19170 if (Shr.getOpcode() != AArch64ISD::VLSHR)
19171 return false;
19172 SDValue Op = Shr.getOperand(0);
19173 EVT VT = Op.getValueType();
19174 unsigned ShtAmt = Shr.getConstantOperandVal(1);
19175 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
19176 return false;
19177
19178 APInt Imm;
19179 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
19180 Imm = APInt(VT.getScalarSizeInBits(),
19181 Op.getOperand(1).getConstantOperandVal(0)
19182 << Op.getOperand(1).getConstantOperandVal(1));
19183 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
19184 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
19185 Imm = APInt(VT.getScalarSizeInBits(),
19186 Op.getOperand(1).getConstantOperandVal(0));
19187 else
19188 return false;
19189
19190 if (Imm != 1ULL << (ShtAmt - 1))
19191 return false;
19192 return true;
19193 };
19194
19195 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
19196 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
19197 ((IsRSHRN(N1) &&
19199 N1.isUndef())) {
19200 SDValue X = N0.getOperand(0).getOperand(0);
19201 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
19202 : N1.getOperand(0).getOperand(0);
19203 EVT BVT =
19204 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
19205 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y);
19206 SDValue Add = DAG.getNode(
19207 ISD::ADD, dl, BVT, CC,
19208 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT));
19209 SDValue Shr =
19210 DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1));
19211 return Shr;
19212 }
19213
19214 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
19215 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
19216 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
19217 N0.getOperand(1) == N1.getOperand(1)) {
19218 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0),
19219 DAG.getUNDEF(N0.getValueType()));
19220 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1),
19221 DAG.getUNDEF(N0.getValueType()));
19222 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1);
19223 }
19224
19225 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
19226 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
19227 // canonicalise to that.
19228 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
19229 assert(VT.getScalarSizeInBits() == 64);
19230 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
19231 DAG.getConstant(0, dl, MVT::i64));
19232 }
19233
19234 // Canonicalise concat_vectors so that the right-hand vector has as few
19235 // bit-casts as possible before its real operation. The primary matching
19236 // destination for these operations will be the narrowing "2" instructions,
19237 // which depend on the operation being performed on this right-hand vector.
19238 // For example,
19239 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
19240 // becomes
19241 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
19242
19243 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
19244 return SDValue();
19245 SDValue RHS = N1->getOperand(0);
19246 MVT RHSTy = RHS.getValueType().getSimpleVT();
19247 // If the RHS is not a vector, this is not the pattern we're looking for.
19248 if (!RHSTy.isVector())
19249 return SDValue();
19250
19251 LLVM_DEBUG(
19252 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
19253
19254 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
19255 RHSTy.getVectorNumElements() * 2);
19256 return DAG.getNode(ISD::BITCAST, dl, VT,
19257 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
19258 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
19259 RHS));
19260}
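// Illustrative sketch (not from the original file): the concat-of-truncates
// rewrite above, modelled element-wise. Truncating two vectors and then
// concatenating the results is the same as concatenating first and truncating
// once, which avoids materialising the illegal intermediate type.
static void truncateThenConcat(const uint64_t A[2], const uint64_t B[2],
                               uint16_t Out[4]) {
  Out[0] = static_cast<uint16_t>(A[0]);
  Out[1] = static_cast<uint16_t>(A[1]);
  Out[2] = static_cast<uint16_t>(B[0]);
  Out[3] = static_cast<uint16_t>(B[1]);
}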
19261
19262static SDValue
19264 SelectionDAG &DAG) {
19265 if (DCI.isBeforeLegalizeOps())
19266 return SDValue();
19267
19268 EVT VT = N->getValueType(0);
19269 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
19270 return SDValue();
19271
19272 SDValue V = N->getOperand(0);
19273
19274 // NOTE: This combine exists in DAGCombiner, but that version's legality check
19275 // blocks this combine because the non-const case requires custom lowering.
19276 //
19277 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
19278 if (V.getOpcode() == ISD::SPLAT_VECTOR)
19279 if (isa<ConstantSDNode>(V.getOperand(0)))
19280 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
19281
19282 return SDValue();
19283}
19284
19285static SDValue
19287 SelectionDAG &DAG) {
19288 SDLoc DL(N);
19289 SDValue Vec = N->getOperand(0);
19290 SDValue SubVec = N->getOperand(1);
19291 uint64_t IdxVal = N->getConstantOperandVal(2);
19292 EVT VecVT = Vec.getValueType();
19293 EVT SubVT = SubVec.getValueType();
19294
19295 // Only do this for legal fixed vector types.
19296 if (!VecVT.isFixedLengthVector() ||
19297 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
19298 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
19299 return SDValue();
19300
19301 // Ignore widening patterns.
19302 if (IdxVal == 0 && Vec.isUndef())
19303 return SDValue();
19304
19305 // Subvector must be half the width and an "aligned" insertion.
19306 unsigned NumSubElts = SubVT.getVectorNumElements();
19307 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
19308 (IdxVal != 0 && IdxVal != NumSubElts))
19309 return SDValue();
19310
19311 // Fold insert_subvector -> concat_vectors
19312 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
19313 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
19314 SDValue Lo, Hi;
19315 if (IdxVal == 0) {
19316 Lo = SubVec;
19317 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
19318 DAG.getVectorIdxConstant(NumSubElts, DL));
19319 } else {
19320 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
19321 DAG.getVectorIdxConstant(0, DL));
19322 Hi = SubVec;
19323 }
19324 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
19325}
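// Illustrative sketch (not from the original file): inserting a half-width
// subvector at an "aligned" position is just a concatenation, modelled here
// with arrays. Inserting at index 0 keeps the old high half, and inserting at
// NumSubElts keeps the old low half.
static void insertSubvectorAtLow(int Vec[4], const int Sub[2]) {
  // insert_subvector(Vec, Sub, 0) == concat_vectors(Sub, extract_hi(Vec))
  Vec[0] = Sub[0];
  Vec[1] = Sub[1];
}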
19326
19329 SelectionDAG &DAG) {
19330 // Wait until after everything is legalized to try this. That way we have
19331 // legal vector types and such.
19332 if (DCI.isBeforeLegalizeOps())
19333 return SDValue();
19334 // Transform a scalar conversion of a value from a lane extract into a
19335 // lane extract of a vector conversion. E.g., from foo1 to foo2:
19336 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
19337 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
19338 //
19339 // The second form interacts better with instruction selection and the
19340 // register allocator to avoid cross-class register copies that aren't
19341 // coalescable due to a lane reference.
19342
19343 // Check the operand and see if it originates from a lane extract.
19344 SDValue Op1 = N->getOperand(1);
19346 return SDValue();
19347
19348 // Yep, no additional predication needed. Perform the transform.
19349 SDValue IID = N->getOperand(0);
19350 SDValue Shift = N->getOperand(2);
19351 SDValue Vec = Op1.getOperand(0);
19352 SDValue Lane = Op1.getOperand(1);
19353 EVT ResTy = N->getValueType(0);
19354 EVT VecResTy;
19355 SDLoc DL(N);
19356
19357 // The vector width should be 128 bits by the time we get here, even
19358 // if it started as 64 bits (the extract_vector handling will have
19359 // done so). Bail if it is not.
19360 if (Vec.getValueSizeInBits() != 128)
19361 return SDValue();
19362
19363 if (Vec.getValueType() == MVT::v4i32)
19364 VecResTy = MVT::v4f32;
19365 else if (Vec.getValueType() == MVT::v2i64)
19366 VecResTy = MVT::v2f64;
19367 else
19368 return SDValue();
19369
19370 SDValue Convert =
19371 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
19372 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
19373}
19374
19375// AArch64 high-vector "long" operations are formed by performing the non-high
19376// version on an extract_subvector of each operand which gets the high half:
19377//
19378// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
19379//
19380// However, there are cases which don't have an extract_high explicitly, but
19381// have another operation that can be made compatible with one for free. For
19382// example:
19383//
19384// (dupv64 scalar) --> (extract_high (dup128 scalar))
19385//
19386// This routine does the actual conversion of such DUPs, once outer routines
19387// have determined that everything else is in order.
19388// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
19389// similarly here.
19391 MVT VT = N.getSimpleValueType();
19392 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
19393 N.getConstantOperandVal(1) == 0)
19394 N = N.getOperand(0);
19395
19396 switch (N.getOpcode()) {
19397 case AArch64ISD::DUP:
19402 case AArch64ISD::MOVI:
19408 break;
19409 default:
19410 // FMOV could be supported, but isn't very useful, as it would only occur
19411 // if you passed a bitcast'd floating point immediate to an eligible long
19412 // integer op (addl, smull, ...).
19413 return SDValue();
19414 }
19415
19416 if (!VT.is64BitVector())
19417 return SDValue();
19418
19419 SDLoc DL(N);
19420 unsigned NumElems = VT.getVectorNumElements();
19421 if (N.getValueType().is64BitVector()) {
19422 MVT ElementTy = VT.getVectorElementType();
19423 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
19424 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
19425 }
19426
19427 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
19428 DAG.getConstant(NumElems, DL, MVT::i64));
19429}
19430
19432 if (N.getOpcode() == ISD::BITCAST)
19433 N = N.getOperand(0);
19434 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19435 return false;
19436 if (N.getOperand(0).getValueType().isScalableVector())
19437 return false;
19438 return N.getConstantOperandAPInt(1) ==
19439 N.getOperand(0).getValueType().getVectorNumElements() / 2;
19440}
19441
19442/// Helper structure to keep track of ISD::SET_CC operands.
19447};
19448
19449/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
19451 const SDValue *Cmp;
19453};
19454
19455/// Helper structure to keep track of SetCC information.
19459};
19460
19461/// Helper structure to be able to read SetCC information. If the IsAArch64
19462/// field is set to true, Info is an AArch64SetCCInfo, otherwise Info is a
19463/// GenericSetCCInfo.
19467};
19468
19469/// Check whether or not \p Op is a SET_CC operation, either a generic or
19470/// an
19471/// AArch64 lowered one.
19472/// \p SetCCInfo is filled accordingly.
19473/// \post SetCCInfo is meaningful only when this function returns true.
19474/// \return True when Op is a kind of SET_CC operation.
19476 // If this is a setcc, this is straightforward.
19477 if (Op.getOpcode() == ISD::SETCC) {
19478 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
19479 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
19480 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
19481 SetCCInfo.IsAArch64 = false;
19482 return true;
19483 }
19484 // Otherwise, check if this is a matching csel instruction.
19485 // In other words:
19486 // - csel 1, 0, cc
19487 // - csel 0, 1, !cc
19488 if (Op.getOpcode() != AArch64ISD::CSEL)
19489 return false;
19490 // Set the information about the operands.
19491 // TODO: we want the operands of the Cmp not the csel
19492 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
19493 SetCCInfo.IsAArch64 = true;
19494 SetCCInfo.Info.AArch64.CC =
19495 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19496
19497 // Check that the operands match the constraints:
19498 // (1) Both operands must be constants.
19499 // (2) One must be 1 and the other must be 0.
19500 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
19501 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
19502
19503 // Check (1).
19504 if (!TValue || !FValue)
19505 return false;
19506
19507 // Check (2).
19508 if (!TValue->isOne()) {
19509 // Update the comparison when we are interested in !cc.
19510 std::swap(TValue, FValue);
19511 SetCCInfo.Info.AArch64.CC =
19513 }
19514 return TValue->isOne() && FValue->isZero();
19515}
19516
19517// Returns true if Op is setcc or zext of setcc.
19518static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
19519 if (isSetCC(Op, Info))
19520 return true;
19521 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
19522 isSetCC(Op->getOperand(0), Info));
19523}
19524
19525// The folding we want to perform is:
19526// (add x, [zext] (setcc cc ...) )
19527// -->
19528// (csel x, (add x, 1), !cc ...)
19529//
19530// The latter will get matched to a CSINC instruction.
19531static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
19532 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
19533 SDValue LHS = Op->getOperand(0);
19534 SDValue RHS = Op->getOperand(1);
19535 SetCCInfoAndKind InfoAndKind;
19536
19537 // If both operands are a SET_CC, then we don't want to perform this
19538 // folding and create another csel as this results in more instructions
19539 // (and higher register usage).
19540 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
19541 isSetCCOrZExtSetCC(RHS, InfoAndKind))
19542 return SDValue();
19543
19544 // If neither operand is a SET_CC, give up.
19545 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
19546 std::swap(LHS, RHS);
19547 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
19548 return SDValue();
19549 }
19550
19551 // FIXME: This could be generalized to work for FP comparisons.
19552 EVT CmpVT = InfoAndKind.IsAArch64
19553 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
19554 : InfoAndKind.Info.Generic.Opnd0->getValueType();
19555 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
19556 return SDValue();
19557
19558 SDValue CCVal;
19559 SDValue Cmp;
19560 SDLoc dl(Op);
19561 if (InfoAndKind.IsAArch64) {
19562 CCVal = DAG.getConstant(
19564 MVT::i32);
19565 Cmp = *InfoAndKind.Info.AArch64.Cmp;
19566 } else
19567 Cmp = getAArch64Cmp(
19568 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
19569 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
19570 dl);
19571
19572 EVT VT = Op->getValueType(0);
19573 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
19574 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
19575}
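// Illustrative sketch (not from the original file): the source-level shape
// this fold targets. Adding the result of a comparison to an integer is
// usually selected as a CMP plus CSINC (via the CSEL form built above) rather
// than a separate CSET and ADD; the exact codegen can vary.
static int addComparisonResult(int X, int A, int B) {
  return X + (A < B ? 1 : 0); // cmp + csinc candidate
}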
19576
19577// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
19579 EVT VT = N->getValueType(0);
19580 // Only scalar integer and vector types.
19581 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
19582 return SDValue();
19583
19584 SDValue LHS = N->getOperand(0);
19585 SDValue RHS = N->getOperand(1);
19586 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19587 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
19588 return SDValue();
19589
19590 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
19591 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
19592 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
19593 return SDValue();
19594
19595 SDValue Op1 = LHS->getOperand(0);
19596 SDValue Op2 = RHS->getOperand(0);
19597 EVT OpVT1 = Op1.getValueType();
19598 EVT OpVT2 = Op2.getValueType();
19599 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
19600 Op2.getOpcode() != AArch64ISD::UADDV ||
19601 OpVT1.getVectorElementType() != VT)
19602 return SDValue();
19603
19604 SDValue Val1 = Op1.getOperand(0);
19605 SDValue Val2 = Op2.getOperand(0);
19606 EVT ValVT = Val1->getValueType(0);
19607 SDLoc DL(N);
19608 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
19609 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
19610 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
19611 DAG.getConstant(0, DL, MVT::i64));
19612}
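// Illustrative sketch (not from the original file): the UADDV rewrite above
// uses the fact that the sum of two horizontal reductions equals the
// horizontal reduction of the element-wise sum, shown here on plain arrays.
static unsigned sumOfTwoReductions(const unsigned A[4], const unsigned B[4]) {
  unsigned Total = 0;
  for (int I = 0; I < 4; ++I)
    Total += A[I] + B[I]; // == uaddv(A) + uaddv(B) == uaddv(add(A, B))
  return Total;
}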
19613
19614/// Perform the scalar expression combine in the form of:
19615/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
19616/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
19618 EVT VT = N->getValueType(0);
19619 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
19620 return SDValue();
19621
19622 SDValue LHS = N->getOperand(0);
19623 SDValue RHS = N->getOperand(1);
19624
19625 // Handle commutativity.
19626 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19627 LHS.getOpcode() != AArch64ISD::CSNEG) {
19628 std::swap(LHS, RHS);
19629 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19630 LHS.getOpcode() != AArch64ISD::CSNEG) {
19631 return SDValue();
19632 }
19633 }
19634
19635 if (!LHS.hasOneUse())
19636 return SDValue();
19637
19638 AArch64CC::CondCode AArch64CC =
19639 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
19640
19641 // The CSEL should have a constant one operand, and the CSNEG should have
19642 // a one or negative-one operand.
19643 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
19644 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
19645 if (!CTVal || !CFVal)
19646 return SDValue();
19647
19648 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
19649 (CTVal->isOne() || CFVal->isOne())) &&
19650 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
19651 (CTVal->isOne() || CFVal->isAllOnes())))
19652 return SDValue();
19653
19654 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
19655 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
19656 !CFVal->isOne()) {
19657 std::swap(CTVal, CFVal);
19658 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19659 }
19660
19661 SDLoc DL(N);
19662 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
19663 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
19664 !CFVal->isAllOnes()) {
19665 APInt C = -1 * CFVal->getAPIntValue();
19666 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
19667 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
19668 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19669 }
19670
19671 // It might be neutral for larger constants, as the immediate needs to be
19672 // materialized in a register.
19673 APInt ADDC = CTVal->getAPIntValue();
19674 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19675 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
19676 return SDValue();
19677
19678 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
19679 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
19680 "Unexpected constant value");
19681
19682 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
19683 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
19684 SDValue Cmp = LHS.getOperand(3);
19685
19686 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
19687}
19688
19689// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
19691 EVT VT = N->getValueType(0);
19692 if (N->getOpcode() != ISD::ADD)
19693 return SDValue();
19694
19695 SDValue Dot = N->getOperand(0);
19696 SDValue A = N->getOperand(1);
19697 // Handle commutativity.
19698 auto isZeroDot = [](SDValue Dot) {
19699 return (Dot.getOpcode() == AArch64ISD::UDOT ||
19700 Dot.getOpcode() == AArch64ISD::SDOT) &&
19702 };
19703 if (!isZeroDot(Dot))
19704 std::swap(Dot, A);
19705 if (!isZeroDot(Dot))
19706 return SDValue();
19707
19708 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
19709 Dot.getOperand(2));
19710}
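// Illustrative sketch (not from the original file): a scalar model of one
// accumulator lane of UDOT. Because the accumulator is purely additive,
// dot(0, x, y) + a == dot(a, x, y), which is the identity this combine uses.
static unsigned dotAccumulateLane(unsigned Acc, const uint8_t X[4],
                                  const uint8_t Y[4]) {
  for (int I = 0; I < 4; ++I)
    Acc += static_cast<unsigned>(X[I]) * Y[I];
  return Acc;
}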
19711
19713 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
19714}
19715
19717 SDLoc DL(Op);
19718 EVT VT = Op.getValueType();
19719 SDValue Zero = DAG.getConstant(0, DL, VT);
19720 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
19721}
19722
19723// Try to fold
19724//
19725// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
19726//
19727// The folding helps csel to be matched with csneg without generating
19728// redundant neg instruction, which includes negation of the csel expansion
19729// of abs node lowered by lowerABS.
19731 if (!isNegatedInteger(SDValue(N, 0)))
19732 return SDValue();
19733
19734 SDValue CSel = N->getOperand(1);
19735 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
19736 return SDValue();
19737
19738 SDValue N0 = CSel.getOperand(0);
19739 SDValue N1 = CSel.getOperand(1);
19740
19741 // If neither of them is a negation, the fold is not worthwhile, as it
19742 // would introduce two additional negations while removing only one.
19743 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
19744 return SDValue();
19745
19746 SDValue N0N = getNegatedInteger(N0, DAG);
19747 SDValue N1N = getNegatedInteger(N1, DAG);
19748
19749 SDLoc DL(N);
19750 EVT VT = CSel.getValueType();
19751 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
19752 CSel.getOperand(3));
19753}
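// Illustrative sketch (not from the original file): negation distributes over
// a select, which is the identity behind rewriting (neg (csel X, Y)) as
// (csel (neg X), (neg Y)) so the CSEL can be matched together with CSNEG.
static long negateSelect(bool Cond, long X, long Y) {
  return -(Cond ? X : Y); // == (Cond ? -X : -Y)
}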
19754
19755// The basic add/sub long vector instructions have variants with "2" on the end
19756// which act on the high-half of their inputs. They are normally matched by
19757// patterns like:
19758//
19759// (add (zeroext (extract_high LHS)),
19760// (zeroext (extract_high RHS)))
19761// -> uaddl2 vD, vN, vM
19762//
19763// However, if one of the extracts is something like a duplicate, this
19764// instruction can still be used profitably. This function puts the DAG into a
19765// more appropriate form for those patterns to trigger.
19768 SelectionDAG &DAG = DCI.DAG;
19769 if (DCI.isBeforeLegalizeOps())
19770 return SDValue();
19771
19772 MVT VT = N->getSimpleValueType(0);
19773 if (!VT.is128BitVector()) {
19774 if (N->getOpcode() == ISD::ADD)
19775 return performSetccAddFolding(N, DAG);
19776 return SDValue();
19777 }
19778
19779 // Make sure both branches are extended in the same way.
19780 SDValue LHS = N->getOperand(0);
19781 SDValue RHS = N->getOperand(1);
19782 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
19783 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
19784 LHS.getOpcode() != RHS.getOpcode())
19785 return SDValue();
19786
19787 unsigned ExtType = LHS.getOpcode();
19788
19789 // It's not worth doing if at least one of the inputs isn't already an
19790 // extract, but we don't know which it'll be so we have to try both.
19791 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
19792 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
19793 if (!RHS.getNode())
19794 return SDValue();
19795
19796 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
19797 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
19798 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
19799 if (!LHS.getNode())
19800 return SDValue();
19801
19802 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
19803 }
19804
19805 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
19806}
19807
19808static bool isCMP(SDValue Op) {
19809 return Op.getOpcode() == AArch64ISD::SUBS &&
19810 !Op.getNode()->hasAnyUseOfValue(0);
19811}
19812
19813// (CSEL 1 0 CC Cond) => CC
19814// (CSEL 0 1 CC Cond) => !CC
19815static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
19816 if (Op.getOpcode() != AArch64ISD::CSEL)
19817 return std::nullopt;
19818 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19819 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
19820 return std::nullopt;
19821 SDValue OpLHS = Op.getOperand(0);
19822 SDValue OpRHS = Op.getOperand(1);
19823 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
19824 return CC;
19825 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
19826 return getInvertedCondCode(CC);
19827
19828 return std::nullopt;
19829}
19830
19831// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
19832// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
19833static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
19834 SDValue CmpOp = Op->getOperand(2);
19835 if (!isCMP(CmpOp))
19836 return SDValue();
19837
19838 if (IsAdd) {
19839 if (!isOneConstant(CmpOp.getOperand(1)))
19840 return SDValue();
19841 } else {
19842 if (!isNullConstant(CmpOp.getOperand(0)))
19843 return SDValue();
19844 }
19845
19846 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
19847 auto CC = getCSETCondCode(CsetOp);
19848 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
19849 return SDValue();
19850
19851 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
19852 Op->getOperand(0), Op->getOperand(1),
19853 CsetOp.getOperand(3));
19854}
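// Illustrative sketch, not from the original source: the flag-level identity
// the fold above uses. Materialising the carry with CSET and then comparing
// the result against 1 just reproduces the original carry, so the ADC/SBC can
// consume the original carry directly. Helper names are hypothetical.
namespace overflow_check_sketch {
constexpr bool cset(bool CC) { return CC; }               // CSET HS, carry
constexpr bool hsAfterCmp(unsigned X, unsigned Y) {       // SUBS sets HS iff X >= Y (unsigned)
  return X >= Y;
}
constexpr bool roundTrip(bool Carry) {
  return hsAfterCmp(cset(Carry) ? 1u : 0u, 1u);           // CMP (CSET HS carry), 1
}
static_assert(roundTrip(true) && !roundTrip(false), "");
} // namespace overflow_check_sketch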
19855
19856// (ADC x 0 cond) => (CINC x HS cond)
19858 SDValue LHS = N->getOperand(0);
19859 SDValue RHS = N->getOperand(1);
19860 SDValue Cond = N->getOperand(2);
19861
19862 if (!isNullConstant(RHS))
19863 return SDValue();
19864
19865 EVT VT = N->getValueType(0);
19866 SDLoc DL(N);
19867
19868 // (CINC x cc cond) <=> (CSINC x x !cc cond)
19869 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
19870 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
19871}
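// Illustrative sketch, not from the original source: why (ADC x 0 cond) can be
// rewritten as a conditional increment. ADC adds the carry flag, so with a zero
// second operand it reduces to "x plus carry", which is exactly what CINC
// (an alias of CSINC x, x, !cc) computes. Helper names are hypothetical.
#include <cstdint>
namespace adc_cinc_sketch {
constexpr uint64_t adc(uint64_t X, uint64_t Y, bool Carry) {
  return X + Y + (Carry ? 1 : 0);
}
constexpr uint64_t cinc(uint64_t X, bool Cond) { return Cond ? X + 1 : X; }
static_assert(adc(41, 0, true) == cinc(41, true), "");
static_assert(adc(41, 0, false) == cinc(41, false), "");
} // namespace adc_cinc_sketch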
19872
19875 SelectionDAG &DAG) {
19876 SDLoc DL(N);
19877 EVT VT = N->getValueType(0);
19878
19880 (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
19881 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
19882 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
19883 if (Elt0->getOpcode() == ISD::FP_ROUND &&
19884 Elt1->getOpcode() == ISD::FP_ROUND &&
19885 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19886 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19887 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
19888 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19889 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19890 // Constant index.
19891 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19892 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19893 Elt0->getOperand(0)->getOperand(0) ==
19894 Elt1->getOperand(0)->getOperand(0) &&
19895 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
19896 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
19897 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
19898 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
19899 SDValue HighLanes;
19900 if (Elt2->getOpcode() == ISD::UNDEF &&
19901 Elt3->getOpcode() == ISD::UNDEF) {
19902 HighLanes = DAG.getUNDEF(MVT::v2f32);
19903 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
19904 Elt3->getOpcode() == ISD::FP_ROUND &&
19905 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
19906 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
19907 Elt2->getConstantOperandVal(1) ==
19908 Elt3->getConstantOperandVal(1) &&
19909 Elt2->getOperand(0)->getOpcode() ==
19910 ISD::EXTRACT_VECTOR_ELT &&
19911 Elt3->getOperand(0)->getOpcode() ==
19912 ISD::EXTRACT_VECTOR_ELT &&
19913 // Constant index.
19914 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
19915 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
19916 Elt2->getOperand(0)->getOperand(0) ==
19917 Elt3->getOperand(0)->getOperand(0) &&
19918 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
19919 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
19920 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
19921 HighLanes =
19922 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
19923 }
19924 if (HighLanes) {
19925 SDValue DoubleToSingleSticky =
19926 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
19927 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
19928 DoubleToSingleSticky, HighLanes);
19929 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
19930 Elt0->getOperand(1));
19931 }
19932 }
19933 }
19934 }
19935
19936 if (VT == MVT::v2f64) {
19937 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19938 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
19939 Elt1->getOpcode() == ISD::FP_EXTEND &&
19940 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19941 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19942 Elt0->getOperand(0)->getOperand(0) ==
19943 Elt1->getOperand(0)->getOperand(0) &&
19944 // Constant index.
19945 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19946 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19947 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
19948 Elt1->getOperand(0)->getConstantOperandVal(1) &&
19949 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19950 // ResultType's known minimum vector length.
19951 Elt0->getOperand(0)->getConstantOperandVal(1) %
19953 0) {
19954 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
19955 if (SrcVec.getValueType() == MVT::v4f16 ||
19956 SrcVec.getValueType() == MVT::v4bf16) {
19957 SDValue HalfToSingle =
19958 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
19959 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
19960 SDValue Extract = DAG.getNode(
19962 HalfToSingle, SubvectorIdx);
19963 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
19964 }
19965 }
19966 }
19967
19968 // A build vector of two extracted elements is equivalent to an
19969 // extract subvector where the inner vector is any-extended to the
19970 // extract_vector_elt VT.
19971 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
19972 // (extract_elt_iXX_to_i32 vec Idx+1))
19973 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
19974
19975 // For now, only consider the v2i32 case, which arises as a result of
19976 // legalization.
19977 if (VT != MVT::v2i32)
19978 return SDValue();
19979
19980 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19981 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
19982 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19983 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19984 // Constant index.
19985 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19986 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19987 // Both EXTRACT_VECTOR_ELT from same vector...
19988 Elt0->getOperand(0) == Elt1->getOperand(0) &&
19989 // ... and contiguous. First element's index +1 == second element's index.
19990 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
19991 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19992 // ResultType's known minimum vector length.
19993 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
19994 SDValue VecToExtend = Elt0->getOperand(0);
19995 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
19996 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
19997 return SDValue();
19998
19999 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
20000
20001 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
20002 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
20003 SubvectorIdx);
20004 }
20005
20006 return SDValue();
20007}
20008
20010 SelectionDAG &DAG) {
20011 EVT VT = N->getValueType(0);
20012 SDValue N0 = N->getOperand(0);
20013 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
20014 N0.getOpcode() == AArch64ISD::DUP) {
20015 SDValue Op = N0.getOperand(0);
20016 if (VT.getScalarType() == MVT::i32 &&
20017 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
20018 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
20019 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
20020 }
20021
20022 return SDValue();
20023}
20024
20025// Check whether a node is an extend or shift operand.
20027 unsigned Opcode = N.getOpcode();
20028 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
20029 EVT SrcVT;
20030 if (Opcode == ISD::SIGN_EXTEND_INREG)
20031 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
20032 else
20033 SrcVT = N.getOperand(0).getValueType();
20034
20035 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
20036 } else if (Opcode == ISD::AND) {
20037 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
20038 if (!CSD)
20039 return false;
20040 uint64_t AndMask = CSD->getZExtValue();
20041 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
20042 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
20043 return isa<ConstantSDNode>(N.getOperand(1));
20044 }
20045
20046 return false;
20047}
20048
20049// (N - Y) + Z --> (Z - Y) + N
20050// when N is an extend or shift operand
20052 SelectionDAG &DAG) {
20053 auto IsOneUseExtend = [](SDValue N) {
20054 return N.hasOneUse() && isExtendOrShiftOperand(N);
20055 };
20056
20057 // DAGCombiner will revert the combination when Z is constant, causing an
20058 // infinite loop, so don't enable the combination when Z is constant.
20059 // Likewise, if Z is a one-use extend or shift, we can't do the optimization
20060 // either, as it would fall into the same infinite loop.
20061 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
20062 return SDValue();
20063
20064 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
20065 return SDValue();
20066
20067 SDValue Shift = SUB.getOperand(0);
20068 if (!IsOneUseExtend(Shift))
20069 return SDValue();
20070
20071 SDLoc DL(N);
20072 EVT VT = N->getValueType(0);
20073
20074 SDValue Y = SUB.getOperand(1);
20075 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
20076 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
20077}
20078
20080 SelectionDAG &DAG) {
20081 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
20082 // commutative.
20083 if (N->getOpcode() != ISD::ADD)
20084 return SDValue();
20085
20086 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
20087 // shifted register is only available for i32 and i64.
20088 EVT VT = N->getValueType(0);
20089 if (VT != MVT::i32 && VT != MVT::i64)
20090 return SDValue();
20091
20092 SDLoc DL(N);
20093 SDValue LHS = N->getOperand(0);
20094 SDValue RHS = N->getOperand(1);
20095
20096 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
20097 return Val;
20098 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
20099 return Val;
20100
20101 uint64_t LHSImm = 0, RHSImm = 0;
20102 // If both operands are shifted by an immediate and the shift amount is not
20103 // greater than 4 for one operand, swap LHS and RHS to put the operand with
20104 // the smaller shift amount on the RHS.
20105 //
20106 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
20107 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
20108 // with LSL (shift > 4). For the remaining processors, the swap is a no-op
20109 // for both performance and correctness.
20110 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
20111 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
20112 RHSImm > 4 && LHS.hasOneUse())
20113 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
20114
20115 return SDValue();
20116}
20117
20118// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
20119// This reassociates it back to allow the creation of more mls instructions.
20121 if (N->getOpcode() != ISD::SUB)
20122 return SDValue();
20123
20124 SDValue Add = N->getOperand(1);
20125 SDValue X = N->getOperand(0);
20126 if (Add.getOpcode() != ISD::ADD)
20127 return SDValue();
20128
20129 if (!Add.hasOneUse())
20130 return SDValue();
20132 return SDValue();
20133
20134 SDValue M1 = Add.getOperand(0);
20135 SDValue M2 = Add.getOperand(1);
20136 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
20137 M1.getOpcode() != AArch64ISD::UMULL)
20138 return SDValue();
20139 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
20141 return SDValue();
20142
20143 EVT VT = N->getValueType(0);
20144 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
20145 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
20146}
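// Illustrative sketch, not from the original source: the reassociation undone
// above is a plain algebraic identity over wrapping arithmetic, so it is safe
// to turn x - (m1 + m2) back into (x - m1) - m2 and expose two MLS
// opportunities. The helper name is hypothetical.
#include <cstdint>
namespace mls_reassoc_sketch {
constexpr bool sameResult(uint32_t X, uint32_t M1, uint32_t M2) {
  return X - (M1 + M2) == (X - M1) - M2; // unsigned ops wrap modulo 2^32
}
static_assert(sameResult(10u, 3u, 4u), "");
static_assert(sameResult(0u, 0xffffffffu, 7u), ""); // wraps, still equal
} // namespace mls_reassoc_sketch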
20147
20148// Combine into mla/mls.
20149// This works on the patterns of:
20150// add v1, (mul v2, v3)
20151// sub v1, (mul v2, v3)
20152// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
20153// It will transform the add/sub to a scalable version, so that we can
20154// make use of the SVE MLA/MLS instructions that will be generated for that pattern.
20155static SDValue
20157 SelectionDAG &DAG = DCI.DAG;
20158 // Make sure that the types are legal
20159 if (!DCI.isAfterLegalizeDAG())
20160 return SDValue();
20161 // Before using SVE's features, check first if it's available.
20162 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
20163 return SDValue();
20164
20165 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
20166 return SDValue();
20167
20168 if (!N->getValueType(0).isFixedLengthVector())
20169 return SDValue();
20170
20171 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
20172 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
20173 return SDValue();
20174
20175 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
20176 return SDValue();
20177
20178 SDValue MulValue = Op1->getOperand(0);
20179 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
20180 return SDValue();
20181
20182 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
20183 return SDValue();
20184
20185 EVT ScalableVT = MulValue.getValueType();
20186 if (!ScalableVT.isScalableVector())
20187 return SDValue();
20188
20189 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
20190 SDValue NewValue =
20191 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
20192 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
20193 };
20194
20195 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
20196 return res;
20197 else if (N->getOpcode() == ISD::ADD)
20198 return performOpt(N->getOperand(1), N->getOperand(0));
20199
20200 return SDValue();
20201}
20202
20203// Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
20204// help, for example, to produce ssra from sshr+add.
20206 EVT VT = N->getValueType(0);
20207 if (VT != MVT::i64 ||
20208 DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64))
20209 return SDValue();
20210 SDValue Op0 = N->getOperand(0);
20211 SDValue Op1 = N->getOperand(1);
20212
20213 // At least one of the operands should be an extract, and the other should be
20214 // something that is easy to convert to v1i64 type (in this case a load).
20215 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
20216 Op0.getOpcode() != ISD::LOAD)
20217 return SDValue();
20218 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
20219 Op1.getOpcode() != ISD::LOAD)
20220 return SDValue();
20221
20222 SDLoc DL(N);
20223 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20224 Op0.getOperand(0).getValueType() == MVT::v1i64) {
20225 Op0 = Op0.getOperand(0);
20226 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
20227 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20228 Op1.getOperand(0).getValueType() == MVT::v1i64) {
20229 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
20230 Op1 = Op1.getOperand(0);
20231 } else
20232 return SDValue();
20233
20234 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
20235 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
20236 DAG.getConstant(0, DL, MVT::i64));
20237}
20238
20241 if (!BV->hasOneUse())
20242 return false;
20243 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
20244 if (!Ld || !Ld->isSimple())
20245 return false;
20246 Loads.push_back(Ld);
20247 return true;
20248 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
20250 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
20251 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
20252 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
20253 return false;
20254 Loads.push_back(Ld);
20255 }
20256 return true;
20257 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
20258 // Try to find a tree of shuffles and concats from how IR shuffles of loads
20259 // are lowered. Note that this only comes up because we do not always visit
20260 // operands before uses. Once that is fixed this can be removed; in the
20261 // meantime it is fairly specific to the lowering we expect from IR.
20262 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
20263 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
20264 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
20265 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
20266 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
20267 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
20268 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
20269 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
20270 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
20271 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
20272 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
20273 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
20274 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
20275 B.getOperand(1).getNumOperands() != 4)
20276 return false;
20277 auto SV1 = cast<ShuffleVectorSDNode>(B);
20278 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
20279 int NumElts = B.getValueType().getVectorNumElements();
20280 int NumSubElts = NumElts / 4;
20281 for (int I = 0; I < NumSubElts; I++) {
20282 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
20283 if (SV1->getMaskElt(I) != I ||
20284 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
20285 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
20286 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
20287 return false;
20288 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
20289 if (SV2->getMaskElt(I) != I ||
20290 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
20291 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
20292 return false;
20293 }
20294 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
20295 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
20296 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
20297 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
20298 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
20299 !Ld2->isSimple() || !Ld3->isSimple())
20300 return false;
20301 Loads.push_back(Ld0);
20302 Loads.push_back(Ld1);
20303 Loads.push_back(Ld2);
20304 Loads.push_back(Ld3);
20305 return true;
20306 }
20307 return false;
20308}
20309
20311 SelectionDAG &DAG,
20312 unsigned &NumSubLoads) {
20313 if (!Op0.hasOneUse() || !Op1.hasOneUse())
20314 return false;
20315
20316 SmallVector<LoadSDNode *> Loads0, Loads1;
20317 if (isLoadOrMultipleLoads(Op0, Loads0) &&
20318 isLoadOrMultipleLoads(Op1, Loads1)) {
20319 if (NumSubLoads && Loads0.size() != NumSubLoads)
20320 return false;
20321 NumSubLoads = Loads0.size();
20322 return Loads0.size() == Loads1.size() &&
20323 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
20324 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
20325 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
20326 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
20327 Size / 8, 1);
20328 });
20329 }
20330
20331 if (Op0.getOpcode() != Op1.getOpcode())
20332 return false;
20333
20334 switch (Op0.getOpcode()) {
20335 case ISD::ADD:
20336 case ISD::SUB:
20338 DAG, NumSubLoads) &&
20340 DAG, NumSubLoads);
20341 case ISD::SIGN_EXTEND:
20342 case ISD::ANY_EXTEND:
20343 case ISD::ZERO_EXTEND:
20344 EVT XVT = Op0.getOperand(0).getValueType();
20345 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
20346 XVT.getScalarSizeInBits() != 32)
20347 return false;
20349 DAG, NumSubLoads);
20350 }
20351 return false;
20352}
20353
20354// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
20355// into a single load of twice the size, from which we extract the bottom and
20356// top parts so that the shl can use a shll2 instruction. The two loads in that
20357// example can also be larger trees of instructions, which are identical except
20358// for the leaves which are all loads offset from the LHS, including
20359// buildvectors of multiple loads. For example the RHS tree could be
20360// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))
20361// Whilst it can be common for the larger loads to replace LDP instructions
20362// (which doesn't gain anything on its own), the larger loads can help create
20363// more efficient code, and in buildvectors prevent the need for ld1 lane
20364// inserts which can be slower than normal loads.
20366 EVT VT = N->getValueType(0);
20367 if (!VT.isFixedLengthVector() ||
20368 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
20369 VT.getScalarSizeInBits() != 64))
20370 return SDValue();
20371
20372 SDValue Other = N->getOperand(0);
20373 SDValue Shift = N->getOperand(1);
20374 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
20375 std::swap(Shift, Other);
20376 APInt ShiftAmt;
20377 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
20378 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
20379 return SDValue();
20380
20381 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
20382 !ISD::isExtOpcode(Other.getOpcode()) ||
20383 Shift.getOperand(0).getOperand(0).getValueType() !=
20384 Other.getOperand(0).getValueType() ||
20385 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
20386 return SDValue();
20387
20388 SDValue Op0 = Other.getOperand(0);
20389 SDValue Op1 = Shift.getOperand(0).getOperand(0);
20390
20391 unsigned NumSubLoads = 0;
20392 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
20393 return SDValue();
20394
20395 // Attempt to rule out some unprofitable cases using heuristics (some working
20396 // around suboptimal code generation), notably if the extend would not be able
20397 // to use ushll2 instructions because the types are not large enough. Otherwise
20398 // zips will need to be created, which can increase the instruction count.
20399 unsigned NumElts = Op0.getValueType().getVectorNumElements();
20400 unsigned NumSubElts = NumElts / NumSubLoads;
20401 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
20402 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
20403 Op0.getValueType().getSizeInBits() < 128 &&
20405 return SDValue();
20406
20407 // Recreate the tree with the new combined loads.
20408 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
20409 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
20410 EVT DVT =
20412
20413 SmallVector<LoadSDNode *> Loads0, Loads1;
20414 if (isLoadOrMultipleLoads(Op0, Loads0) &&
20415 isLoadOrMultipleLoads(Op1, Loads1)) {
20416 EVT LoadVT = EVT::getVectorVT(
20417 *DAG.getContext(), Op0.getValueType().getScalarType(),
20418 Op0.getValueType().getVectorNumElements() / Loads0.size());
20419 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
20420
20421 SmallVector<SDValue> NewLoads;
20422 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
20423 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
20424 L0->getBasePtr(), L0->getPointerInfo(),
20425 L0->getOriginalAlign());
20426 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
20427 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
20428 NewLoads.push_back(Load);
20429 }
20430 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
20431 }
20432
20434 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
20435 Ops.push_back(GenCombinedTree(O0, O1, DAG));
20436 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
20437 };
20438 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
20439
20440 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
20441 int Hi = NumSubElts, Lo = 0;
20442 for (unsigned i = 0; i < NumSubLoads; i++) {
20443 for (unsigned j = 0; j < NumSubElts; j++) {
20444 LowMask[i * NumSubElts + j] = Lo++;
20445 HighMask[i * NumSubElts + j] = Hi++;
20446 }
20447 Lo += NumSubElts;
20448 Hi += NumSubElts;
20449 }
20450 SDLoc DL(N);
20451 SDValue Ext0, Ext1;
20452 // Extract the top and bottom lanes, then extend the result. If the two
20453 // operands match, instead extend the result and then extract the lanes, as
20454 // that produces slightly smaller code.
20455 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
20457 NewOp, DAG.getConstant(0, DL, MVT::i64));
20458 SDValue SubH =
20459 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
20460 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20461 SDValue Extr0 =
20462 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
20463 SDValue Extr1 =
20464 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
20465 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
20466 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
20467 } else {
20469 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
20470 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20471 DAG.getConstant(0, DL, MVT::i64));
20472 SDValue SubH =
20473 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20474 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20475 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
20476 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
20477 }
20478 SDValue NShift =
20479 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
20480 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
20481}
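// Illustrative sketch, not from the original source: the memory-level fact the
// combine above relies on. One load of twice the width covers the same bytes
// as two adjacent narrower loads, so the "p" and "p+4" values can be peeled
// off as the low and high halves afterwards. Shown for a little-endian host,
// which is an assumption of this sketch; names are hypothetical.
#include <cstdint>
#include <cstring>
namespace wide_load_sketch {
inline void loadPair(const uint8_t *P, uint32_t &Lo, uint32_t &Hi) {
  uint64_t Wide;
  std::memcpy(&Wide, P, sizeof(Wide));    // one 8-byte load
  Lo = static_cast<uint32_t>(Wide);       // bytes P[0..3], i.e. the load at p
  Hi = static_cast<uint32_t>(Wide >> 32); // bytes P[4..7], i.e. the load at p+4
}
} // namespace wide_load_sketch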
20482
20485 // Try to change sum of two reductions.
20486 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
20487 return Val;
20488 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
20489 return Val;
20490 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
20491 return Val;
20492 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
20493 return Val;
20494 if (SDValue Val = performVectorExtCombine(N, DCI.DAG))
20495 return Val;
20497 return Val;
20498 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
20499 return Val;
20500 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
20501 return Val;
20502 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
20503 return Val;
20504
20505 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
20506 return Val;
20507
20508 return performAddSubLongCombine(N, DCI);
20509}
20510
20511// Massage DAGs which we can use the high-half "long" operations on into
20512// something isel will recognize better. E.g.
20513//
20514// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
20515// (aarch64_neon_umull (extract_high (v2i64 vec)))
20516// (extract_high (v2i64 (dup128 scalar)))))
20517//
20520 SelectionDAG &DAG) {
20521 if (DCI.isBeforeLegalizeOps())
20522 return SDValue();
20523
20524 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
20525 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
20526 assert(LHS.getValueType().is64BitVector() &&
20527 RHS.getValueType().is64BitVector() &&
20528 "unexpected shape for long operation");
20529
20530 // Either node could be a DUP, but it's not worth doing both of them (you
20531 // might as well use the non-high version), so look for a corresponding extract
20532 // operation on the other "wing".
20535 if (!RHS.getNode())
20536 return SDValue();
20539 if (!LHS.getNode())
20540 return SDValue();
20541 } else
20542 return SDValue();
20543
20544 if (IID == Intrinsic::not_intrinsic)
20545 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
20546
20547 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
20548 N->getOperand(0), LHS, RHS);
20549}
20550
20551static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
20552 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
20553 unsigned ElemBits = ElemTy.getSizeInBits();
20554
20555 int64_t ShiftAmount;
20556 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
20557 APInt SplatValue, SplatUndef;
20558 unsigned SplatBitSize;
20559 bool HasAnyUndefs;
20560 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
20561 HasAnyUndefs, ElemBits) ||
20562 SplatBitSize != ElemBits)
20563 return SDValue();
20564
20565 ShiftAmount = SplatValue.getSExtValue();
20566 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
20567 ShiftAmount = CVN->getSExtValue();
20568 } else
20569 return SDValue();
20570
20571 // If the shift amount is zero, remove the shift intrinsic.
20572 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
20573 return N->getOperand(1);
20574
20575 unsigned Opcode;
20576 bool IsRightShift;
20577 switch (IID) {
20578 default:
20579 llvm_unreachable("Unknown shift intrinsic");
20580 case Intrinsic::aarch64_neon_sqshl:
20581 Opcode = AArch64ISD::SQSHL_I;
20582 IsRightShift = false;
20583 break;
20584 case Intrinsic::aarch64_neon_uqshl:
20585 Opcode = AArch64ISD::UQSHL_I;
20586 IsRightShift = false;
20587 break;
20588 case Intrinsic::aarch64_neon_srshl:
20589 Opcode = AArch64ISD::SRSHR_I;
20590 IsRightShift = true;
20591 break;
20592 case Intrinsic::aarch64_neon_urshl:
20593 Opcode = AArch64ISD::URSHR_I;
20594 IsRightShift = true;
20595 break;
20596 case Intrinsic::aarch64_neon_sqshlu:
20597 Opcode = AArch64ISD::SQSHLU_I;
20598 IsRightShift = false;
20599 break;
20600 case Intrinsic::aarch64_neon_sshl:
20601 case Intrinsic::aarch64_neon_ushl:
20602 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
20603 // left shift for positive shift amounts. For negative shifts we can use a
20604 // VASHR/VLSHR as appropriate.
20605 if (ShiftAmount < 0) {
20606 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
20608 ShiftAmount = -ShiftAmount;
20609 } else
20610 Opcode = AArch64ISD::VSHL;
20611 IsRightShift = false;
20612 break;
20613 }
20614
20615 EVT VT = N->getValueType(0);
20616 SDValue Op = N->getOperand(1);
20617 SDLoc dl(N);
20618 if (VT == MVT::i64) {
20619 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
20620 VT = MVT::v1i64;
20621 }
20622
20623 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
20624 Op = DAG.getNode(Opcode, dl, VT, Op,
20625 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
20626 if (N->getValueType(0) == MVT::i64)
20627 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20628 DAG.getConstant(0, dl, MVT::i64));
20629 return Op;
20630 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
20631 Op = DAG.getNode(Opcode, dl, VT, Op,
20632 DAG.getConstant(ShiftAmount, dl, MVT::i32));
20633 if (N->getValueType(0) == MVT::i64)
20634 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20635 DAG.getConstant(0, dl, MVT::i64));
20636 return Op;
20637 }
20638
20639 return SDValue();
20640}
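// Illustrative sketch, not from the original source: the scalar behaviour of
// sshl/ushl with a constant amount that the rewrite above depends on. A
// positive amount is a left shift; a negative amount shifts right (arithmetic
// for the signed form, logical for the unsigned form). Helper names are
// hypothetical, and the sketch assumes the usual arithmetic right shift of
// negative values provided by mainstream compilers.
#include <cstdint>
namespace sshl_sketch {
constexpr int64_t sshl(int64_t X, int64_t Amount) {
  return Amount >= 0 ? X << Amount : X >> -Amount;
}
constexpr uint64_t ushl(uint64_t X, int64_t Amount) {
  return Amount >= 0 ? X << Amount : X >> -Amount;
}
static_assert(sshl(3, 2) == 12, "");    // VSHL by 2
static_assert(sshl(-32, -3) == -4, ""); // VASHR by 3
static_assert(ushl(32, -3) == 4, "");   // VLSHR by 3
} // namespace sshl_sketch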
20641
20642// The CRC32[BH] instructions ignore the high bits of their data operand. Since
20643// the intrinsics must be legal and take an i32, this means there's almost
20644// certainly going to be a zext in the DAG which we can eliminate.
20645static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
20646 SDValue AndN = N->getOperand(2);
20647 if (AndN.getOpcode() != ISD::AND)
20648 return SDValue();
20649
20650 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
20651 if (!CMask || CMask->getZExtValue() != Mask)
20652 return SDValue();
20653
20654 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
20655 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
20656}
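// Illustrative sketch, not from the original source: why the surrounding AND
// can be dropped. A byte-wide CRC32 update only ever consumes the low 8 bits
// of its data operand, so masking with 0xff first cannot change the result.
// This models a generic software CRC-32 byte step, not the hardware
// instruction itself; names are hypothetical.
#include <cstdint>
namespace crc_sketch {
constexpr uint32_t crc32Byte(uint32_t Crc, uint32_t Data) {
  Crc ^= (Data & 0xff); // only the low byte participates
  for (int I = 0; I < 8; ++I)
    Crc = (Crc >> 1) ^ ((Crc & 1) ? 0xEDB88320u : 0u);
  return Crc;
}
static_assert(crc32Byte(0u, 0x12345678u) == crc32Byte(0u, 0x78u), "");
} // namespace crc_sketch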
20657
20659 SelectionDAG &DAG) {
20660 SDLoc dl(N);
20661 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
20662 DAG.getNode(Opc, dl,
20663 N->getOperand(1).getSimpleValueType(),
20664 N->getOperand(1)),
20665 DAG.getConstant(0, dl, MVT::i64));
20666}
20667
20669 SDLoc DL(N);
20670 SDValue Op1 = N->getOperand(1);
20671 SDValue Op2 = N->getOperand(2);
20672 EVT ScalarTy = Op2.getValueType();
20673 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20674 ScalarTy = MVT::i32;
20675
20676 // Lower index_vector(base, step) to mul(step_vector(1), splat(step)) + splat(base).
20677 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
20678 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
20679 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
20680 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
20681 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
20682}
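// Illustrative sketch, not from the original source: the per-lane arithmetic
// produced by the lowering above. index_vector(base, step) places
// base + i * step in lane i, which is step_vector(1) * splat(step) +
// splat(base). The fixed element count and names are hypothetical.
#include <array>
#include <cstdint>
namespace index_sketch {
template <std::size_t N>
constexpr std::array<int32_t, N> indexVector(int32_t Base, int32_t Step) {
  std::array<int32_t, N> Out{};
  for (std::size_t I = 0; I < N; ++I)
    Out[I] = Base + static_cast<int32_t>(I) * Step; // splat(base) + step_vector * splat(step)
  return Out;
}
static_assert(indexVector<4>(10, 3)[0] == 10 && indexVector<4>(10, 3)[3] == 19, "");
} // namespace index_sketch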
20683
20685 SDLoc dl(N);
20686 SDValue Scalar = N->getOperand(3);
20687 EVT ScalarTy = Scalar.getValueType();
20688
20689 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20690 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
20691
20692 SDValue Passthru = N->getOperand(1);
20693 SDValue Pred = N->getOperand(2);
20694 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
20695 Pred, Scalar, Passthru);
20696}
20697
20699 SDLoc dl(N);
20700 LLVMContext &Ctx = *DAG.getContext();
20701 EVT VT = N->getValueType(0);
20702
20703 assert(VT.isScalableVector() && "Expected a scalable vector.");
20704
20705 // Current lowering only supports the SVE-ACLE types.
20707 return SDValue();
20708
20709 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
20710 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
20711 EVT ByteVT =
20712 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
20713
20714 // Convert everything to the domain of EXT (i.e. bytes).
20715 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
20716 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
20717 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
20718 DAG.getConstant(ElemSize, dl, MVT::i32));
20719
20720 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
20721 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
20722}
20723
20726 SelectionDAG &DAG) {
20727 if (DCI.isBeforeLegalize())
20728 return SDValue();
20729
20730 SDValue Comparator = N->getOperand(3);
20731 if (Comparator.getOpcode() == AArch64ISD::DUP ||
20732 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
20733 unsigned IID = getIntrinsicID(N);
20734 EVT VT = N->getValueType(0);
20735 EVT CmpVT = N->getOperand(2).getValueType();
20736 SDValue Pred = N->getOperand(1);
20737 SDValue Imm;
20738 SDLoc DL(N);
20739
20740 switch (IID) {
20741 default:
20742 llvm_unreachable("Called with wrong intrinsic!");
20743 break;
20744
20745 // Signed comparisons
20746 case Intrinsic::aarch64_sve_cmpeq_wide:
20747 case Intrinsic::aarch64_sve_cmpne_wide:
20748 case Intrinsic::aarch64_sve_cmpge_wide:
20749 case Intrinsic::aarch64_sve_cmpgt_wide:
20750 case Intrinsic::aarch64_sve_cmplt_wide:
20751 case Intrinsic::aarch64_sve_cmple_wide: {
20752 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20753 int64_t ImmVal = CN->getSExtValue();
20754 if (ImmVal >= -16 && ImmVal <= 15)
20755 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20756 else
20757 return SDValue();
20758 }
20759 break;
20760 }
20761 // Unsigned comparisons
20762 case Intrinsic::aarch64_sve_cmphs_wide:
20763 case Intrinsic::aarch64_sve_cmphi_wide:
20764 case Intrinsic::aarch64_sve_cmplo_wide:
20765 case Intrinsic::aarch64_sve_cmpls_wide: {
20766 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20767 uint64_t ImmVal = CN->getZExtValue();
20768 if (ImmVal <= 127)
20769 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20770 else
20771 return SDValue();
20772 }
20773 break;
20774 }
20775 }
20776
20777 if (!Imm)
20778 return SDValue();
20779
20780 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
20781 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
20782 N->getOperand(2), Splat, DAG.getCondCode(CC));
20783 }
20784
20785 return SDValue();
20786}
20787
20790 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20791
20792 SDLoc DL(Op);
20793 assert(Op.getValueType().isScalableVector() &&
20794 TLI.isTypeLegal(Op.getValueType()) &&
20795 "Expected legal scalable vector type!");
20796 assert(Op.getValueType() == Pg.getValueType() &&
20797 "Expected same type for PTEST operands");
20798
20799 // Ensure target specific opcodes are using legal type.
20800 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
20801 SDValue TVal = DAG.getConstant(1, DL, OutVT);
20802 SDValue FVal = DAG.getConstant(0, DL, OutVT);
20803
20804 // Ensure operands have type nxv16i1.
20805 if (Op.getValueType() != MVT::nxv16i1) {
20808 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
20809 else
20810 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
20811 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
20812 }
20813
20814 // Set condition code (CC) flags.
20815 SDValue Test = DAG.getNode(
20817 DL, MVT::Other, Pg, Op);
20818
20819 // Convert CC to integer based on requested condition.
20820 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
20821 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
20822 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
20823 return DAG.getZExtOrTrunc(Res, DL, VT);
20824}
20825
20827 SelectionDAG &DAG) {
20828 SDLoc DL(N);
20829
20830 SDValue Pred = N->getOperand(1);
20831 SDValue VecToReduce = N->getOperand(2);
20832
20833 // NOTE: The integer reduction's result type is not always linked to the
20834 // operand's element type so we construct it from the intrinsic's result type.
20835 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
20836 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20837
20838 // SVE reductions set the whole vector register with the first element
20839 // containing the reduction result, which we'll now extract.
20840 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20841 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20842 Zero);
20843}
20844
20846 SelectionDAG &DAG) {
20847 SDLoc DL(N);
20848
20849 SDValue Pred = N->getOperand(1);
20850 SDValue VecToReduce = N->getOperand(2);
20851
20852 EVT ReduceVT = VecToReduce.getValueType();
20853 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20854
20855 // SVE reductions set the whole vector register with the first element
20856 // containing the reduction result, which we'll now extract.
20857 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20858 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20859 Zero);
20860}
20861
20863 SelectionDAG &DAG) {
20864 SDLoc DL(N);
20865
20866 SDValue Pred = N->getOperand(1);
20867 SDValue InitVal = N->getOperand(2);
20868 SDValue VecToReduce = N->getOperand(3);
20869 EVT ReduceVT = VecToReduce.getValueType();
20870
20871 // Ordered reductions use the first lane of the result vector as the
20872 // reduction's initial value.
20873 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20874 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
20875 DAG.getUNDEF(ReduceVT), InitVal, Zero);
20876
20877 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
20878
20879 // SVE reductions set the whole vector register with the first element
20880 // containing the reduction result, which we'll now extract.
20881 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20882 Zero);
20883}
20884
20885// If a merged operation has no inactive lanes we can relax it to a predicated
20886// or unpredicated operation, which potentially allows better isel (perhaps
20887// using immediate forms) or relaxing register reuse requirements.
20889 SelectionDAG &DAG, bool UnpredOp = false,
20890 bool SwapOperands = false) {
20891 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
20892 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
20893 SDValue Pg = N->getOperand(1);
20894 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
20895 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
20896
20897 // ISD way to specify an all active predicate.
20898 if (isAllActivePredicate(DAG, Pg)) {
20899 if (UnpredOp)
20900 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
20901
20902 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
20903 }
20904
20905 // FUTURE: SplatVector(true)
20906 return SDValue();
20907}
20908
20911 const AArch64Subtarget *Subtarget) {
20912 if (DCI.isBeforeLegalize())
20913 return SDValue();
20914
20915 if (!Subtarget->hasSVE2p1())
20916 return SDValue();
20917
20918 if (!N->hasNUsesOfValue(2, 0))
20919 return SDValue();
20920
20921 const uint64_t HalfSize = N->getValueType(0).getVectorMinNumElements() / 2;
20922 if (HalfSize < 2)
20923 return SDValue();
20924
20925 auto It = N->use_begin();
20926 SDNode *Lo = *It++;
20927 SDNode *Hi = *It;
20928
20929 if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
20930 Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR)
20931 return SDValue();
20932
20933 uint64_t OffLo = Lo->getConstantOperandVal(1);
20934 uint64_t OffHi = Hi->getConstantOperandVal(1);
20935
20936 if (OffLo > OffHi) {
20937 std::swap(Lo, Hi);
20938 std::swap(OffLo, OffHi);
20939 }
20940
20941 if (OffLo != 0 || OffHi != HalfSize)
20942 return SDValue();
20943
20944 EVT HalfVec = Lo->getValueType(0);
20945 if (HalfVec != Hi->getValueType(0) ||
20946 HalfVec.getVectorElementCount() != ElementCount::getScalable(HalfSize))
20947 return SDValue();
20948
20949 SelectionDAG &DAG = DCI.DAG;
20950 SDLoc DL(N);
20951 SDValue ID =
20952 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
20953 SDValue Idx = N->getOperand(1);
20954 SDValue TC = N->getOperand(2);
20955 if (Idx.getValueType() != MVT::i64) {
20956 Idx = DAG.getZExtOrTrunc(Idx, DL, MVT::i64);
20957 TC = DAG.getZExtOrTrunc(TC, DL, MVT::i64);
20958 }
20959 auto R =
20961 {Lo->getValueType(0), Hi->getValueType(0)}, {ID, Idx, TC});
20962
20963 DCI.CombineTo(Lo, R.getValue(0));
20964 DCI.CombineTo(Hi, R.getValue(1));
20965
20966 return SDValue(N, 0);
20967}
20968
20971 const AArch64Subtarget *Subtarget) {
20972 SelectionDAG &DAG = DCI.DAG;
20973 unsigned IID = getIntrinsicID(N);
20974 switch (IID) {
20975 default:
20976 break;
20977 case Intrinsic::aarch64_neon_vcvtfxs2fp:
20978 case Intrinsic::aarch64_neon_vcvtfxu2fp:
20979 return tryCombineFixedPointConvert(N, DCI, DAG);
20980 case Intrinsic::aarch64_neon_saddv:
20982 case Intrinsic::aarch64_neon_uaddv:
20984 case Intrinsic::aarch64_neon_sminv:
20986 case Intrinsic::aarch64_neon_uminv:
20988 case Intrinsic::aarch64_neon_smaxv:
20990 case Intrinsic::aarch64_neon_umaxv:
20992 case Intrinsic::aarch64_neon_fmax:
20993 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
20994 N->getOperand(1), N->getOperand(2));
20995 case Intrinsic::aarch64_neon_fmin:
20996 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
20997 N->getOperand(1), N->getOperand(2));
20998 case Intrinsic::aarch64_neon_fmaxnm:
20999 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
21000 N->getOperand(1), N->getOperand(2));
21001 case Intrinsic::aarch64_neon_fminnm:
21002 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
21003 N->getOperand(1), N->getOperand(2));
21004 case Intrinsic::aarch64_neon_smull:
21005 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
21006 N->getOperand(1), N->getOperand(2));
21007 case Intrinsic::aarch64_neon_umull:
21008 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
21009 N->getOperand(1), N->getOperand(2));
21010 case Intrinsic::aarch64_neon_pmull:
21011 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
21012 N->getOperand(1), N->getOperand(2));
21013 case Intrinsic::aarch64_neon_sqdmull:
21014 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
21015 case Intrinsic::aarch64_neon_sqshl:
21016 case Intrinsic::aarch64_neon_uqshl:
21017 case Intrinsic::aarch64_neon_sqshlu:
21018 case Intrinsic::aarch64_neon_srshl:
21019 case Intrinsic::aarch64_neon_urshl:
21020 case Intrinsic::aarch64_neon_sshl:
21021 case Intrinsic::aarch64_neon_ushl:
21022 return tryCombineShiftImm(IID, N, DAG);
21023 case Intrinsic::aarch64_neon_sabd:
21024 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
21025 N->getOperand(1), N->getOperand(2));
21026 case Intrinsic::aarch64_neon_uabd:
21027 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
21028 N->getOperand(1), N->getOperand(2));
21029 case Intrinsic::aarch64_crc32b:
21030 case Intrinsic::aarch64_crc32cb:
21031 return tryCombineCRC32(0xff, N, DAG);
21032 case Intrinsic::aarch64_crc32h:
21033 case Intrinsic::aarch64_crc32ch:
21034 return tryCombineCRC32(0xffff, N, DAG);
21035 case Intrinsic::aarch64_sve_saddv:
21036 // There is no i64 version of SADDV because the sign is irrelevant.
21037 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
21039 else
21041 case Intrinsic::aarch64_sve_uaddv:
21043 case Intrinsic::aarch64_sve_smaxv:
21045 case Intrinsic::aarch64_sve_umaxv:
21047 case Intrinsic::aarch64_sve_sminv:
21049 case Intrinsic::aarch64_sve_uminv:
21051 case Intrinsic::aarch64_sve_orv:
21053 case Intrinsic::aarch64_sve_eorv:
21055 case Intrinsic::aarch64_sve_andv:
21057 case Intrinsic::aarch64_sve_index:
21058 return LowerSVEIntrinsicIndex(N, DAG);
21059 case Intrinsic::aarch64_sve_dup:
21060 return LowerSVEIntrinsicDUP(N, DAG);
21061 case Intrinsic::aarch64_sve_dup_x:
21062 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
21063 N->getOperand(1));
21064 case Intrinsic::aarch64_sve_ext:
21065 return LowerSVEIntrinsicEXT(N, DAG);
21066 case Intrinsic::aarch64_sve_mul_u:
21067 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
21068 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21069 case Intrinsic::aarch64_sve_smulh_u:
21070 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
21071 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21072 case Intrinsic::aarch64_sve_umulh_u:
21073 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
21074 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21075 case Intrinsic::aarch64_sve_smin_u:
21076 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
21077 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21078 case Intrinsic::aarch64_sve_umin_u:
21079 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
21080 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21081 case Intrinsic::aarch64_sve_smax_u:
21082 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
21083 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21084 case Intrinsic::aarch64_sve_umax_u:
21085 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
21086 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21087 case Intrinsic::aarch64_sve_lsl_u:
21088 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
21089 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21090 case Intrinsic::aarch64_sve_lsr_u:
21091 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
21092 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21093 case Intrinsic::aarch64_sve_asr_u:
21094 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
21095 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21096 case Intrinsic::aarch64_sve_fadd_u:
21097 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
21098 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21099 case Intrinsic::aarch64_sve_fdiv_u:
21100 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
21101 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21102 case Intrinsic::aarch64_sve_fmax_u:
21103 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
21104 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21105 case Intrinsic::aarch64_sve_fmaxnm_u:
21106 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
21107 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21108 case Intrinsic::aarch64_sve_fmla_u:
21109 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
21110 N->getOperand(1), N->getOperand(3), N->getOperand(4),
21111 N->getOperand(2));
21112 case Intrinsic::aarch64_sve_fmin_u:
21113 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
21114 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21115 case Intrinsic::aarch64_sve_fminnm_u:
21116 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
21117 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21118 case Intrinsic::aarch64_sve_fmul_u:
21119 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
21120 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21121 case Intrinsic::aarch64_sve_fsub_u:
21122 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
21123 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21124 case Intrinsic::aarch64_sve_add_u:
21125 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
21126 N->getOperand(3));
21127 case Intrinsic::aarch64_sve_sub_u:
21128 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
21129 N->getOperand(3));
21130 case Intrinsic::aarch64_sve_subr:
21131 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
21132 case Intrinsic::aarch64_sve_and_u:
21133 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
21134 N->getOperand(3));
21135 case Intrinsic::aarch64_sve_bic_u:
21136 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
21137 N->getOperand(2), N->getOperand(3));
21138 case Intrinsic::aarch64_sve_eor_u:
21139 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
21140 N->getOperand(3));
21141 case Intrinsic::aarch64_sve_orr_u:
21142 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
21143 N->getOperand(3));
21144 case Intrinsic::aarch64_sve_sabd_u:
21145 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
21146 N->getOperand(2), N->getOperand(3));
21147 case Intrinsic::aarch64_sve_uabd_u:
21148 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
21149 N->getOperand(2), N->getOperand(3));
21150 case Intrinsic::aarch64_sve_sdiv_u:
21151 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
21152 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21153 case Intrinsic::aarch64_sve_udiv_u:
21154 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
21155 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21156 case Intrinsic::aarch64_sve_sqadd:
21157 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
21158 case Intrinsic::aarch64_sve_sqsub_u:
21159 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
21160 N->getOperand(2), N->getOperand(3));
21161 case Intrinsic::aarch64_sve_uqadd:
21162 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
21163 case Intrinsic::aarch64_sve_uqsub_u:
21164 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
21165 N->getOperand(2), N->getOperand(3));
21166 case Intrinsic::aarch64_sve_sqadd_x:
21167 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
21168 N->getOperand(1), N->getOperand(2));
21169 case Intrinsic::aarch64_sve_sqsub_x:
21170 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
21171 N->getOperand(1), N->getOperand(2));
21172 case Intrinsic::aarch64_sve_uqadd_x:
21173 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
21174 N->getOperand(1), N->getOperand(2));
21175 case Intrinsic::aarch64_sve_uqsub_x:
21176 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
21177 N->getOperand(1), N->getOperand(2));
21178 case Intrinsic::aarch64_sve_asrd:
21179 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
21180 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21181 case Intrinsic::aarch64_sve_cmphs:
21182 if (!N->getOperand(2).getValueType().isFloatingPoint())
21184 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21185 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
21186 break;
21187 case Intrinsic::aarch64_sve_cmphi:
21188 if (!N->getOperand(2).getValueType().isFloatingPoint())
21190 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21191 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
21192 break;
21193 case Intrinsic::aarch64_sve_fcmpge:
21194 case Intrinsic::aarch64_sve_cmpge:
21196 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21197 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
21198 break;
21199 case Intrinsic::aarch64_sve_fcmpgt:
21200 case Intrinsic::aarch64_sve_cmpgt:
21202 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21203 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
21204 break;
21205 case Intrinsic::aarch64_sve_fcmpeq:
21206 case Intrinsic::aarch64_sve_cmpeq:
21208 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21209 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
21210 break;
21211 case Intrinsic::aarch64_sve_fcmpne:
21212 case Intrinsic::aarch64_sve_cmpne:
21214 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21215 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
21216 break;
21217 case Intrinsic::aarch64_sve_fcmpuo:
21219 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21220 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
21221 break;
21222 case Intrinsic::aarch64_sve_fadda:
21224 case Intrinsic::aarch64_sve_faddv:
21226 case Intrinsic::aarch64_sve_fmaxnmv:
21228 case Intrinsic::aarch64_sve_fmaxv:
21230 case Intrinsic::aarch64_sve_fminnmv:
21232 case Intrinsic::aarch64_sve_fminv:
21234 case Intrinsic::aarch64_sve_sel:
21235 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
21236 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21237 case Intrinsic::aarch64_sve_cmpeq_wide:
21238 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
21239 case Intrinsic::aarch64_sve_cmpne_wide:
21240 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
21241 case Intrinsic::aarch64_sve_cmpge_wide:
21242 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
21243 case Intrinsic::aarch64_sve_cmpgt_wide:
21244 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
21245 case Intrinsic::aarch64_sve_cmplt_wide:
21246 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
21247 case Intrinsic::aarch64_sve_cmple_wide:
21248 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
21249 case Intrinsic::aarch64_sve_cmphs_wide:
21250 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
21251 case Intrinsic::aarch64_sve_cmphi_wide:
21252 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
21253 case Intrinsic::aarch64_sve_cmplo_wide:
21254 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
21255 case Intrinsic::aarch64_sve_cmpls_wide:
21256 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
21257 case Intrinsic::aarch64_sve_ptest_any:
21258 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
21260 case Intrinsic::aarch64_sve_ptest_first:
21261 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
21263 case Intrinsic::aarch64_sve_ptest_last:
21264 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
21266 case Intrinsic::aarch64_sve_whilelo:
21267 return tryCombineWhileLo(N, DCI, Subtarget);
21268 }
21269 return SDValue();
21270}
21271
21272static bool isCheapToExtend(const SDValue &N) {
21273 unsigned OC = N->getOpcode();
21274 return OC == ISD::LOAD || OC == ISD::MLOAD ||
21276}
21277
21278static SDValue
21280 SelectionDAG &DAG) {
21281 // If we have (sext (setcc A B)) and A and B are cheap to extend,
21282 // we can move the sext into the arguments and have the same result. For
21283 // example, if A and B are both loads, we can make those extending loads and
21284 // avoid an extra instruction. This pattern appears often in VLS code
21285 // generation where the inputs to the setcc have a different size to the
21286 // instruction that wants to use the result of the setcc.
21287 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
21288 N->getOperand(0)->getOpcode() == ISD::SETCC);
21289 const SDValue SetCC = N->getOperand(0);
21290
21291 const SDValue CCOp0 = SetCC.getOperand(0);
21292 const SDValue CCOp1 = SetCC.getOperand(1);
21293 if (!CCOp0->getValueType(0).isInteger() ||
21294 !CCOp1->getValueType(0).isInteger())
21295 return SDValue();
21296
21297 ISD::CondCode Code =
21298 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
21299
21300 ISD::NodeType ExtType =
21301 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
21302
21303 if (isCheapToExtend(SetCC.getOperand(0)) &&
21304 isCheapToExtend(SetCC.getOperand(1))) {
21305 const SDValue Ext1 =
21306 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
21307 const SDValue Ext2 =
21308 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
21309
21310 return DAG.getSetCC(
21311 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
21312 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
21313 }
21314
21315 return SDValue();
21316}
21317
21318 static SDValue performExtendCombine(SDNode *N,
21319 TargetLowering::DAGCombinerInfo &DCI,
21320 SelectionDAG &DAG) {
21321 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
21322 // we can convert that DUP into another extract_high (of a bigger DUP), which
21323 // helps the backend to decide that an sabdl2 would be useful, saving a real
21324 // extract_high operation.
21325 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
21326 (N->getOperand(0).getOpcode() == ISD::ABDU ||
21327 N->getOperand(0).getOpcode() == ISD::ABDS)) {
21328 SDNode *ABDNode = N->getOperand(0).getNode();
21329 SDValue NewABD =
21330 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
21331 if (!NewABD.getNode())
21332 return SDValue();
21333
21334 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
21335 }
21336
21337 if (N->getValueType(0).isFixedLengthVector() &&
21338 N->getOpcode() == ISD::SIGN_EXTEND &&
21339 N->getOperand(0)->getOpcode() == ISD::SETCC)
21340 return performSignExtendSetCCCombine(N, DCI, DAG);
21341
21342 return SDValue();
21343}
21344
21345 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
21346 SDValue SplatVal, unsigned NumVecElts) {
21347 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
21348 Align OrigAlignment = St.getAlign();
21349 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
21350
21351 // Create scalar stores. This is at least as good as the code sequence for a
21352 // split unaligned store which is a dup.s, ext.b, and two stores.
21353 // Most of the time the three stores should be replaced by store pair
21354 // instructions (stp).
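// For example (illustrative), splatting x1 into a v2i64 and storing it to
// [x0] becomes two scalar stores,
//   str x1, [x0]
//   str x1, [x0, #8]
// which the load/store optimizer is expected to merge into
//   stp x1, x1, [x0]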
21355 SDLoc DL(&St);
21356 SDValue BasePtr = St.getBasePtr();
21357 uint64_t BaseOffset = 0;
21358
21359 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
21360 SDValue NewST1 =
21361 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
21362 OrigAlignment, St.getMemOperand()->getFlags());
21363
21364 // As this is in ISel, we will not merge this add, which may degrade results.
21365 if (BasePtr->getOpcode() == ISD::ADD &&
21366 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
21367 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
21368 BasePtr = BasePtr->getOperand(0);
21369 }
21370
21371 unsigned Offset = EltOffset;
21372 while (--NumVecElts) {
21373 Align Alignment = commonAlignment(OrigAlignment, Offset);
21374 SDValue OffsetPtr =
21375 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
21376 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
21377 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
21378 PtrInfo.getWithOffset(Offset), Alignment,
21379 St.getMemOperand()->getFlags());
21380 Offset += EltOffset;
21381 }
21382 return NewST1;
21383}
21384
21385// Returns an SVE type that ContentTy can be trivially sign or zero extended
21386// into.
21387static MVT getSVEContainerType(EVT ContentTy) {
21388 assert(ContentTy.isSimple() && "No SVE containers for extended types");
21389
21390 switch (ContentTy.getSimpleVT().SimpleTy) {
21391 default:
21392 llvm_unreachable("No known SVE container for this MVT type");
21393 case MVT::nxv2i8:
21394 case MVT::nxv2i16:
21395 case MVT::nxv2i32:
21396 case MVT::nxv2i64:
21397 case MVT::nxv2f32:
21398 case MVT::nxv2f64:
21399 return MVT::nxv2i64;
21400 case MVT::nxv4i8:
21401 case MVT::nxv4i16:
21402 case MVT::nxv4i32:
21403 case MVT::nxv4f32:
21404 return MVT::nxv4i32;
21405 case MVT::nxv8i8:
21406 case MVT::nxv8i16:
21407 case MVT::nxv8f16:
21408 case MVT::nxv8bf16:
21409 return MVT::nxv8i16;
21410 case MVT::nxv16i8:
21411 return MVT::nxv16i8;
21412 }
21413}
21414
21415static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
21416 SDLoc DL(N);
21417 EVT VT = N->getValueType(0);
21418 
21419 if (VT.getSizeInBits() != N->getOperand(1)->getValueType(0).getSizeInBits())
21420 return SDValue();
21421
21422 EVT ContainerVT = VT;
21423 if (ContainerVT.isInteger())
21424 ContainerVT = getSVEContainerType(ContainerVT);
21425
21426 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
21427 SDValue Ops[] = { N->getOperand(0), // Chain
21428 N->getOperand(2), // Pg
21429 N->getOperand(3), // Base
21430 DAG.getValueType(VT) };
21431
21432 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
21433 SDValue LoadChain = SDValue(Load.getNode(), 1);
21434
21435 if (ContainerVT.isInteger() && (VT != ContainerVT))
21436 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
21437
21438 return DAG.getMergeValues({ Load, LoadChain }, DL);
21439}
21440
21441 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
21442 SDLoc DL(N);
21443 EVT VT = N->getValueType(0);
21444 EVT PtrTy = N->getOperand(3).getValueType();
21445
21446 EVT LoadVT = VT;
21447 if (VT.isFloatingPoint())
21448 LoadVT = VT.changeTypeToInteger();
21449
21450 auto *MINode = cast<MemIntrinsicSDNode>(N);
21451 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
21452 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
21453 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
21454 MINode->getOperand(2), PassThru,
21455 MINode->getMemoryVT(), MINode->getMemOperand(),
21456 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
21457 
21458 if (VT.isFloatingPoint()) {
21459 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
21460 return DAG.getMergeValues(Ops, DL);
21461 }
21462
21463 return L;
21464}
21465
21466 template <unsigned Opcode>
21467 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
21468 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
21469 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
21470 "Unsupported opcode.");
21471 SDLoc DL(N);
21472 EVT VT = N->getValueType(0);
21473
21474 EVT LoadVT = VT;
21475 if (VT.isFloatingPoint())
21476 LoadVT = VT.changeTypeToInteger();
21477
21478 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
21479 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
21480 SDValue LoadChain = SDValue(Load.getNode(), 1);
21481
21482 if (VT.isFloatingPoint())
21483 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
21484
21485 return DAG.getMergeValues({Load, LoadChain}, DL);
21486}
21487
21488 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
21489 SDLoc DL(N);
21490 SDValue Data = N->getOperand(2);
21491 EVT DataVT = Data.getValueType();
21492 EVT HwSrcVt = getSVEContainerType(DataVT);
21493 SDValue InputVT = DAG.getValueType(DataVT);
21494
21495 if (DataVT.isFloatingPoint())
21496 InputVT = DAG.getValueType(HwSrcVt);
21497
21498 SDValue SrcNew;
21499 if (Data.getValueType().isFloatingPoint())
21500 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
21501 else
21502 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
21503
21504 SDValue Ops[] = { N->getOperand(0), // Chain
21505 SrcNew,
21506 N->getOperand(4), // Base
21507 N->getOperand(3), // Pg
21508 InputVT
21509 };
21510
21511 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
21512}
21513
21514 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
21515 SDLoc DL(N);
21516
21517 SDValue Data = N->getOperand(2);
21518 EVT DataVT = Data.getValueType();
21519 EVT PtrTy = N->getOperand(4).getValueType();
21520
21521 if (DataVT.isFloatingPoint())
21522 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
21523 
21524 auto *MINode = cast<MemIntrinsicSDNode>(N);
21525 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
21526 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
21527 MINode->getMemoryVT(), MINode->getMemOperand(),
21528 ISD::UNINDEXED, false, false);
21529}
21530
21531/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
21532/// load store optimizer pass will merge them to store pair stores. This should
21533/// be better than a movi to create the vector zero followed by a vector store
21534 /// if the zero constant is not re-used, since one instruction and one register
21535/// live range will be removed.
21536///
21537/// For example, the final generated code should be:
21538///
21539/// stp xzr, xzr, [x0]
21540///
21541/// instead of:
21542///
21543/// movi v0.2d, #0
21544/// str q0, [x0]
21545///
21546 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21547 SDValue StVal = St.getValue();
21548 EVT VT = StVal.getValueType();
21549
21550 // Avoid scalarizing zero splat stores for scalable vectors.
21551 if (VT.isScalableVector())
21552 return SDValue();
21553
21554 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
21555 // 2, 3 or 4 i32 elements.
21556 int NumVecElts = VT.getVectorNumElements();
21557 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
21558 VT.getVectorElementType().getSizeInBits() == 64) ||
21559 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
21560 VT.getVectorElementType().getSizeInBits() == 32)))
21561 return SDValue();
21562
21563 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
21564 return SDValue();
21565
21566 // If the zero constant has more than one use then the vector store could be
21567 // better since the constant mov will be amortized and stp q instructions
21568 // should be able to be formed.
21569 if (!StVal.hasOneUse())
21570 return SDValue();
21571
21572 // If the store is truncating then it's going down to i16 or smaller, which
21573 // means it can be implemented in a single store anyway.
21574 if (St.isTruncatingStore())
21575 return SDValue();
21576
21577 // If the immediate offset of the address operand is too large for the stp
21578 // instruction, then bail out.
21579 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
21580 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
21581 if (Offset < -512 || Offset > 504)
21582 return SDValue();
21583 }
21584
21585 for (int I = 0; I < NumVecElts; ++I) {
21586 SDValue EltVal = StVal.getOperand(I);
21587 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
21588 return SDValue();
21589 }
21590
21591 // Use a CopyFromReg WZR/XZR here to prevent
21592 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
21593 SDLoc DL(&St);
21594 unsigned ZeroReg;
21595 EVT ZeroVT;
21596 if (VT.getVectorElementType().getSizeInBits() == 32) {
21597 ZeroReg = AArch64::WZR;
21598 ZeroVT = MVT::i32;
21599 } else {
21600 ZeroReg = AArch64::XZR;
21601 ZeroVT = MVT::i64;
21602 }
21603 SDValue SplatVal =
21604 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
21605 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21606}
21607
21608/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
21609/// value. The load store optimizer pass will merge them to store pair stores.
21610/// This has better performance than a splat of the scalar followed by a split
21611 /// vector store. Even if the stores are not merged, it is four stores vs. a dup
21612 /// followed by an ext.b and two stores.
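/// For example (illustrative), a v4i32 built by inserting w1 into all four
/// lanes and then stored to [x0] can instead be emitted as four stores of w1
/// at offsets #0, #4, #8 and #12, which typically become two stp instructions.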
21613 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21614 SDValue StVal = St.getValue();
21615 EVT VT = StVal.getValueType();
21616
21617 // Don't replace floating point stores, they possibly won't be transformed to
21618 // stp because of the store pair suppress pass.
21619 if (VT.isFloatingPoint())
21620 return SDValue();
21621
21622 // We can express a splat as store pair(s) for 2 or 4 elements.
21623 unsigned NumVecElts = VT.getVectorNumElements();
21624 if (NumVecElts != 4 && NumVecElts != 2)
21625 return SDValue();
21626
21627 // If the store is truncating then it's going down to i16 or smaller, which
21628 // means it can be implemented in a single store anyway.
21629 if (St.isTruncatingStore())
21630 return SDValue();
21631
21632 // Check that this is a splat.
21633 // Make sure that each of the relevant vector element locations are inserted
21634 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
21635 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
21636 SDValue SplatVal;
21637 for (unsigned I = 0; I < NumVecElts; ++I) {
21638 // Check for insert vector elements.
21639 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
21640 return SDValue();
21641
21642 // Check that same value is inserted at each vector element.
21643 if (I == 0)
21644 SplatVal = StVal.getOperand(1);
21645 else if (StVal.getOperand(1) != SplatVal)
21646 return SDValue();
21647
21648 // Check insert element index.
21649 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
21650 if (!CIndex)
21651 return SDValue();
21652 uint64_t IndexVal = CIndex->getZExtValue();
21653 if (IndexVal >= NumVecElts)
21654 return SDValue();
21655 IndexNotInserted.reset(IndexVal);
21656
21657 StVal = StVal.getOperand(0);
21658 }
21659 // Check that all vector element locations were inserted to.
21660 if (IndexNotInserted.any())
21661 return SDValue();
21662
21663 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21664}
21665
21666 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
21667 SelectionDAG &DAG,
21668 const AArch64Subtarget *Subtarget) {
21669
21670 StoreSDNode *S = cast<StoreSDNode>(N);
21671 if (S->isVolatile() || S->isIndexed())
21672 return SDValue();
21673
21674 SDValue StVal = S->getValue();
21675 EVT VT = StVal.getValueType();
21676
21677 if (!VT.isFixedLengthVector())
21678 return SDValue();
21679
21680 // If we get a splat of zeros, convert this vector store to a store of
21681 // scalars. They will be merged into store pairs of xzr thereby removing one
21682 // instruction and one register.
21683 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
21684 return ReplacedZeroSplat;
21685
21686 // FIXME: The logic for deciding if an unaligned store should be split should
21687 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
21688 // a call to that function here.
21689
21690 if (!Subtarget->isMisaligned128StoreSlow())
21691 return SDValue();
21692
21693 // Don't split at -Oz.
21694 if (DAG.getMachineFunction().getFunction().hasMinSize())
21695 return SDValue();
21696
21697 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
21698 // those up regresses performance on micro-benchmarks and olden/bh.
21699 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
21700 return SDValue();
21701
21702 // Split unaligned 16B stores. They are terrible for performance.
21703 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
21704 // extensions can use this to mark that it does not want splitting to happen
21705 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
21706 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
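// For example (illustrative), an unaligned 16-byte store of a v4i32 value is
// split here into two 8-byte stores of its low and high halves at [x0] and
// [x0, #8], trading one str q instruction for two smaller stores.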
21707 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
21708 S->getAlign() <= Align(2))
21709 return SDValue();
21710
21711 // If we get a splat of a scalar convert this vector store to a store of
21712 // scalars. They will be merged into store pairs thereby removing two
21713 // instructions.
21714 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
21715 return ReplacedSplat;
21716
21717 SDLoc DL(S);
21718
21719 // Split VT into two.
21720 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
21721 unsigned NumElts = HalfVT.getVectorNumElements();
21722 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21723 DAG.getConstant(0, DL, MVT::i64));
21724 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21725 DAG.getConstant(NumElts, DL, MVT::i64));
21726 SDValue BasePtr = S->getBasePtr();
21727 SDValue NewST1 =
21728 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
21729 S->getAlign(), S->getMemOperand()->getFlags());
21730 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
21731 DAG.getConstant(8, DL, MVT::i64));
21732 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
21733 S->getPointerInfo(), S->getAlign(),
21734 S->getMemOperand()->getFlags());
21735}
21736
21737 static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
21738 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
21739
21740 // splice(pg, op1, undef) -> op1
21741 if (N->getOperand(2).isUndef())
21742 return N->getOperand(1);
21743
21744 return SDValue();
21745}
21746
21747 static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
21748 const AArch64Subtarget *Subtarget) {
21749 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
21750 N->getOpcode() == AArch64ISD::UUNPKLO) &&
21751 "Unexpected Opcode!");
21752
21753 // uunpklo/hi undef -> undef
21754 if (N->getOperand(0).isUndef())
21755 return DAG.getUNDEF(N->getValueType(0));
21756
21757 // If this is a masked load followed by an UUNPKLO, fold this into a masked
21758 // extending load. We can do this even if this is already a masked
21759 // {z,}extload.
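// For example (illustrative), uunpklo of an nxv8i16 masked load can instead be
// selected as a single zero-extending masked load that produces the wider
// result directly, provided the ptrue predicate pattern can be re-expressed at
// the wider element size.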
21760 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
21761 N->getOpcode() == AArch64ISD::UUNPKLO) {
21762 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
21763 SDValue Mask = MLD->getMask();
21764 SDLoc DL(N);
21765
21766 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
21767 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
21768 (MLD->getPassThru()->isUndef() ||
21769 isZerosVector(MLD->getPassThru().getNode()))) {
21770 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
21771 unsigned PgPattern = Mask->getConstantOperandVal(0);
21772 EVT VT = N->getValueType(0);
21773
21774 // Ensure we can double the size of the predicate pattern
21775 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
21776 if (NumElts &&
21777 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
21778 Mask =
21779 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
21780 SDValue PassThru = DAG.getConstant(0, DL, VT);
21781 SDValue NewLoad = DAG.getMaskedLoad(
21782 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
21783 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
21784 MLD->getAddressingMode(), ISD::ZEXTLOAD);
21785 
21786 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
21787
21788 return NewLoad;
21789 }
21790 }
21791 }
21792
21793 return SDValue();
21794}
21795
21796 static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N) {
21797 if (N->getOpcode() != AArch64ISD::UZP1)
21798 return false;
21799 SDValue Op0 = N->getOperand(0);
21800 EVT SrcVT = Op0->getValueType(0);
21801 EVT DstVT = N->getValueType(0);
21802 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
21803 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
21804 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
21805}
21806
21807// Try to combine rounding shifts where the operands come from an extend, and
21808// the result is truncated and combined into one vector.
21809// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
21810 static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
21811 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
21812 SDValue Op0 = N->getOperand(0);
21813 SDValue Op1 = N->getOperand(1);
21814 EVT ResVT = N->getValueType(0);
21815
21816 unsigned RshOpc = Op0.getOpcode();
21817 if (RshOpc != AArch64ISD::RSHRNB_I)
21818 return SDValue();
21819
21820 // Same op code and imm value?
21821 SDValue ShiftValue = Op0.getOperand(1);
21822 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
21823 return SDValue();
21824
21825 // Same unextended operand value?
21826 SDValue Lo = Op0.getOperand(0);
21827 SDValue Hi = Op1.getOperand(0);
21828 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
21829 Hi.getOpcode() != AArch64ISD::UUNPKHI)
21830 return SDValue();
21831 SDValue OrigArg = Lo.getOperand(0);
21832 if (OrigArg != Hi.getOperand(0))
21833 return SDValue();
21834
21835 SDLoc DL(N);
21836 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
21837 getPredicateForVector(DAG, DL, ResVT), OrigArg,
21838 ShiftValue);
21839}
21840
21841// Try to simplify:
21842// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
21843// t2 = nxv8i16 srl(t1, ShiftValue)
21844// to
21845// t1 = nxv8i16 rshrnb(X, shiftvalue).
21846// rshrnb will zero the top half bits of each element. Therefore, this combine
21847// should only be performed when a following instruction with the rshrnb
21848// as an operand does not care about the top half of each element. For example,
21849// a uzp1 or a truncating store.
21850 static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
21851 const AArch64Subtarget *Subtarget) {
21852 EVT VT = Srl->getValueType(0);
21853 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
21854 return SDValue();
21855
21856 EVT ResVT;
21857 if (VT == MVT::nxv8i16)
21858 ResVT = MVT::nxv16i8;
21859 else if (VT == MVT::nxv4i32)
21860 ResVT = MVT::nxv8i16;
21861 else if (VT == MVT::nxv2i64)
21862 ResVT = MVT::nxv4i32;
21863 else
21864 return SDValue();
21865
21866 SDLoc DL(Srl);
21867 unsigned ShiftValue;
21868 SDValue RShOperand;
21869 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
21870 return SDValue();
21871 SDValue Rshrnb = DAG.getNode(
21872 AArch64ISD::RSHRNB_I, DL, ResVT,
21873 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
21874 return DAG.getNode(ISD::BITCAST, DL, VT, Rshrnb);
21875}
21876
21877 static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
21878 const AArch64Subtarget *Subtarget) {
21879 SDLoc DL(N);
21880 SDValue Op0 = N->getOperand(0);
21881 SDValue Op1 = N->getOperand(1);
21882 EVT ResVT = N->getValueType(0);
21883
21884 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
21885 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
21886 Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
21887 Op0.getOperand(0) == Op1.getOperand(0)) {
21888
21889 SDValue SourceVec = Op0.getOperand(0);
21890 uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
21891 uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
21892 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
21893 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
21894 EVT OpVT = Op0.getOperand(1).getValueType();
21895 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
21896 SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
21897 DAG.getUNDEF(WidenedResVT));
21898 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
21899 DAG.getConstant(0, DL, OpVT));
21900 }
21901 }
21902
21903 // Following optimizations only work with uzp1.
21904 if (N->getOpcode() == AArch64ISD::UZP2)
21905 return SDValue();
21906
21907 // uzp1(x, undef) -> concat(truncate(x), undef)
21908 if (Op1.getOpcode() == ISD::UNDEF) {
21909 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
21910 switch (ResVT.getSimpleVT().SimpleTy) {
21911 default:
21912 break;
21913 case MVT::v16i8:
21914 BCVT = MVT::v8i16;
21915 HalfVT = MVT::v8i8;
21916 break;
21917 case MVT::v8i16:
21918 BCVT = MVT::v4i32;
21919 HalfVT = MVT::v4i16;
21920 break;
21921 case MVT::v4i32:
21922 BCVT = MVT::v2i64;
21923 HalfVT = MVT::v2i32;
21924 break;
21925 }
21926 if (BCVT != MVT::Other) {
21927 SDValue BC = DAG.getBitcast(BCVT, Op0);
21928 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
21929 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
21930 DAG.getUNDEF(HalfVT));
21931 }
21932 }
21933
21934 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
21935 return Urshr;
21936
21937 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op0, DAG, Subtarget))
21938 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
21939
21940 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op1, DAG, Subtarget))
21941 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
21942
21943 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
21944 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
21945 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21946 SDValue X = Op0.getOperand(0).getOperand(0);
21947 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
21948 }
21949 }
21950
21951 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
21952 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
21953 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21954 SDValue Z = Op1.getOperand(0).getOperand(1);
21955 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
21956 }
21957 }
21958
21959 // These optimizations only work on little endian.
21960 if (!DAG.getDataLayout().isLittleEndian())
21961 return SDValue();
21962
21963 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
21964 // Example:
21965 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
21966 // to
21967 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
21969 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
21970 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
21971 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
21972 Op1.getOperand(0));
21973 }
21974 }
21975
21976 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
21977 return SDValue();
21978
21979 SDValue SourceOp0 = peekThroughBitcasts(Op0);
21980 SDValue SourceOp1 = peekThroughBitcasts(Op1);
21981
21982 // truncating uzp1(x, y) -> xtn(concat (x, y))
21983 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
21984 EVT Op0Ty = SourceOp0.getValueType();
21985 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
21986 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
21987 SDValue Concat =
21988 DAG.getNode(ISD::CONCAT_VECTORS, DL,
21989 Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
21990 SourceOp0, SourceOp1);
21991 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
21992 }
21993 }
21994
21995 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
21996 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
21997 SourceOp1.getOpcode() != ISD::TRUNCATE)
21998 return SDValue();
21999 SourceOp0 = SourceOp0.getOperand(0);
22000 SourceOp1 = SourceOp1.getOperand(0);
22001
22002 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
22003 !SourceOp0.getValueType().isSimple())
22004 return SDValue();
22005
22006 EVT ResultTy;
22007
22008 switch (SourceOp0.getSimpleValueType().SimpleTy) {
22009 case MVT::v2i64:
22010 ResultTy = MVT::v4i32;
22011 break;
22012 case MVT::v4i32:
22013 ResultTy = MVT::v8i16;
22014 break;
22015 case MVT::v8i16:
22016 ResultTy = MVT::v16i8;
22017 break;
22018 default:
22019 return SDValue();
22020 }
22021
22022 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
22023 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
22024 SDValue UzpResult =
22025 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
22026
22027 EVT BitcastResultTy;
22028
22029 switch (ResVT.getSimpleVT().SimpleTy) {
22030 case MVT::v2i32:
22031 BitcastResultTy = MVT::v2i64;
22032 break;
22033 case MVT::v4i16:
22034 BitcastResultTy = MVT::v4i32;
22035 break;
22036 case MVT::v8i8:
22037 BitcastResultTy = MVT::v8i16;
22038 break;
22039 default:
22040 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
22041 }
22042
22043 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
22044 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
22045}
22046
22047 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
22048 unsigned Opc = N->getOpcode();
22049 
22050 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
22051 Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
22052 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
22053 Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
22054 "Invalid opcode.");
22055 
22056 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
22057 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
22058 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
22059 Opc == AArch64ISD::GLD1S_IMM_MERGE_ZERO;
22060 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
22061 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
22062 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
22063 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
22064 
22065 SDLoc DL(N);
22066 SDValue Chain = N->getOperand(0);
22067 SDValue Pg = N->getOperand(1);
22068 SDValue Base = N->getOperand(2);
22069 SDValue Offset = N->getOperand(3);
22070 SDValue Ty = N->getOperand(4);
22071
22072 EVT ResVT = N->getValueType(0);
22073
22074 const auto OffsetOpc = Offset.getOpcode();
22075 const bool OffsetIsZExt =
22076 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
22077 const bool OffsetIsSExt =
22078 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
22079 
22080 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
22081 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
22082 SDValue ExtPg = Offset.getOperand(0);
22083 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
22084 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
22085
22086 // If the predicate for the sign- or zero-extended offset is the
22087 // same as the predicate used for this load and the sign-/zero-extension
22088 // was from a 32-bits...
22089 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
22090 SDValue UnextendedOffset = Offset.getOperand(1);
22091
22092 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
22093 if (Signed)
22094 NewOpc = getSignExtendedGatherOpcode(NewOpc);
22095
22096 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
22097 {Chain, Pg, Base, UnextendedOffset, Ty});
22098 }
22099 }
22100
22101 return SDValue();
22102}
22103
22104/// Optimize a vector shift instruction and its operand if shifted out
22105/// bits are not used.
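/// For example (illustrative), in (VLSHR X, #8) on v8i16 the low 8 bits of
/// each element of X never reach the result, so only the top 8 bits of each
/// lane are demanded from X and its producer can be simplified accordingly.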
22106 static SDValue performVectorShiftCombine(SDNode *N,
22107 const AArch64TargetLowering &TLI,
22108 TargetLowering::DAGCombinerInfo &DCI) {
22109 assert(N->getOpcode() == AArch64ISD::VASHR ||
22110 N->getOpcode() == AArch64ISD::VLSHR);
22111
22112 SDValue Op = N->getOperand(0);
22113 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
22114
22115 unsigned ShiftImm = N->getConstantOperandVal(1);
22116 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
22117
22118 // Remove sign_extend_inreg (ashr(shl(x)) based on the number of sign bits.
22119 if (N->getOpcode() == AArch64ISD::VASHR &&
22120 Op.getOpcode() == AArch64ISD::VSHL &&
22121 N->getOperand(1) == Op.getOperand(1))
22122 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
22123 return Op.getOperand(0);
22124
22125 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
22126 APInt DemandedMask = ~ShiftedOutBits;
22127
22128 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
22129 return SDValue(N, 0);
22130
22131 return SDValue();
22132}
22133
22134 static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
22135 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
22136 // This transform works in partnership with performSetCCPunpkCombine to
22137 // remove unnecessary transfer of predicates into standard registers and back
22138 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
22139 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
22140 MVT::i1) {
22141 SDValue CC = N->getOperand(0)->getOperand(0);
22142 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
22143 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
22144 DAG.getVectorIdxConstant(0, SDLoc(N)));
22145 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
22146 }
22147
22148 return SDValue();
22149}
22150
22151/// Target-specific DAG combine function for post-increment LD1 (lane) and
22152/// post-increment LD1R.
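/// For example (illustrative), a scalar load feeding an insert_vector_elt,
/// where the address is also incremented by the element size, can be selected
/// as a single post-incrementing lane load such as
///   ld1 { v0.s }[1], [x0], #4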
22153 static SDValue performPostLD1Combine(SDNode *N,
22154 TargetLowering::DAGCombinerInfo &DCI,
22155 bool IsLaneOp) {
22156 if (DCI.isBeforeLegalizeOps())
22157 return SDValue();
22158
22159 SelectionDAG &DAG = DCI.DAG;
22160 EVT VT = N->getValueType(0);
22161
22162 if (!VT.is128BitVector() && !VT.is64BitVector())
22163 return SDValue();
22164
22165 unsigned LoadIdx = IsLaneOp ? 1 : 0;
22166 SDNode *LD = N->getOperand(LoadIdx).getNode();
22167 // If it is not LOAD, can not do such combine.
22168 if (LD->getOpcode() != ISD::LOAD)
22169 return SDValue();
22170
22171 // The vector lane must be a constant in the LD1LANE opcode.
22172 SDValue Lane;
22173 if (IsLaneOp) {
22174 Lane = N->getOperand(2);
22175 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
22176 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
22177 return SDValue();
22178 }
22179
22180 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
22181 EVT MemVT = LoadSDN->getMemoryVT();
22182 // Check if memory operand is the same type as the vector element.
22183 if (MemVT != VT.getVectorElementType())
22184 return SDValue();
22185
22186 // Check if there are other uses. If so, do not combine as it will introduce
22187 // an extra load.
22188 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
22189 ++UI) {
22190 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
22191 continue;
22192 if (*UI != N)
22193 return SDValue();
22194 }
22195
22196 // If there is one use and it can splat the value, prefer that operation.
22197 // TODO: This could be expanded to more operations if they reliably use the
22198 // index variants.
22199 if (N->hasOneUse()) {
22200 unsigned UseOpc = N->use_begin()->getOpcode();
22201 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
22202 return SDValue();
22203 }
22204
22205 SDValue Addr = LD->getOperand(1);
22206 SDValue Vector = N->getOperand(0);
22207 // Search for a use of the address operand that is an increment.
22208 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
22209 Addr.getNode()->use_end(); UI != UE; ++UI) {
22210 SDNode *User = *UI;
22211 if (User->getOpcode() != ISD::ADD
22212 || UI.getUse().getResNo() != Addr.getResNo())
22213 continue;
22214
22215 // If the increment is a constant, it must match the memory ref size.
22216 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
22217 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
22218 uint32_t IncVal = CInc->getZExtValue();
22219 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
22220 if (IncVal != NumBytes)
22221 continue;
22222 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
22223 }
22224
22225 // To avoid cycle construction make sure that neither the load nor the add
22226 // are predecessors to each other or the Vector.
22227 SmallPtrSet<const SDNode *, 32> Visited;
22228 SmallVector<const SDNode *, 16> Worklist;
22229 Visited.insert(Addr.getNode());
22230 Worklist.push_back(User);
22231 Worklist.push_back(LD);
22232 Worklist.push_back(Vector.getNode());
22233 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
22234 SDNode::hasPredecessorHelper(User, Visited, Worklist))
22235 continue;
22236
22238 Ops.push_back(LD->getOperand(0)); // Chain
22239 if (IsLaneOp) {
22240 Ops.push_back(Vector); // The vector to be inserted
22241 Ops.push_back(Lane); // The lane to be inserted in the vector
22242 }
22243 Ops.push_back(Addr);
22244 Ops.push_back(Inc);
22245
22246 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
22247 SDVTList SDTys = DAG.getVTList(Tys);
22248 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
22249 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
22250 MemVT,
22251 LoadSDN->getMemOperand());
22252
22253 // Update the uses.
22254 SDValue NewResults[] = {
22255 SDValue(LD, 0), // The result of load
22256 SDValue(UpdN.getNode(), 2) // Chain
22257 };
22258 DCI.CombineTo(LD, NewResults);
22259 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
22260 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
22261
22262 break;
22263 }
22264 return SDValue();
22265}
22266
22267/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
22268/// address translation.
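/// For example (illustrative), with top-byte-ignore in effect an address such
/// as (and X, 0x00ffffffffffffff) can be treated as X for load/store purposes,
/// since only the low 56 bits are demanded here.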
22269 static bool performTBISimplification(SDValue Addr,
22270 TargetLowering::DAGCombinerInfo &DCI,
22271 SelectionDAG &DAG) {
22272 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
22273 KnownBits Known;
22274 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
22275 !DCI.isBeforeLegalizeOps());
22276 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22277 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
22278 DCI.CommitTargetLoweringOpt(TLO);
22279 return true;
22280 }
22281 return false;
22282}
22283
22284 static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
22285 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
22286 "Expected STORE dag node in input!");
22287
22288 if (auto Store = dyn_cast<StoreSDNode>(N)) {
22289 if (!Store->isTruncatingStore() || Store->isIndexed())
22290 return SDValue();
22291 SDValue Ext = Store->getValue();
22292 auto ExtOpCode = Ext.getOpcode();
22293 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
22294 ExtOpCode != ISD::ANY_EXTEND)
22295 return SDValue();
22296 SDValue Orig = Ext->getOperand(0);
22297 if (Store->getMemoryVT() != Orig.getValueType())
22298 return SDValue();
22299 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
22300 Store->getBasePtr(), Store->getMemOperand());
22301 }
22302
22303 return SDValue();
22304}
22305
22306// A custom combine to lower load <3 x i8> as the more efficient sequence
22307// below:
22308// ldrb wX, [x0, #2]
22309// ldrh wY, [x0]
22310// orr wX, wY, wX, lsl #16
22311// fmov s0, wX
22312//
22313// Note that an alternative sequence with even fewer (although usually more
22314// complex/expensive) instructions would be:
22315// ld1r.4h { v0 }, [x0], #2
22316// ld1.b { v0 }[2], [x0]
22317//
22318// Generating this sequence unfortunately results in noticeably worse codegen
22319// for code that extends the loaded v3i8, due to legalization breaking vector
22320// shuffle detection in a way that is very difficult to work around.
22321// TODO: Revisit once v3i8 legalization has been improved in general.
22322 static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
22323 EVT MemVT = LD->getMemoryVT();
22324 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
22325 LD->getOriginalAlign() >= 4)
22326 return SDValue();
22327
22328 SDLoc DL(LD);
22329 MachineFunction &MF = DAG.getMachineFunction();
22330 SDValue Chain = LD->getChain();
22331 SDValue BasePtr = LD->getBasePtr();
22332 MachineMemOperand *MMO = LD->getMemOperand();
22333 assert(LD->getOffset().isUndef() && "undef offset expected");
22334
22335 // Load 2 x i8, then 1 x i8.
22336 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
22337 TypeSize Offset2 = TypeSize::getFixed(2);
22338 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
22339 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
22340 MF.getMachineMemOperand(MMO, 2, 1));
22341
22342 // Extend to i32.
22343 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
22344 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
22345
22346 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
22347 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
22348 DAG.getConstant(16, DL, MVT::i32));
22349 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
22350 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
22351
22352 // Extract v3i8 again.
22353 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
22354 DAG.getConstant(0, DL, MVT::i64));
22355 SDValue TokenFactor = DAG.getNode(
22356 ISD::TokenFactor, DL, MVT::Other,
22357 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
22358 return DAG.getMergeValues({Extract, TokenFactor}, DL);
22359}
22360
22361// Perform TBI simplification if supported by the target and try to break up
22362// nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit
22363// load instructions can be selected.
22364 static SDValue performLOADCombine(SDNode *N,
22365 TargetLowering::DAGCombinerInfo &DCI,
22366 SelectionDAG &DAG,
22367 const AArch64Subtarget *Subtarget) {
22368 if (Subtarget->supportsAddressTopByteIgnored())
22369 performTBISimplification(N->getOperand(1), DCI, DAG);
22370
22371 LoadSDNode *LD = cast<LoadSDNode>(N);
22372 if (LD->isVolatile() || !Subtarget->isLittleEndian())
22373 return SDValue(N, 0);
22374
22375 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
22376 return Res;
22377
22378 if (!LD->isNonTemporal())
22379 return SDValue(N, 0);
22380
22381 EVT MemVT = LD->getMemoryVT();
22382 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
22383 MemVT.getSizeInBits() % 256 == 0 ||
22384 256 % MemVT.getScalarSizeInBits() != 0)
22385 return SDValue(N, 0);
22386
22387 SDLoc DL(LD);
22388 SDValue Chain = LD->getChain();
22389 SDValue BasePtr = LD->getBasePtr();
22390 SDNodeFlags Flags = LD->getFlags();
22391 SmallVector<SDValue, 4> LoadOps;
22392 SmallVector<SDValue, 4> LoadOpsChain;
22393 // Replace any non temporal load over 256-bit with a series of 256 bit loads
22394 // and a scalar/vector load less than 256. This way we can utilize 256-bit
22395 // loads and reduce the amount of load instructions generated.
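// For example (illustrative), a 640-bit non-temporal load of v20i32 is split
// into two 256-bit v8i32 loads plus a 128-bit v4i32 load for the remainder;
// the pieces are concatenated and the original type extracted at the end.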
22396 MVT NewVT =
22397 MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
22398 256 / MemVT.getVectorElementType().getSizeInBits());
22399 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
22400 // Create all 256-bit loads starting from offset 0 and up to Num256Loads-1*32.
22401 for (unsigned I = 0; I < Num256Loads; I++) {
22402 unsigned PtrOffset = I * 32;
22403 SDValue NewPtr = DAG.getMemBasePlusOffset(
22404 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
22405 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
22406 SDValue NewLoad = DAG.getLoad(
22407 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
22408 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
22409 LoadOps.push_back(NewLoad);
22410 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
22411 }
22412
22413 // Process remaining bits of the load operation.
22414 // This is done by creating an UNDEF vector to match the size of the
22415 // 256-bit loads and inserting the remaining load to it. We extract the
22416 // original load type at the end using EXTRACT_SUBVECTOR instruction.
22417 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
22418 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
22419 MVT RemainingVT = MVT::getVectorVT(
22420 MemVT.getVectorElementType().getSimpleVT(),
22421 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
22422 SDValue NewPtr = DAG.getMemBasePlusOffset(
22423 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
22424 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
22425 SDValue RemainingLoad =
22426 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
22427 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
22428 LD->getMemOperand()->getFlags(), LD->getAAInfo());
22429 SDValue UndefVector = DAG.getUNDEF(NewVT);
22430 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
22431 SDValue ExtendedRemainingLoad =
22432 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
22433 {UndefVector, RemainingLoad, InsertIdx});
22434 LoadOps.push_back(ExtendedRemainingLoad);
22435 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
22436 EVT ConcatVT =
22437 EVT::getVectorVT(*DAG.getContext(), NewVT.getVectorElementType(),
22438 LoadOps.size() * NewVT.getVectorNumElements());
22439 SDValue ConcatVectors =
22440 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
22441 // Extract the original vector type size.
22442 SDValue ExtractSubVector =
22443 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
22444 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
22445 SDValue TokenFactor =
22446 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
22447 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
22448}
22449
22450 static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
22451 EVT VecVT = Op.getValueType();
22452 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
22453 "Need boolean vector type.");
22454
22455 if (Depth > 3)
22456 return EVT();
22457 
22458 // We can get the base type from a vector compare or truncate.
22459 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
22460 return Op.getOperand(0).getValueType();
22461
22462 // If an operand is a bool vector, continue looking.
22463 EVT BaseVT;
22464 for (SDValue Operand : Op->op_values()) {
22465 if (Operand.getValueType() != VecVT)
22466 continue;
22467
22468 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
22469 if (!BaseVT.isSimple())
22470 BaseVT = OperandVT;
22471 else if (OperandVT != BaseVT)
22472 return EVT();
22473 }
22474
22475 return BaseVT;
22476}
22477
22478// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
22479// iN, we can use a trick that extracts the i^th bit from the i^th element and
22480// then performs a vector add to get a scalar bitmask. This requires that each
22481// element's bits are either all 1 or all 0.
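// For example (illustrative), a v4i32 all-ones/all-zeros comparison result is
// ANDed with the constant vector <1, 2, 4, 8> and then summed with a vector
// reduce-add, producing a scalar in which bit i reflects lane i.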
22482 static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
22483 SDLoc DL(N);
22484 SDValue ComparisonResult(N, 0);
22485 EVT VecVT = ComparisonResult.getValueType();
22486 assert(VecVT.isVector() && "Must be a vector type");
22487
22488 unsigned NumElts = VecVT.getVectorNumElements();
22489 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
22490 return SDValue();
22491
22492 if (VecVT.getVectorElementType() != MVT::i1 &&
22493 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
22494 return SDValue();
22495
22496 // If we can find the original types to work on instead of a vector of i1,
22497 // we can avoid extend/extract conversion instructions.
22498 if (VecVT.getVectorElementType() == MVT::i1) {
22499 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
22500 if (!VecVT.isSimple()) {
22501 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
22502 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
22503 }
22504 }
22505 VecVT = VecVT.changeVectorElementTypeToInteger();
22506
22507 // Large vectors don't map directly to this conversion, so to avoid too many
22508 // edge cases, we don't apply it here. The conversion will likely still be
22509 // applied later via multiple smaller vectors, whose results are concatenated.
22510 if (VecVT.getSizeInBits() > 128)
22511 return SDValue();
22512
22513 // Ensure that all elements' bits are either 0s or 1s.
22514 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
22515
22516 SmallVector<SDValue, 16> MaskConstants;
22517 if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
22518 VecVT == MVT::v16i8) {
22519 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
22520 // per entry. We split it into two halves, apply the mask, zip the halves to
22521 // create 8x 16-bit values, and then perform the vector reduce.
22522 for (unsigned Half = 0; Half < 2; ++Half) {
22523 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
22524 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
22525 }
22526 }
22527 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22528 SDValue RepresentativeBits =
22529 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22530
22531 SDValue UpperRepresentativeBits =
22532 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
22533 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
22534 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
22535 RepresentativeBits, UpperRepresentativeBits);
22536 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
22537 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
22538 }
22539
22540 // All other vector sizes.
22541 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
22542 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
22543 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
22544 }
22545
22546 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22547 SDValue RepresentativeBits =
22548 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22549 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
22550 NumElts, VecVT.getVectorElementType().getSizeInBits()));
22551 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
22552}
22553
22554 static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
22555 StoreSDNode *Store) {
22556 if (!Store->isTruncatingStore())
22557 return SDValue();
22558
22559 SDLoc DL(Store);
22560 SDValue VecOp = Store->getValue();
22561 EVT VT = VecOp.getValueType();
22562 EVT MemVT = Store->getMemoryVT();
22563
22564 if (!MemVT.isVector() || !VT.isVector() ||
22565 MemVT.getVectorElementType() != MVT::i1)
22566 return SDValue();
22567
22568 // If we are storing a vector that we are currently building, let
22569 // `scalarizeVectorStore()` handle this more efficiently.
22570 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
22571 return SDValue();
22572
22573 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
22574 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
22575 if (!VectorBits)
22576 return SDValue();
22577
22578 EVT StoreVT =
22579 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
22580 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
22581 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
22582 Store->getMemOperand());
22583}
22584
22585 static bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
22586 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
22587 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
22588 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
22589}
22590
22591// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
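// For example (illustrative), a <3 x i8> truncating store to [x0] is emitted
// as three single-byte stores of the extracted elements at offsets #2, #1 and
// #0 respectively.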
22592 static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
22593 const AArch64Subtarget *Subtarget) {
22594 SDValue Value = ST->getValue();
22595 EVT ValueVT = Value.getValueType();
22596
22597 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
22598 Value.getOpcode() != ISD::TRUNCATE ||
22599 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
22600 return SDValue();
22601
22602 assert(ST->getOffset().isUndef() && "undef offset expected");
22603 SDLoc DL(ST);
22604 auto WideVT = EVT::getVectorVT(
22605 *DAG.getContext(),
22606 Value->getOperand(0).getValueType().getVectorElementType(), 4);
22607 SDValue UndefVector = DAG.getUNDEF(WideVT);
22608 SDValue WideTrunc = DAG.getNode(
22609 ISD::INSERT_SUBVECTOR, DL, WideVT,
22610 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
22611 SDValue Cast = DAG.getNode(
22612 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
22613 WideTrunc);
22614
22615 MachineFunction &MF = DAG.getMachineFunction();
22616 SDValue Chain = ST->getChain();
22617 MachineMemOperand *MMO = ST->getMemOperand();
22618 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
22619 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22620 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
22621 TypeSize Offset2 = TypeSize::getFixed(2);
22622 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
22623 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
22624
22625 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22626 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
22627 TypeSize Offset1 = TypeSize::getFixed(1);
22628 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
22629 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
22630
22631 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22632 DAG.getConstant(0, DL, MVT::i64));
22633 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
22634 MF.getMachineMemOperand(MMO, 0, 1));
22635 return Chain;
22636}
22637
22638 static SDValue performSTORECombine(SDNode *N,
22639 TargetLowering::DAGCombinerInfo &DCI,
22640 SelectionDAG &DAG,
22641 const AArch64Subtarget *Subtarget) {
22642 StoreSDNode *ST = cast<StoreSDNode>(N);
22643 SDValue Chain = ST->getChain();
22644 SDValue Value = ST->getValue();
22645 SDValue Ptr = ST->getBasePtr();
22646 EVT ValueVT = Value.getValueType();
22647
22648 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
22649 EVT EltVT = VT.getVectorElementType();
22650 return EltVT == MVT::f32 || EltVT == MVT::f64;
22651 };
22652
22653 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
22654 return Res;
22655
22656 // If this is an FP_ROUND followed by a store, fold this into a truncating
22657 // store. We can do this even if this is already a truncstore.
22658 // We purposefully don't care about legality of the nodes here as we know
22659 // they can be split down into something legal.
22660 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
22661 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
22662 Subtarget->useSVEForFixedLengthVectors() &&
22663 ValueVT.isFixedLengthVector() &&
22664 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
22665 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
22666 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
22667 ST->getMemoryVT(), ST->getMemOperand());
22668
22669 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
22670 return Split;
22671
22672 if (Subtarget->supportsAddressTopByteIgnored() &&
22673 performTBISimplification(N->getOperand(2), DCI, DAG))
22674 return SDValue(N, 0);
22675
22676 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
22677 return Store;
22678
22679 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
22680 return Store;
22681
22682 if (ST->isTruncatingStore()) {
22683 EVT StoreVT = ST->getMemoryVT();
22684 if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
22685 return SDValue();
22686 if (SDValue Rshrnb =
22687 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
22688 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
22689 StoreVT, ST->getMemOperand());
22690 }
22691 }
22692
22693 return SDValue();
22694}
22695
22696 static SDValue performMSTORECombine(SDNode *N,
22697 TargetLowering::DAGCombinerInfo &DCI,
22698 SelectionDAG &DAG,
22699 const AArch64Subtarget *Subtarget) {
22700 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
22701 SDValue Value = MST->getValue();
22702 SDValue Mask = MST->getMask();
22703 SDLoc DL(N);
22704
22705 // If this is a UZP1 followed by a masked store, fold this into a masked
22706 // truncating store. We can do this even if this is already a masked
22707 // truncstore.
22708 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
22709 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22710 Value.getValueType().isInteger()) {
22711 Value = Value.getOperand(0);
22712 if (Value.getOpcode() == ISD::BITCAST) {
22713 EVT HalfVT =
22714 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
22715 EVT InVT = Value.getOperand(0).getValueType();
22716
22717 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
22718 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22719 unsigned PgPattern = Mask->getConstantOperandVal(0);
22720
22721 // Ensure we can double the size of the predicate pattern
22722 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
22723 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
22724 MinSVESize) {
22725 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
22726 PgPattern);
22727 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
22728 MST->getBasePtr(), MST->getOffset(), Mask,
22729 MST->getMemoryVT(), MST->getMemOperand(),
22730 MST->getAddressingMode(),
22731 /*IsTruncating=*/true);
22732 }
22733 }
22734 }
22735 }
22736
22737 if (MST->isTruncatingStore()) {
22738 EVT ValueVT = Value->getValueType(0);
22739 EVT MemVT = MST->getMemoryVT();
22740 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
22741 return SDValue();
22742 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
22743 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
22744 MST->getOffset(), MST->getMask(),
22745 MST->getMemoryVT(), MST->getMemOperand(),
22746 MST->getAddressingMode(), true);
22747 }
22748 }
22749
22750 return SDValue();
22751}
22752
22753/// \return true if part of the index was folded into the Base.
22754static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
22755 SDLoc DL, SelectionDAG &DAG) {
22756 // This function assumes a vector of i64 indices.
22757 EVT IndexVT = Index.getValueType();
22758 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
22759 return false;
22760
22761 // Simplify:
22762 // BasePtr = Ptr
22763 // Index = X + splat(Offset)
22764 // ->
22765 // BasePtr = Ptr + Offset * scale.
22766 // Index = X
22767 if (Index.getOpcode() == ISD::ADD) {
22768 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
22769 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22770 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22771 Index = Index.getOperand(0);
22772 return true;
22773 }
22774 }
22775
22776 // Simplify:
22777 // BasePtr = Ptr
22778 // Index = (X + splat(Offset)) << splat(Shift)
22779 // ->
22780 // BasePtr = Ptr + (Offset << Shift) * scale)
22781 // Index = X << splat(shift)
22782 if (Index.getOpcode() == ISD::SHL &&
22783 Index.getOperand(0).getOpcode() == ISD::ADD) {
22784 SDValue Add = Index.getOperand(0);
22785 SDValue ShiftOp = Index.getOperand(1);
22786 SDValue OffsetOp = Add.getOperand(1);
22787 if (auto Shift = DAG.getSplatValue(ShiftOp))
22788 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
22789 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
22790 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22791 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22792 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
22793 Add.getOperand(0), ShiftOp);
22794 return true;
22795 }
22796 }
22797
22798 return false;
22799}
22800
22801// Analyse the specified address returning true if a more optimal addressing
22802// mode is available. When returning true all parameters are updated to reflect
22803// their recommended values.
22804 static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
22805 SDValue &BasePtr, SDValue &Index,
22806 SelectionDAG &DAG) {
22807 // Try to iteratively fold parts of the index into the base pointer to
22808 // simplify the index as much as possible.
22809 bool Changed = false;
22810 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
22811 Changed = true;
22812
22813 // Only consider element types that are pointer sized as smaller types can
22814 // be easily promoted.
22815 EVT IndexVT = Index.getValueType();
22816 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
22817 return Changed;
22818
22819 // Can indices be trivially shrunk?
22820 EVT DataVT = N->getOperand(1).getValueType();
22821 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it
22822 // will later be re-extended to 64 bits in legalization
22823 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
22824 return Changed;
22825 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
22826 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22827 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
22828 return true;
22829 }
22830
22831 // Match:
22832 // Index = step(const)
22833 int64_t Stride = 0;
22834 if (Index.getOpcode() == ISD::STEP_VECTOR) {
22835 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
22836 }
22837 // Match:
22838 // Index = step(const) << shift(const)
22839 else if (Index.getOpcode() == ISD::SHL &&
22840 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
22841 SDValue RHS = Index.getOperand(1);
22842 if (auto *Shift =
22843 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
22844 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
22845 Stride = Step << Shift->getZExtValue();
22846 }
22847 }
22848
22849 // Return early because no supported pattern is found.
22850 if (Stride == 0)
22851 return Changed;
22852
22853 if (Stride < std::numeric_limits<int32_t>::min() ||
22854 Stride > std::numeric_limits<int32_t>::max())
22855 return Changed;
22856
22857 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
22858 unsigned MaxVScale =
22859 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
22859 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
22860 int64_t LastElementOffset =
22861 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
22862
22863 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
22864 LastElementOffset > std::numeric_limits<int32_t>::max())
22865 return Changed;
22866
22867 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22868 // Stride is not scaled explicitly by 'Scale' here, because that scaling
22869 // happens as part of the gather/scatter addressing mode.
22870 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride));
22871 return true;
22872}
22873
22874 static SDValue performMaskedGatherScatterCombine(
22875 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
22876 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
22877 assert(MGS && "Can only combine gather load or scatter store nodes");
22878
22879 if (!DCI.isBeforeLegalize())
22880 return SDValue();
22881
22882 SDLoc DL(MGS);
22883 SDValue Chain = MGS->getChain();
22884 SDValue Scale = MGS->getScale();
22885 SDValue Index = MGS->getIndex();
22886 SDValue Mask = MGS->getMask();
22887 SDValue BasePtr = MGS->getBasePtr();
22888 ISD::MemIndexType IndexType = MGS->getIndexType();
22889
22890 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
22891 return SDValue();
22892
22893 // Here we catch such cases early and change MGATHER's IndexType to allow
22894 // the use of an Index that's more legalisation friendly.
22895 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
22896 SDValue PassThru = MGT->getPassThru();
22897 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
22898 return DAG.getMaskedGather(
22899 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
22900 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
22901 }
22902 auto *MSC = cast<MaskedScatterSDNode>(MGS);
22903 SDValue Data = MSC->getValue();
22904 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
22905 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
22906 Ops, MSC->getMemOperand(), IndexType,
22907 MSC->isTruncatingStore());
22908}
22909
22910/// Target-specific DAG combine function for NEON load/store intrinsics
22911/// to merge base address updates.
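/// For example (illustrative), an @llvm.aarch64.neon.ld2 of two v4i32 values
/// whose address is separately incremented by 32 bytes (the size of the two
/// loaded registers) can be selected as a single post-indexed load,
///   ld2 { v0.4s, v1.4s }, [x0], #32
/// with the incremented address produced as the write-back result.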
22912 static SDValue performNEONPostLDSTCombine(SDNode *N,
22913 TargetLowering::DAGCombinerInfo &DCI,
22914 SelectionDAG &DAG) {
22915 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
22916 return SDValue();
22917
22918 unsigned AddrOpIdx = N->getNumOperands() - 1;
22919 SDValue Addr = N->getOperand(AddrOpIdx);
22920
22921 // Search for a use of the address operand that is an increment.
22922 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
22923 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
22924 SDNode *User = *UI;
22925 if (User->getOpcode() != ISD::ADD ||
22926 UI.getUse().getResNo() != Addr.getResNo())
22927 continue;
22928
22929 // Check that the add is independent of the load/store. Otherwise, folding
22930 // it would create a cycle.
22931 SmallPtrSet<const SDNode *, 32> Visited;
22932 SmallVector<const SDNode *, 16> Worklist;
22933 Visited.insert(Addr.getNode());
22934 Worklist.push_back(N);
22935 Worklist.push_back(User);
22936 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
22937 SDNode::hasPredecessorHelper(User, Visited, Worklist))
22938 continue;
22939
22940 // Find the new opcode for the updating load/store.
22941 bool IsStore = false;
22942 bool IsLaneOp = false;
22943 bool IsDupOp = false;
22944 unsigned NewOpc = 0;
22945 unsigned NumVecs = 0;
22946 unsigned IntNo = N->getConstantOperandVal(1);
22947 switch (IntNo) {
22948 default: llvm_unreachable("unexpected intrinsic for Neon base update");
22949 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
22950 NumVecs = 2; break;
22951 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
22952 NumVecs = 3; break;
22953 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
22954 NumVecs = 4; break;
22955 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
22956 NumVecs = 2; IsStore = true; break;
22957 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
22958 NumVecs = 3; IsStore = true; break;
22959 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
22960 NumVecs = 4; IsStore = true; break;
22961 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
22962 NumVecs = 2; break;
22963 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
22964 NumVecs = 3; break;
22965 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
22966 NumVecs = 4; break;
22967 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
22968 NumVecs = 2; IsStore = true; break;
22969 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
22970 NumVecs = 3; IsStore = true; break;
22971 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
22972 NumVecs = 4; IsStore = true; break;
22973 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
22974 NumVecs = 2; IsDupOp = true; break;
22975 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
22976 NumVecs = 3; IsDupOp = true; break;
22977 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
22978 NumVecs = 4; IsDupOp = true; break;
22979 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
22980 NumVecs = 2; IsLaneOp = true; break;
22981 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
22982 NumVecs = 3; IsLaneOp = true; break;
22983 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
22984 NumVecs = 4; IsLaneOp = true; break;
22985 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
22986 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
22987 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
22988 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
22989 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
22990 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
22991 }
22992
22993 EVT VecTy;
22994 if (IsStore)
22995 VecTy = N->getOperand(2).getValueType();
22996 else
22997 VecTy = N->getValueType(0);
22998
22999 // If the increment is a constant, it must match the memory ref size.
23000 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
23001 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
23002 uint32_t IncVal = CInc->getZExtValue();
23003 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
23004 if (IsLaneOp || IsDupOp)
23005 NumBytes /= VecTy.getVectorNumElements();
23006 if (IncVal != NumBytes)
23007 continue;
23008 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
23009 }
23010 SmallVector<SDValue, 8> Ops;
23011 Ops.push_back(N->getOperand(0)); // Incoming chain
23012 // Load lane and store have vector list as input.
23013 if (IsLaneOp || IsStore)
23014 for (unsigned i = 2; i < AddrOpIdx; ++i)
23015 Ops.push_back(N->getOperand(i));
23016 Ops.push_back(Addr); // Base register
23017 Ops.push_back(Inc);
23018
23019 // Return Types.
23020 EVT Tys[6];
23021 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
23022 unsigned n;
23023 for (n = 0; n < NumResultVecs; ++n)
23024 Tys[n] = VecTy;
23025 Tys[n++] = MVT::i64; // Type of write back register
23026 Tys[n] = MVT::Other; // Type of the chain
23027 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
23028
23029 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
23030 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
23031 MemInt->getMemoryVT(),
23032 MemInt->getMemOperand());
23033
23034 // Update the uses.
23035 std::vector<SDValue> NewResults;
23036 for (unsigned i = 0; i < NumResultVecs; ++i) {
23037 NewResults.push_back(SDValue(UpdN.getNode(), i));
23038 }
23039 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
23040 DCI.CombineTo(N, NewResults);
23041 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
23042
23043 break;
23044 }
23045 return SDValue();
23046}
23047
23048// Checks to see if the value is the prescribed width and returns information
23049// about its extension mode.
23050static
23051bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
23052 ExtType = ISD::NON_EXTLOAD;
23053 switch(V.getNode()->getOpcode()) {
23054 default:
23055 return false;
23056 case ISD::LOAD: {
23057 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
23058 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
23059 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
23060 ExtType = LoadNode->getExtensionType();
23061 return true;
23062 }
23063 return false;
23064 }
23065 case ISD::AssertSext: {
23066 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
23067 if ((TypeNode->getVT() == MVT::i8 && width == 8)
23068 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
23069 ExtType = ISD::SEXTLOAD;
23070 return true;
23071 }
23072 return false;
23073 }
23074 case ISD::AssertZext: {
23075 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
23076 if ((TypeNode->getVT() == MVT::i8 && width == 8)
23077 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
23078 ExtType = ISD::ZEXTLOAD;
23079 return true;
23080 }
23081 return false;
23082 }
23083 case ISD::Constant:
23084 case ISD::TargetConstant: {
23085 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
23086 1LL << (width - 1);
23087 }
23088 }
23089
23090 return true;
23091}
23092
23093// This function does a whole lot of voodoo to determine if the tests are
23094// equivalent without and with a mask. Essentially what happens is that given a
23095// DAG resembling:
23096//
23097// +-------------+ +-------------+ +-------------+ +-------------+
23098// | Input | | AddConstant | | CompConstant| | CC |
23099// +-------------+ +-------------+ +-------------+ +-------------+
23100// | | | |
23101// V V | +----------+
23102// +-------------+ +----+ | |
23103// | ADD | |0xff| | |
23104// +-------------+ +----+ | |
23105// | | | |
23106// V V | |
23107// +-------------+ | |
23108// | AND | | |
23109// +-------------+ | |
23110// | | |
23111// +-----+ | |
23112// | | |
23113// V V V
23114// +-------------+
23115// | CMP |
23116// +-------------+
23117//
23118// The AND node may be safely removed for some combinations of inputs. In
23119// particular we need to take into account the extension type of the Input,
23120// the exact values of AddConstant, CompConstant, and CC, along with the nominal
23121 // width of the input (this can work for any width of input; the above graph is
23122 // specific to 8 bits).
23123//
23124// The specific equations were worked out by generating output tables for each
23125 // AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
23126// problem was simplified by working with 4 bit inputs, which means we only
23127 // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
23128 // extension (8 to 15), 8 patterns unique to sign extension (-8 to -1), and 8
23129 // patterns present in both extensions (0 to 7). For every distinct set of
23130 // AddConstant and CompConstant bit patterns we can consider the masked and
23131 // unmasked versions to be equivalent if the result of this function is true for
23132 // all 16 distinct bit patterns for the current extension type of Input (w0).
23133//
23134// sub w8, w0, w1
23135// and w10, w8, #0x0f
23136// cmp w8, w2
23137// cset w9, AArch64CC
23138// cmp w10, w2
23139// cset w11, AArch64CC
23140// cmp w9, w11
23141// cset w0, eq
23142// ret
23143//
23144 // Since the above function shows when the outputs are equivalent, it defines
23145 // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
23146 // would be expensive to run during compiles. The equations below were written
23147 // in a test harness that confirmed they gave outputs equivalent to the above
23148 // function for all inputs, so they can be used instead to determine whether
23149 // the removal is legal.
23150//
23151 // isEquivalentMaskless() is the test for whether the AND can be removed,
23152 // factored out of the DAG recognition because the DAG can take several forms.
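// As a concrete (illustrative) sanity check: with width == 8, a zero-extended
// Input in [0, 255] and AddConstant == 0, the subtraction leaves a value that
// already fits in 8 bits, so the AND with 0xff cannot change it; both compares
// against CompConstant then set identical flags, and the equations below
// return true for this case.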
23153
23154static bool isEquivalentMaskless(unsigned CC, unsigned width,
23155 ISD::LoadExtType ExtType, int AddConstant,
23156 int CompConstant) {
23157 // By being careful about our equations and only writing them in terms of
23158 // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
23159 // make them generally applicable to all bit widths.
23160 int MaxUInt = (1 << width);
23161
23162 // For the purposes of these comparisons sign extending the type is
23163 // equivalent to zero extending the add and displacing it by half the integer
23164 // width. Provided we are careful and make sure our equations are valid over
23165 // the whole range we can just adjust the input and avoid writing equations
23166 // for sign extended inputs.
23167 if (ExtType == ISD::SEXTLOAD)
23168 AddConstant -= (1 << (width-1));
23169
23170 switch(CC) {
23171 case AArch64CC::LE:
23172 case AArch64CC::GT:
23173 if ((AddConstant == 0) ||
23174 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
23175 (AddConstant >= 0 && CompConstant < 0) ||
23176 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
23177 return true;
23178 break;
23179 case AArch64CC::LT:
23180 case AArch64CC::GE:
23181 if ((AddConstant == 0) ||
23182 (AddConstant >= 0 && CompConstant <= 0) ||
23183 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
23184 return true;
23185 break;
23186 case AArch64CC::HI:
23187 case AArch64CC::LS:
23188 if ((AddConstant >= 0 && CompConstant < 0) ||
23189 (AddConstant <= 0 && CompConstant >= -1 &&
23190 CompConstant < AddConstant + MaxUInt))
23191 return true;
23192 break;
23193 case AArch64CC::PL:
23194 case AArch64CC::MI:
23195 if ((AddConstant == 0) ||
23196 (AddConstant > 0 && CompConstant <= 0) ||
23197 (AddConstant < 0 && CompConstant <= AddConstant))
23198 return true;
23199 break;
23200 case AArch64CC::LO:
23201 case AArch64CC::HS:
23202 if ((AddConstant >= 0 && CompConstant <= 0) ||
23203 (AddConstant <= 0 && CompConstant >= 0 &&
23204 CompConstant <= AddConstant + MaxUInt))
23205 return true;
23206 break;
23207 case AArch64CC::EQ:
23208 case AArch64CC::NE:
23209 if ((AddConstant > 0 && CompConstant < 0) ||
23210 (AddConstant < 0 && CompConstant >= 0 &&
23211 CompConstant < AddConstant + MaxUInt) ||
23212 (AddConstant >= 0 && CompConstant >= 0 &&
23213 CompConstant >= AddConstant) ||
23214 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
23215 return true;
23216 break;
23217 case AArch64CC::VS:
23218 case AArch64CC::VC:
23219 case AArch64CC::AL:
23220 case AArch64CC::NV:
23221 return true;
23222 case AArch64CC::Invalid:
23223 break;
23224 }
23225
23226 return false;
23227}
23228
23229 // (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
23230 // (X & C) <u Pow2 --> ((X & (C & ~(Pow2-1))) == 0)
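// E.g. (illustrative): "(x & 0xff0) >u 0xff" holds exactly when x has a bit
// set that is covered by 0xff0 but above bit 7, so it can be tested as a
// single ANDS of x with (0xff0 & ~0xff) == 0xf00 and a check for a non-zero
// result.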
23231 static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
23232 SDNode *AndNode, SelectionDAG &DAG,
23233 unsigned CCIndex, unsigned CmpIndex,
23234 unsigned CC) {
23235 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
23236 if (!SubsC)
23237 return SDValue();
23238
23239 APInt SubsAP = SubsC->getAPIntValue();
23240 if (CC == AArch64CC::HI) {
23241 if (!SubsAP.isMask())
23242 return SDValue();
23243 } else if (CC == AArch64CC::LO) {
23244 if (!SubsAP.isPowerOf2())
23245 return SDValue();
23246 } else
23247 return SDValue();
23248
23249 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
23250 if (!AndC)
23251 return SDValue();
23252
23253 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
23254
23255 SDLoc DL(N);
23256 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
23257 SDValue ANDS = DAG.getNode(
23258 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
23259 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
23260 SDValue AArch64_CC =
23261 DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
23262 N->getOperand(CCIndex)->getValueType(0));
23263
23264 // For now, only performCSELCombine and performBRCONDCombine call this
23265 // function, and both pass 2 for CCIndex and 3 for CmpIndex on nodes with 4
23266 // operands, so just initialize the operands directly to keep the code simple.
23267 // If a caller ever passes a different CCIndex or CmpIndex, this will need to
23268 // be rewritten to build the operand list with a loop.
23269 // TODO: Do we need to assert that the number of operands is 4 here?
23270 assert((CCIndex == 2 && CmpIndex == 3) &&
23271 "Expected CCIndex to be 2 and CmpIndex to be 3.");
23272 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
23273 ANDS.getValue(1)};
23274 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
23275}
23276
23277static
23278 SDValue performCONDCombine(SDNode *N,
23279 TargetLowering::DAGCombinerInfo &DCI,
23280 SelectionDAG &DAG, unsigned CCIndex,
23281 unsigned CmpIndex) {
23282 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
23283 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
23284 unsigned CondOpcode = SubsNode->getOpcode();
23285
23286 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
23287 !SubsNode->hasOneUse())
23288 return SDValue();
23289
23290 // There is a SUBS feeding this condition. Is it fed by a mask we can
23291 // use?
23292
23293 SDNode *AndNode = SubsNode->getOperand(0).getNode();
23294 unsigned MaskBits = 0;
23295
23296 if (AndNode->getOpcode() != ISD::AND)
23297 return SDValue();
23298
23299 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
23300 CmpIndex, CC))
23301 return Val;
23302
23303 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
23304 uint32_t CNV = CN->getZExtValue();
23305 if (CNV == 255)
23306 MaskBits = 8;
23307 else if (CNV == 65535)
23308 MaskBits = 16;
23309 }
23310
23311 if (!MaskBits)
23312 return SDValue();
23313
23314 SDValue AddValue = AndNode->getOperand(0);
23315
23316 if (AddValue.getOpcode() != ISD::ADD)
23317 return SDValue();
23318
23319 // The basic dag structure is correct, grab the inputs and validate them.
23320
23321 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
23322 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
23323 SDValue SubsInputValue = SubsNode->getOperand(1);
23324
23325 // The mask is present and the provenance of all the values is a smaller type,
23326 // so let's see if the mask is superfluous.
23327
23328 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
23329 !isa<ConstantSDNode>(SubsInputValue.getNode()))
23330 return SDValue();
23331
23332 ISD::LoadExtType ExtType;
23333
23334 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
23335 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
23336 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
23337 return SDValue();
23338
23339 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
23340 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
23341 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
23342 return SDValue();
23343
23344 // The AND is not necessary, remove it.
23345
23346 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
23347 SubsNode->getValueType(1));
23348 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
23349
23350 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
23351 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
23352
23353 return SDValue(N, 0);
23354}
23355
23356// Optimize compare with zero and branch.
23357 static SDValue performBRCONDCombine(SDNode *N,
23358 TargetLowering::DAGCombinerInfo &DCI,
23359 SelectionDAG &DAG) {
23360 MachineFunction &MF = DAG.getMachineFunction();
23361 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
23362 // will not be produced, as they are conditional branch instructions that do
23363 // not set flags.
23364 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
23365 return SDValue();
23366
23367 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
23368 N = NV.getNode();
23369 SDValue Chain = N->getOperand(0);
23370 SDValue Dest = N->getOperand(1);
23371 SDValue CCVal = N->getOperand(2);
23372 SDValue Cmp = N->getOperand(3);
23373
23374 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
23375 unsigned CC = CCVal->getAsZExtVal();
23376 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
23377 return SDValue();
23378
23379 unsigned CmpOpc = Cmp.getOpcode();
23380 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
23381 return SDValue();
23382
23383 // Only attempt folding if there is only one use of the flag and no use of the
23384 // value.
23385 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
23386 return SDValue();
23387
23388 SDValue LHS = Cmp.getOperand(0);
23389 SDValue RHS = Cmp.getOperand(1);
23390
23391 assert(LHS.getValueType() == RHS.getValueType() &&
23392 "Expected the value type to be the same for both operands!");
23393 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
23394 return SDValue();
23395
23396 if (isNullConstant(LHS))
23397 std::swap(LHS, RHS);
23398
23399 if (!isNullConstant(RHS))
23400 return SDValue();
23401
23402 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
23403 LHS.getOpcode() == ISD::SRL)
23404 return SDValue();
23405
23406 // Fold the compare into the branch instruction.
23407 SDValue BR;
23408 if (CC == AArch64CC::EQ)
23409 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
23410 else
23411 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
23412
23413 // Do not add new nodes to DAG combiner worklist.
23414 DCI.CombineTo(N, BR, false);
23415
23416 return SDValue();
23417}
23418
23419 static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
23420 unsigned CC = N->getConstantOperandVal(2);
23421 SDValue SUBS = N->getOperand(3);
23422 SDValue Zero, CTTZ;
23423
23424 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
23425 Zero = N->getOperand(0);
23426 CTTZ = N->getOperand(1);
23427 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
23428 Zero = N->getOperand(1);
23429 CTTZ = N->getOperand(0);
23430 } else
23431 return SDValue();
23432
23433 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
23434 (CTTZ.getOpcode() == ISD::TRUNCATE &&
23435 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
23436 return SDValue();
23437
23438 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
23439 "Illegal type in CTTZ folding");
23440
23441 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
23442 return SDValue();
23443
23444 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
23445 ? CTTZ.getOperand(0).getOperand(0)
23446 : CTTZ.getOperand(0);
23447
23448 if (X != SUBS.getOperand(0))
23449 return SDValue();
23450
23451 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
23452 ? CTTZ.getOperand(0).getValueSizeInBits()
23453 : CTTZ.getValueSizeInBits();
23454 SDValue BitWidthMinusOne =
23455 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
23456 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
23457 BitWidthMinusOne);
23458}
23459
23460// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
23461// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
23462// Where x and y are constants and x != y
23463
23464// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
23465// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
23466// Where x and y are constants and x != y
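// E.g. (illustrative): (CSEL l r EQ (CMP (CSEL 1 0 LT cond) 1)) folds to
// (CSEL l r LT cond), because the inner CSEL equals 1 exactly when LT held,
// so comparing it against 1 for equality just re-tests LT.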
23467 static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
23468 SDValue L = Op->getOperand(0);
23469 SDValue R = Op->getOperand(1);
23470 AArch64CC::CondCode OpCC =
23471 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
23472
23473 SDValue OpCmp = Op->getOperand(3);
23474 if (!isCMP(OpCmp))
23475 return SDValue();
23476
23477 SDValue CmpLHS = OpCmp.getOperand(0);
23478 SDValue CmpRHS = OpCmp.getOperand(1);
23479
23480 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
23481 std::swap(CmpLHS, CmpRHS);
23482 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
23483 return SDValue();
23484
23485 SDValue X = CmpLHS->getOperand(0);
23486 SDValue Y = CmpLHS->getOperand(1);
23487 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
23488 return SDValue();
23489 }
23490
23491 // If one of the constants is an opaque constant, the x and y SDNodes can still
23492 // be different even though the underlying values are the same, so compare the
23493 // APInt values here to make sure the code is correct.
23494 ConstantSDNode *CX = cast<ConstantSDNode>(X);
23495 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
23496 if (CX->getAPIntValue() == CY->getAPIntValue())
23497 return SDValue();
23498
23499 AArch64CC::CondCode CC =
23500 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
23501 SDValue Cond = CmpLHS->getOperand(3);
23502
23503 if (CmpRHS == Y)
23504 CC = AArch64CC::getInvertedCondCode(CC);
23505 else if (CmpRHS != X)
23506 return SDValue();
23507
23508 if (OpCC == AArch64CC::NE)
23509 CC = AArch64CC::getInvertedCondCode(CC);
23510 else if (OpCC != AArch64CC::EQ)
23511 return SDValue();
23512
23513 SDLoc DL(Op);
23514 EVT VT = Op->getValueType(0);
23515
23516 SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
23517 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
23518}
23519
23520// Optimize CSEL instructions
23521 static SDValue performCSELCombine(SDNode *N,
23522 TargetLowering::DAGCombinerInfo &DCI,
23523 SelectionDAG &DAG) {
23524 // CSEL x, x, cc -> x
23525 if (N->getOperand(0) == N->getOperand(1))
23526 return N->getOperand(0);
23527
23528 if (SDValue R = foldCSELOfCSEL(N, DAG))
23529 return R;
23530
23531 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
23532 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
23533 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
23534 return Folded;
23535
23536 return performCONDCombine(N, DCI, DAG, 2, 3);
23537}
23538
23539 // Try to re-use an already extended operand of a vector SetCC feeding an
23540// extended select. Doing so avoids requiring another full extension of the
23541// SET_CC result when lowering the select.
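// E.g. (illustrative): for a signed "setcc v8i8 a, splat(c)" whose only uses
// are v8i16 vselects, if a sign-extended copy of a to v8i16 already exists in
// the DAG, the compare is rebuilt on the v8i16 values (extending the splat to
// match), so the narrow compare result never has to be widened for the select.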
23542 static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
23543 EVT Op0MVT = Op->getOperand(0).getValueType();
23544 if (!Op0MVT.isVector() || Op->use_empty())
23545 return SDValue();
23546
23547 // Make sure that all uses of Op are VSELECTs with result matching types where
23548 // the result type has a larger element type than the SetCC operand.
23549 SDNode *FirstUse = *Op->use_begin();
23550 if (FirstUse->getOpcode() != ISD::VSELECT)
23551 return SDValue();
23552 EVT UseMVT = FirstUse->getValueType(0);
23553 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
23554 return SDValue();
23555 if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
23556 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
23557 }))
23558 return SDValue();
23559
23560 APInt V;
23561 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
23562 return SDValue();
23563
23564 SDLoc DL(Op);
23565 SDValue Op0ExtV;
23566 SDValue Op1ExtV;
23567 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
23568 // Check if the first operand of the SET_CC is already extended. If it is,
23569 // split the SET_CC and re-use the extended version of the operand.
23570 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
23571 Op->getOperand(0));
23572 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
23573 Op->getOperand(0));
23574 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23575 Op0ExtV = SDValue(Op0SExt, 0);
23576 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
23577 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23578 Op0ExtV = SDValue(Op0ZExt, 0);
23579 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
23580 } else
23581 return SDValue();
23582
23583 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
23584 Op0ExtV, Op1ExtV, Op->getOperand(2));
23585}
23586
23587static SDValue
23589 SelectionDAG &DAG) {
23590 SDValue Vec = N->getOperand(0);
23591 if (DCI.isBeforeLegalize() &&
23592 Vec.getValueType().getVectorElementType() == MVT::i1 &&
23595 SDLoc DL(N);
23596 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
23597 DAG);
23598 }
23599
23600 return SDValue();
23601}
23602
23603 static SDValue performSETCCCombine(SDNode *N,
23604 TargetLowering::DAGCombinerInfo &DCI,
23605 SelectionDAG &DAG) {
23606 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
23607 SDValue LHS = N->getOperand(0);
23608 SDValue RHS = N->getOperand(1);
23609 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
23610 SDLoc DL(N);
23611 EVT VT = N->getValueType(0);
23612
23613 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
23614 return V;
23615
23616 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
23617 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
23618 LHS->getOpcode() == AArch64ISD::CSEL &&
23619 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
23620 LHS->hasOneUse()) {
23621 // Invert CSEL's condition.
23622 auto OldCond =
23623 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
23624 auto NewCond = getInvertedCondCode(OldCond);
23625
23626 // csel 0, 1, !cond, X
23627 SDValue CSEL =
23628 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
23629 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
23630 LHS.getOperand(3));
23631 return DAG.getZExtOrTrunc(CSEL, DL, VT);
23632 }
23633
23634 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
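// E.g. (illustrative): for i32, "(x >> 4) != 0" becomes
// "(x & 0xfffffff0) != 0", which emitComparison can select as a single TST.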
23635 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
23636 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
23637 LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() &&
23638 LHS->hasOneUse()) {
23639 EVT TstVT = LHS->getValueType(0);
23640 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
23641 // This pattern is optimized better by emitComparison.
23642 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
23643 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
23644 DAG.getConstant(TstImm, DL, TstVT));
23645 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
23646 }
23647 }
23648
23649 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
23650 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
23651 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
23652 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
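// E.g. (illustrative): "(i8 (bitcast (v8i1 X))) == 0" holds exactly when no
// lane of X is set, so it is re-expressed as a vecreduce_or of X (zero
// extended to i8) compared against 0.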
23653 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
23654 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
23656 LHS->getOpcode() == ISD::BITCAST) {
23657 EVT ToVT = LHS->getValueType(0);
23658 EVT FromVT = LHS->getOperand(0).getValueType();
23659 if (FromVT.isFixedLengthVector() &&
23660 FromVT.getVectorElementType() == MVT::i1) {
23661 bool IsNull = isNullConstant(RHS);
23662 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
23663 DL, MVT::i1, LHS->getOperand(0));
23664 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
23665 LHS);
23666 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
23667 }
23668 }
23669
23670 // Try to perform the memcmp when the result is tested for [in]equality with 0
23671 if (SDValue V = performOrXorChainCombine(N, DAG))
23672 return V;
23673
23674 return SDValue();
23675}
23676
23677// Replace a flag-setting operator (eg ANDS) with the generic version
23678// (eg AND) if the flag is unused.
23681 unsigned GenericOpcode) {
23682 SDLoc DL(N);
23683 SDValue LHS = N->getOperand(0);
23684 SDValue RHS = N->getOperand(1);
23685 EVT VT = N->getValueType(0);
23686
23687 // If the flag result isn't used, convert back to a generic opcode.
23688 if (!N->hasAnyUseOfValue(1)) {
23689 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
23690 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
23691 DL);
23692 }
23693
23694 // Combine identical generic nodes into this node, re-using the result.
23695 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
23696 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
23697 DCI.CombineTo(Generic, SDValue(N, 0));
23698
23699 return SDValue();
23700}
23701
23702 static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
23703 // setcc_merge_zero pred
23704 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
23705 // => extract_subvector (inner setcc_merge_zero)
23706 SDValue Pred = N->getOperand(0);
23707 SDValue LHS = N->getOperand(1);
23708 SDValue RHS = N->getOperand(2);
23709 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23710
23711 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
23712 LHS->getOpcode() != ISD::SIGN_EXTEND)
23713 return SDValue();
23714
23715 SDValue Extract = LHS->getOperand(0);
23716 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
23717 Extract->getValueType(0) != N->getValueType(0) ||
23718 Extract->getConstantOperandVal(1) != 0)
23719 return SDValue();
23720
23721 SDValue InnerSetCC = Extract->getOperand(0);
23722 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
23723 return SDValue();
23724
23725 // By this point we've effectively got
23726 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
23727 // lanes are already zero then the trunc(sext()) sequence is redundant and we
23728 // can operate on A directly.
23729 SDValue InnerPred = InnerSetCC.getOperand(0);
23730 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
23731 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
23732 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
23733 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
23734 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
23735 return Extract;
23736
23737 return SDValue();
23738}
23739
23740static SDValue
23741 performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
23742 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23743 "Unexpected opcode!");
23744
23745 SelectionDAG &DAG = DCI.DAG;
23746 SDValue Pred = N->getOperand(0);
23747 SDValue LHS = N->getOperand(1);
23748 SDValue RHS = N->getOperand(2);
23749 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23750
23751 if (SDValue V = performSetCCPunpkCombine(N, DAG))
23752 return V;
23753
23754 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
23755 LHS->getOpcode() == ISD::SIGN_EXTEND &&
23756 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
23757 // setcc_merge_zero(
23758 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
23759 // => setcc_merge_zero(pred, ...)
23760 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23761 LHS->getOperand(0)->getOperand(0) == Pred)
23762 return LHS->getOperand(0);
23763
23764 // setcc_merge_zero(
23765 // all_active, extend(nxvNi1 ...), != splat(0))
23766 // -> nxvNi1 ...
23767 if (isAllActivePredicate(DAG, Pred))
23768 return LHS->getOperand(0);
23769
23770 // setcc_merge_zero(
23771 // pred, extend(nxvNi1 ...), != splat(0))
23772 // -> nxvNi1 and(pred, ...)
23773 if (DCI.isAfterLegalizeDAG())
23774 // Do this after legalization to allow more folds on setcc_merge_zero
23775 // to be recognized.
23776 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
23777 LHS->getOperand(0), Pred);
23778 }
23779
23780 return SDValue();
23781}
23782
23783// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
23784// as well as whether the test should be inverted. This code is required to
23785// catch these cases (as opposed to standard dag combines) because
23786// AArch64ISD::TBZ is matched during legalization.
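// For example (illustrative), "tbz (and x, 0x0c), #3, label" tests a bit that
// the mask preserves, so it is rewritten below as "tbz x, #3, label".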
23787static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
23788 SelectionDAG &DAG) {
23789
23790 if (!Op->hasOneUse())
23791 return Op;
23792
23793 // We don't handle undef/constant-fold cases below, as they should have
23794 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
23795 // etc.)
23796
23797 // (tbz (trunc x), b) -> (tbz x, b)
23798 // This case is just here to enable more of the below cases to be caught.
23799 if (Op->getOpcode() == ISD::TRUNCATE &&
23800 Bit < Op->getValueType(0).getSizeInBits()) {
23801 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23802 }
23803
23804 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
23805 if (Op->getOpcode() == ISD::ANY_EXTEND &&
23806 Bit < Op->getOperand(0).getValueSizeInBits()) {
23807 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23808 }
23809
23810 if (Op->getNumOperands() != 2)
23811 return Op;
23812
23813 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
23814 if (!C)
23815 return Op;
23816
23817 switch (Op->getOpcode()) {
23818 default:
23819 return Op;
23820
23821 // (tbz (and x, m), b) -> (tbz x, b)
23822 case ISD::AND:
23823 if ((C->getZExtValue() >> Bit) & 1)
23824 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23825 return Op;
23826
23827 // (tbz (shl x, c), b) -> (tbz x, b-c)
23828 case ISD::SHL:
23829 if (C->getZExtValue() <= Bit &&
23830 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23831 Bit = Bit - C->getZExtValue();
23832 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23833 }
23834 return Op;
23835
23836 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
23837 case ISD::SRA:
23838 Bit = Bit + C->getZExtValue();
23839 if (Bit >= Op->getValueType(0).getSizeInBits())
23840 Bit = Op->getValueType(0).getSizeInBits() - 1;
23841 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23842
23843 // (tbz (srl x, c), b) -> (tbz x, b+c)
23844 case ISD::SRL:
23845 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23846 Bit = Bit + C->getZExtValue();
23847 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23848 }
23849 return Op;
23850
23851 // (tbz (xor x, -1), b) -> (tbnz x, b)
23852 case ISD::XOR:
23853 if ((C->getZExtValue() >> Bit) & 1)
23854 Invert = !Invert;
23855 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23856 }
23857}
23858
23859// Optimize test single bit zero/non-zero and branch.
23860 static SDValue performTBZCombine(SDNode *N,
23861 TargetLowering::DAGCombinerInfo &DCI,
23862 SelectionDAG &DAG) {
23863 unsigned Bit = N->getConstantOperandVal(2);
23864 bool Invert = false;
23865 SDValue TestSrc = N->getOperand(1);
23866 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
23867
23868 if (TestSrc == NewTestSrc)
23869 return SDValue();
23870
23871 unsigned NewOpc = N->getOpcode();
23872 if (Invert) {
23873 if (NewOpc == AArch64ISD::TBZ)
23874 NewOpc = AArch64ISD::TBNZ;
23875 else {
23876 assert(NewOpc == AArch64ISD::TBNZ);
23877 NewOpc = AArch64ISD::TBZ;
23878 }
23879 }
23880
23881 SDLoc DL(N);
23882 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
23883 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
23884}
23885
23886 // Swap vselect operands where doing so may allow a predicated operation to
23887 // also perform the `sel`.
23888//
23889// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
23890// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
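// E.g. (illustrative): (vselect (setcc slt x y) (a) (fadd (a) (b))) becomes
// (vselect (setcc sge x y) (fadd (a) (b)) (a)), which can then be matched as
// a merging predicated FADD where inactive lanes keep the value of a.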
23891 static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
23892 auto SelectA = N->getOperand(1);
23893 auto SelectB = N->getOperand(2);
23894 auto NTy = N->getValueType(0);
23895
23896 if (!NTy.isScalableVector())
23897 return SDValue();
23898 SDValue SetCC = N->getOperand(0);
23899 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
23900 return SDValue();
23901
23902 switch (SelectB.getOpcode()) {
23903 default:
23904 return SDValue();
23905 case ISD::FMUL:
23906 case ISD::FSUB:
23907 case ISD::FADD:
23908 break;
23909 }
23910 if (SelectA != SelectB.getOperand(0))
23911 return SDValue();
23912
23913 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
23914 ISD::CondCode InverseCC =
23915 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
23916 auto InverseSetCC =
23917 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
23918 SetCC.getOperand(1), InverseCC);
23919
23920 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
23921 {InverseSetCC, SelectB, SelectA});
23922}
23923
23924// vselect (v1i1 setcc) ->
23925// vselect (v1iXX setcc) (XX is the size of the compared operand type)
23926// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
23927 // condition. If it could legalize "VSELECT v1i1" correctly, there would be no
23928 // need to combine such a VSELECT.
23930 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
23931 return SwapResult;
23932
23933 SDValue N0 = N->getOperand(0);
23934 EVT CCVT = N0.getValueType();
23935
23936 if (isAllActivePredicate(DAG, N0))
23937 return N->getOperand(1);
23938
23939 if (isAllInactivePredicate(N0))
23940 return N->getOperand(2);
23941
23942 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
23943 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
23944 // supported types.
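// E.g. (illustrative) for v4i32: (vselect (setgt x, splat(-1)), splat(1),
// splat(-1)) selects +1 for non-negative lanes and -1 for negative lanes,
// which is exactly (or (sra x, 31), splat(1)): the shift produces 0 or -1 per
// lane, and or-ing in 1 maps those to +1 and -1 respectively.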
23945 SDValue SetCC = N->getOperand(0);
23946 if (SetCC.getOpcode() == ISD::SETCC &&
23947 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
23948 SDValue CmpLHS = SetCC.getOperand(0);
23949 EVT VT = CmpLHS.getValueType();
23950 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
23951 SDNode *SplatLHS = N->getOperand(1).getNode();
23952 SDNode *SplatRHS = N->getOperand(2).getNode();
23953 APInt SplatLHSVal;
23954 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
23955 VT.isSimple() &&
23956 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
23957 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
23958 VT.getSimpleVT().SimpleTy) &&
23959 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
23960 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
23962 unsigned NumElts = VT.getVectorNumElements();
23964 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
23965 VT.getScalarType()));
23966 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
23967
23968 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
23969 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
23970 return Or;
23971 }
23972 }
23973
23974 EVT CmpVT = N0.getOperand(0).getValueType();
23975 if (N0.getOpcode() != ISD::SETCC ||
23977 CCVT.getVectorElementType() != MVT::i1 ||
23979 return SDValue();
23980
23981 EVT ResVT = N->getValueType(0);
23982 // Only combine when the result type is of the same size as the compared
23983 // operands.
23984 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
23985 return SDValue();
23986
23987 SDValue IfTrue = N->getOperand(1);
23988 SDValue IfFalse = N->getOperand(2);
23989 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
23990 N0.getOperand(0), N0.getOperand(1),
23991 cast<CondCodeSDNode>(N0.getOperand(2))->get());
23992 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
23993 IfTrue, IfFalse);
23994}
23995
23996/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
23997/// the compare-mask instructions rather than going via NZCV, even if LHS and
23998/// RHS are really scalar. This replaces any scalar setcc in the above pattern
23999/// with a vector one followed by a DUP shuffle on the result.
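/// For example (illustrative), for "select (setcc f64 a, b), v2f64 x, v2f64 y"
/// the scalars a and b are moved into vector lanes, compared with a vector
/// compare, and lane 0 of the resulting mask is duplicated across the vector
/// to drive the select, avoiding a transfer of the scalar NZCV-based result
/// into a vector mask.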
24000 static SDValue performSelectCombine(SDNode *N,
24001 TargetLowering::DAGCombinerInfo &DCI) {
24002 SelectionDAG &DAG = DCI.DAG;
24003 SDValue N0 = N->getOperand(0);
24004 EVT ResVT = N->getValueType(0);
24005
24006 if (N0.getOpcode() != ISD::SETCC)
24007 return SDValue();
24008
24009 if (ResVT.isScalableVT())
24010 return SDValue();
24011
24012 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
24013 // scalar SetCCResultType. We also don't expect vectors, because we assume
24014 // that selects fed by vector SETCCs are canonicalized to VSELECT.
24015 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
24016 "Scalar-SETCC feeding SELECT has unexpected result type!");
24017
24018 // If NumMaskElts == 0, the comparison is larger than the select result. The
24019 // largest real NEON comparison is 64 bits per lane, which means the result is
24020 // at most 32 bits and an illegal vector. Just bail out for now.
24021 EVT SrcVT = N0.getOperand(0).getValueType();
24022
24023 // Don't try to do this optimization when the setcc itself has i1 operands.
24024 // There are no legal vectors of i1, so this would be pointless. v1f16 is
24025 // ruled out to prevent the creation of setcc that need to be scalarized.
24026 if (SrcVT == MVT::i1 ||
24027 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
24028 return SDValue();
24029
24030 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
24031 if (!ResVT.isVector() || NumMaskElts == 0)
24032 return SDValue();
24033
24034 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
24036
24037 // Also bail out if the vector CCVT isn't the same size as ResVT.
24038 // This can happen if the SETCC operand size doesn't divide the ResVT size
24039 // (e.g., f64 vs v3f32).
24040 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
24041 return SDValue();
24042
24043 // Make sure we didn't create illegal types, if we're not supposed to.
24044 assert(DCI.isBeforeLegalize() ||
24045 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
24046
24047 // First perform a vector comparison, where lane 0 is the one we're interested
24048 // in.
24049 SDLoc DL(N0);
24050 SDValue LHS =
24051 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
24052 SDValue RHS =
24053 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
24054 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
24055
24056 // Now duplicate the comparison mask we want across all other lanes.
24057 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
24058 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
24059 Mask = DAG.getNode(ISD::BITCAST, DL,
24060 ResVT.changeVectorElementTypeToInteger(), Mask);
24061
24062 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
24063}
24064
24067 EVT VT = N->getValueType(0);
24068 SDLoc DL(N);
24069 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
24070 // 128-bit vector version.
24071 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
24073 SmallVector<SDValue> Ops(N->ops());
24074 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
24075 DCI.DAG.getVTList(LVT), Ops)) {
24076 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
24077 DCI.DAG.getConstant(0, DL, MVT::i64));
24078 }
24079 }
24080
24081 if (N->getOpcode() == AArch64ISD::DUP) {
24082 if (DCI.isAfterLegalizeDAG()) {
24083 // If scalar dup's operand is extract_vector_elt, try to combine them into
24084 // duplane. For example,
24085 //
24086 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
24087 // t18: v4i32 = AArch64ISD::DUP t21
24088 // ==>
24089 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
24090 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
24091 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24092 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
24093 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
24094 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
24095 EXTRACT_VEC_ELT.getOperand(1));
24096 }
24097 }
24098 }
24099
24100 return performPostLD1Combine(N, DCI, false);
24101 }
24102
24103 return SDValue();
24104}
24105
24106/// Get rid of unnecessary NVCASTs (that don't change the type).
24107 static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
24108 if (N->getValueType(0) == N->getOperand(0).getValueType())
24109 return N->getOperand(0);
24110 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
24111 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
24112 N->getOperand(0).getOperand(0));
24113
24114 return SDValue();
24115}
24116
24117// If all users of the globaladdr are of the form (globaladdr + constant), find
24118// the smallest constant, fold it into the globaladdr's offset and rewrite the
24119// globaladdr as (globaladdr + constant) - constant.
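// For example (illustrative), if g is only used as (g + 8) and (g + 40), the
// node is rewritten as ((g + 8) - 8); the uses then simplify to (g + 8) + 0
// and (g + 8) + 32, so the +8 can be materialised as part of the address of
// the global itself.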
24120 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
24121 const AArch64Subtarget *Subtarget,
24122 const TargetMachine &TM) {
24123 auto *GN = cast<GlobalAddressSDNode>(N);
24124 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
24126 return SDValue();
24127
24128 uint64_t MinOffset = -1ull;
24129 for (SDNode *N : GN->uses()) {
24130 if (N->getOpcode() != ISD::ADD)
24131 return SDValue();
24132 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
24133 if (!C)
24134 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
24135 if (!C)
24136 return SDValue();
24137 MinOffset = std::min(MinOffset, C->getZExtValue());
24138 }
24139 uint64_t Offset = MinOffset + GN->getOffset();
24140
24141 // Require that the new offset is larger than the existing one. Otherwise, we
24142 // can end up oscillating between two possible DAGs, for example,
24143 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
24144 if (Offset <= uint64_t(GN->getOffset()))
24145 return SDValue();
24146
24147 // Check whether folding this offset is legal. It must not go out of bounds of
24148 // the referenced object to avoid violating the code model, and must be
24149 // smaller than 2^20 because this is the largest offset expressible in all
24150 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
24151 // stores an immediate signed 21 bit offset.)
24152 //
24153 // This check also prevents us from folding negative offsets, which will end
24154 // up being treated in the same way as large positive ones. They could also
24155 // cause code model violations, and aren't really common enough to matter.
24156 if (Offset >= (1 << 20))
24157 return SDValue();
24158
24159 const GlobalValue *GV = GN->getGlobal();
24160 Type *T = GV->getValueType();
24161 if (!T->isSized() ||
24163 return SDValue();
24164
24165 SDLoc DL(GN);
24166 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
24167 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
24168 DAG.getConstant(MinOffset, DL, MVT::i64));
24169}
24170
24171 static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
24172 const AArch64Subtarget *Subtarget) {
24173 SDValue BR = N->getOperand(0);
24174 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
24175 !BR.getValueType().isScalarInteger())
24176 return SDValue();
24177
24178 SDLoc DL(N);
24179 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
24180}
24181
24182 // Turns the vector of indices into a vector of byte offsets by scaling Offset
24183// by (BitWidth / 8).
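// E.g. (illustrative) for 32-bit elements, an index vector splat(3) becomes
// the byte-offset vector splat(12) via a left shift by Log2(32 / 8) == 2.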
24184 static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
24185 SDLoc DL, unsigned BitWidth) {
24186 assert(Offset.getValueType().isScalableVector() &&
24187 "This method is only for scalable vectors of offsets");
24188
24189 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
24190 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
24191
24192 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
24193}
24194
24195/// Check if the value of \p OffsetInBytes can be used as an immediate for
24196/// the gather load/prefetch and scatter store instructions with vector base and
24197/// immediate offset addressing mode:
24198///
24199/// [<Zn>.[S|D]{, #<imm>}]
24200///
24201/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
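/// For example, for 32-bit elements (sizeof(<T>) == 4) the valid immediates
/// are 0, 4, 8, ..., 124.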
24202inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
24203 unsigned ScalarSizeInBytes) {
24204 // The immediate is not a multiple of the scalar size.
24205 if (OffsetInBytes % ScalarSizeInBytes)
24206 return false;
24207
24208 // The immediate is out of range.
24209 if (OffsetInBytes / ScalarSizeInBytes > 31)
24210 return false;
24211
24212 return true;
24213}
24214
24215/// Check if the value of \p Offset represents a valid immediate for the SVE
24216 /// gather load/prefetch and scatter store instructions with vector base and
24217/// immediate offset addressing mode:
24218///
24219/// [<Zn>.[S|D]{, #<imm>}]
24220///
24221/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
24222 static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
24223 unsigned ScalarSizeInBytes) {
24224 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
24225 return OffsetConst && isValidImmForSVEVecImmAddrMode(
24226 OffsetConst->getZExtValue(), ScalarSizeInBytes);
24227}
24228
24229 static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
24230 unsigned Opcode,
24231 bool OnlyPackedOffsets = true) {
24232 const SDValue Src = N->getOperand(2);
24233 const EVT SrcVT = Src->getValueType(0);
24234 assert(SrcVT.isScalableVector() &&
24235 "Scatter stores are only possible for SVE vectors");
24236
24237 SDLoc DL(N);
24238 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
24239
24240 // Make sure that source data will fit into an SVE register
24242 return SDValue();
24243
24244 // For FPs, ACLE only supports _packed_ single and double precision types.
24245 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
24246 if (SrcElVT.isFloatingPoint())
24247 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
24248 ((Opcode != AArch64ISD::SST1Q_PRED &&
24249 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
24250 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
24251 return SDValue();
24252
24253 // Depending on the addressing mode, this is either a pointer or a vector of
24254 // pointers (that fits into one register)
24255 SDValue Base = N->getOperand(4);
24256 // Depending on the addressing mode, this is either a single offset or a
24257 // vector of offsets (that fits into one register)
24258 SDValue Offset = N->getOperand(5);
24259
24260 // For "scalar + vector of indices", just scale the indices. This only
24261 // applies to non-temporal scatters because there's no instruction that takes
24262 // indices.
24263 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
24264 Offset =
24266 Opcode = AArch64ISD::SSTNT1_PRED;
24267 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
24268 Offset =
24270 Opcode = AArch64ISD::SST1Q_PRED;
24271 }
24272
24273 // In the case of non-temporal scatter stores there's only one SVE instruction
24274 // per data-size, using "vector + scalar" addressing, i.e.
24275 // * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
24276 // Since we do have intrinsics that allow the arguments to be in a different
24277 // order, we may need to swap them to match the spec.
24278 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
24279 Offset.getValueType().isVector())
24281
24282 // SST1_IMM requires that the offset is an immediate that is:
24283 // * a multiple of #SizeInBytes,
24284 // * in the range [0, 31 x #SizeInBytes],
24285 // where #SizeInBytes is the size in bytes of the stored items. For
24286 // immediates outside that range and non-immediate scalar offsets use SST1 or
24287 // SST1_UXTW instead.
24288 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
24290 SrcVT.getScalarSizeInBits() / 8)) {
24291 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
24293 else
24294 Opcode = AArch64ISD::SST1_PRED;
24295
24297 }
24298 }
24299
24300 auto &TLI = DAG.getTargetLoweringInfo();
24301 if (!TLI.isTypeLegal(Base.getValueType()))
24302 return SDValue();
24303
24304 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
24305 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
24306 // nxv2i64. Legalize accordingly.
24307 if (!OnlyPackedOffsets &&
24308 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
24309 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
24310
24311 if (!TLI.isTypeLegal(Offset.getValueType()))
24312 return SDValue();
24313
24314 // Source value type that is representable in hardware
24315 EVT HwSrcVt = getSVEContainerType(SrcVT);
24316
24317 // Keep the original type of the input data to store - this is needed to be
24318 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
24319 // FP values we want the integer equivalent, so just use HwSrcVt.
24320 SDValue InputVT = DAG.getValueType(SrcVT);
24321 if (SrcVT.isFloatingPoint())
24322 InputVT = DAG.getValueType(HwSrcVt);
24323
24324 SDVTList VTs = DAG.getVTList(MVT::Other);
24325 SDValue SrcNew;
24326
24327 if (Src.getValueType().isFloatingPoint())
24328 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
24329 else
24330 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
24331
24332 SDValue Ops[] = {N->getOperand(0), // Chain
24333 SrcNew,
24334 N->getOperand(3), // Pg
24335 Base,
24336 Offset,
24337 InputVT};
24338
24339 return DAG.getNode(Opcode, DL, VTs, Ops);
24340}
24341
24342 static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
24343 unsigned Opcode,
24344 bool OnlyPackedOffsets = true) {
24345 const EVT RetVT = N->getValueType(0);
24346 assert(RetVT.isScalableVector() &&
24347 "Gather loads are only possible for SVE vectors");
24348
24349 SDLoc DL(N);
24350
24351 // Make sure that the loaded data will fit into an SVE register
24353 return SDValue();
24354
24355 // Depending on the addressing mode, this is either a pointer or a vector of
24356 // pointers (that fits into one register)
24357 SDValue Base = N->getOperand(3);
24358 // Depending on the addressing mode, this is either a single offset or a
24359 // vector of offsets (that fits into one register)
24360 SDValue Offset = N->getOperand(4);
24361
24362 // For "scalar + vector of indices", scale the indices to obtain unscaled
24363 // offsets. This applies to non-temporal and quadword gathers, which do not
24364 // have an addressing mode with scaled offset.
24367 RetVT.getScalarSizeInBits());
24369 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
24371 RetVT.getScalarSizeInBits());
24373 }
24374
24375 // In the case of non-temporal gather loads and quadword gather loads there's
24376 // only one addressing mode: "vector + scalar", e.g.
24377 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
24378 // Since we do have intrinsics that allow the arguments to be in a different
24379 // order, we may need to swap them to match the spec.
24380 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
24381 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
24382 Offset.getValueType().isVector())
24383 std::swap(Base, Offset);
24384
24385 // GLD{FF}1_IMM requires that the offset is an immediate that is:
24386 // * a multiple of #SizeInBytes,
24387 // * in the range [0, 31 x #SizeInBytes],
24388 // where #SizeInBytes is the size in bytes of the loaded items. For
24389 // immediates outside that range and non-immediate scalar offsets use
24390 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
24391 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
24392 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
24393 if (!isValidImmForSVEVecImmAddrMode(Offset,
24394 RetVT.getScalarSizeInBits() / 8)) {
24395 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
24396 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
24397 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
24398 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
24399 else
24400 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
24401 ? AArch64ISD::GLD1_MERGE_ZERO
24402 : AArch64ISD::GLDFF1_MERGE_ZERO;
24403
24404 std::swap(Base, Offset);
24405 }
24406 }
24407
24408 auto &TLI = DAG.getTargetLoweringInfo();
24409 if (!TLI.isTypeLegal(Base.getValueType()))
24410 return SDValue();
24411
24412 // Some gather load variants allow unpacked offsets, but only as nxv2i32
24413 // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
24414 // nxv2i64. Legalize accordingly.
24415 if (!OnlyPackedOffsets &&
24416 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
24417 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
24418
24419 // Return value type that is representable in hardware
24420 EVT HwRetVt = getSVEContainerType(RetVT);
24421
24422 // Keep the original output value type around - this is needed to be able to
24423 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
24424 // values we want the integer equivalent, so just use HwRetVT.
24425 SDValue OutVT = DAG.getValueType(RetVT);
24426 if (RetVT.isFloatingPoint())
24427 OutVT = DAG.getValueType(HwRetVt);
24428
24429 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
24430 SDValue Ops[] = {N->getOperand(0), // Chain
24431 N->getOperand(2), // Pg
24432 Base, Offset, OutVT};
24433
24434 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
24435 SDValue LoadChain = SDValue(Load.getNode(), 1);
24436
24437 if (RetVT.isInteger() && (RetVT != HwRetVt))
24438 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
24439
24440 // If the original return value was FP, bitcast accordingly. Doing it here
24441 // means that we can avoid adding TableGen patterns for FPs.
24442 if (RetVT.isFloatingPoint())
24443 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
24444
24445 return DAG.getMergeValues({Load, LoadChain}, DL);
24446}
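// Example of the container-type handling above: a gather whose result type is
// nxv2i16 is emitted as a GLD1-style node producing the container type
// nxv2i64 and then truncated back to nxv2i16; FP results are loaded on the
// integer container type and bitcast afterwards, which is what lets us avoid
// extra TableGen patterns for the FP cases.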
24447
24448static SDValue
24449 performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24450 SelectionDAG &DAG) {
24451 SDLoc DL(N);
24452 SDValue Src = N->getOperand(0);
24453 unsigned Opc = Src->getOpcode();
24454
24455 // Sign extend of an unsigned unpack -> signed unpack
24456 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
24457
24458 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
24459 : AArch64ISD::SUNPKLO;
24460
24461 // Push the sign extend to the operand of the unpack
24462 // This is necessary where, for example, the operand of the unpack
24463 // is another unpack:
24464 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
24465 // ->
24466 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
24467 // ->
24468 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
24469 SDValue ExtOp = Src->getOperand(0);
24470 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
24471 EVT EltTy = VT.getVectorElementType();
24472 (void)EltTy;
24473
24474 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
24475 "Sign extending from an invalid type");
24476
24477 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
24478
24479 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
24480 ExtOp, DAG.getValueType(ExtVT));
24481
24482 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
24483 }
24484
24485 if (DCI.isBeforeLegalizeOps())
24486 return SDValue();
24487
24488 if (!EnableCombineMGatherIntrinsics)
24489 return SDValue();
24490
24491 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
24492 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
24493 unsigned NewOpc;
24494 unsigned MemVTOpNum = 4;
24495 switch (Opc) {
24498 MemVTOpNum = 3;
24499 break;
24502 MemVTOpNum = 3;
24503 break;
24506 MemVTOpNum = 3;
24507 break;
24510 break;
24513 break;
24516 break;
24519 break;
24522 break;
24525 break;
24528 break;
24531 break;
24534 break;
24537 break;
24540 break;
24543 break;
24546 break;
24549 break;
24552 break;
24553 default:
24554 return SDValue();
24555 }
24556
24557 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
24558 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
24559
24560 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
24561 return SDValue();
24562
24563 EVT DstVT = N->getValueType(0);
24564 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
24565
24566 SmallVector<SDValue, 5> Ops;
24567 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
24568 Ops.push_back(Src->getOperand(I));
24569
24570 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
24571 DCI.CombineTo(N, ExtLoad);
24572 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
24573
24574 // Return N so it doesn't get rechecked
24575 return SDValue(N, 0);
24576}
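// Net effect: a sign_extend_inreg whose operand is one of the SVE loads listed
// above is folded into the load itself by switching to the corresponding
// signed-extending opcode (the GLD1S/LD1S-style nodes), provided the
// sign-extended-from type matches the load's memory type and the load has a
// single use.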
24577
24578/// Legalize the gather prefetch (scalar + vector addressing mode) when the
24579/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
24580/// != nxv2i32) do not need legalization.
24582 const unsigned OffsetPos = 4;
24583 SDValue Offset = N->getOperand(OffsetPos);
24584
24585 // Not an unpacked vector, bail out.
24586 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
24587 return SDValue();
24588
24589 // Extend the unpacked offset vector to 64-bit lanes.
24590 SDLoc DL(N);
24591 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
24592 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24593 // Replace the offset operand with the 64-bit one.
24594 Ops[OffsetPos] = Offset;
24595
24596 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24597}
24598
24599/// Combines a node carrying the intrinsic
24600/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
24601/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
24602/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
24603/// sve gather prefetch instruction with vector plus immediate addressing mode.
24605 unsigned ScalarSizeInBytes) {
24606 const unsigned ImmPos = 4, OffsetPos = 3;
24607 // No need to combine the node if the immediate is valid...
24608 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
24609 return SDValue();
24610
24611 // ...otherwise swap the offset base with the offset...
24612 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24613 std::swap(Ops[ImmPos], Ops[OffsetPos]);
24614 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
24615 // `aarch64_sve_prfb_gather_uxtw_index`.
24616 SDLoc DL(N);
24617 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
24618 MVT::i64);
24619
24620 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24621}
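// For example: if the scalar offset of an aarch64_sve_prfw_gather_scalar_offset
// node fails the isValidImmForSVEVecImmAddrMode check above, its base and
// offset operands are swapped and the node is retargeted at
// aarch64_sve_prfb_gather_uxtw_index, which takes the offset in a register
// rather than as an immediate.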
24622
24623// Return true if the vector operation can guarantee only the first lane of its
24624// result contains data, with all bits in other lanes set to zero.
24626 switch (Op.getOpcode()) {
24627 default:
24628 return false;
24644 return true;
24645 }
24646}
24647
24648 static SDValue removeRedundantInsertVectorElt(SDNode *N) {
24649 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
24650 SDValue InsertVec = N->getOperand(0);
24651 SDValue InsertElt = N->getOperand(1);
24652 SDValue InsertIdx = N->getOperand(2);
24653
24654 // We only care about inserts into the first element...
24655 if (!isNullConstant(InsertIdx))
24656 return SDValue();
24657 // ...of a zero'd vector...
24658 if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
24659 return SDValue();
24660 // ...where the inserted data was previously extracted...
24661 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24662 return SDValue();
24663
24664 SDValue ExtractVec = InsertElt.getOperand(0);
24665 SDValue ExtractIdx = InsertElt.getOperand(1);
24666
24667 // ...from the first element of a vector.
24668 if (!isNullConstant(ExtractIdx))
24669 return SDValue();
24670
24671 // If we get here we are effectively trying to zero lanes 1-N of a vector.
24672
24673 // Ensure there's no type conversion going on.
24674 if (N->getValueType(0) != ExtractVec.getValueType())
24675 return SDValue();
24676
24677 if (!isLanes1toNKnownZero(ExtractVec))
24678 return SDValue();
24679
24680 // The explicit zeroing is redundant.
24681 return ExtractVec;
24682}
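// In other words, the pattern recognised here is
//   insert_vector_elt (all-zeros vector), (extract_vector_elt X, 0), 0
// which is just X whenever isLanes1toNKnownZero proves that every lane of X
// other than lane 0 is already zero.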
24683
24684static SDValue
24685 performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
24686 if (SDValue Res = removeRedundantInsertVectorElt(N))
24687 return Res;
24688
24689 return performPostLD1Combine(N, DCI, true);
24690}
24691
24692 static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
24693 TargetLowering::DAGCombinerInfo &DCI,
24694 const AArch64Subtarget *Subtarget) {
24695 SDValue N0 = N->getOperand(0);
24696 EVT VT = N->getValueType(0);
24697
24698 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
24699 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
24700 return SDValue();
24701
24702 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
24703 EVT EltVT = VT.getVectorElementType();
24704 return EltVT == MVT::f32 || EltVT == MVT::f64;
24705 };
24706
24707 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
24708 // We purposefully don't care about legality of the nodes here as we know
24709 // they can be split down into something legal.
24710 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
24711 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
24712 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
24713 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
24714 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
24715 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
24716 LN0->getChain(), LN0->getBasePtr(),
24717 N0.getValueType(), LN0->getMemOperand());
24718 DCI.CombineTo(N, ExtLoad);
24719 DCI.CombineTo(
24720 N0.getNode(),
24721 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
24722 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
24723 ExtLoad.getValue(1));
24724 return SDValue(N, 0); // Return N so it doesn't get rechecked!
24725 }
24726
24727 return SDValue();
24728}
24729
24730 static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
24731 const AArch64Subtarget *Subtarget) {
24732 EVT VT = N->getValueType(0);
24733
24734 // Don't expand for NEON, SVE2 or SME
24735 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
24736 return SDValue();
24737
24738 SDLoc DL(N);
24739
24740 SDValue Mask = N->getOperand(0);
24741 SDValue In1 = N->getOperand(1);
24742 SDValue In2 = N->getOperand(2);
24743
24744 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
24745 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
24746 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
24747 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
24748}
24749
24750 static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
24751 EVT VT = N->getValueType(0);
24752
24753 SDValue Insert = N->getOperand(0);
24754 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
24755 return SDValue();
24756
24757 if (!Insert.getOperand(0).isUndef())
24758 return SDValue();
24759
24760 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
24761 uint64_t IdxDupLane = N->getConstantOperandVal(1);
24762 if (IdxInsert != 0 || IdxDupLane != 0)
24763 return SDValue();
24764
24765 SDValue Bitcast = Insert.getOperand(1);
24766 if (Bitcast.getOpcode() != ISD::BITCAST)
24767 return SDValue();
24768
24769 SDValue Subvec = Bitcast.getOperand(0);
24770 EVT SubvecVT = Subvec.getValueType();
24771 if (!SubvecVT.is128BitVector())
24772 return SDValue();
24773 EVT NewSubvecVT =
24774 getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
24775
24776 SDLoc DL(N);
24777 SDValue NewInsert =
24778 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
24779 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
24780 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
24781 NewInsert, N->getOperand(1));
24782 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
24783}
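// Net effect: when the 128-bit subvector feeding a DUPLANE128 was bitcast
// before being inserted into an undef vector, the insert and the DUPLANE128
// are rebuilt on the subvector's original element type and a single bitcast is
// applied to the final result, so the intervening bitcast no longer hides the
// pattern.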
24784
24785// Try to combine mull with uzp1.
24786 static SDValue tryCombineMULLWithUZP1(SDNode *N,
24787 TargetLowering::DAGCombinerInfo &DCI,
24788 SelectionDAG &DAG) {
24789 if (DCI.isBeforeLegalizeOps())
24790 return SDValue();
24791
24792 SDValue LHS = N->getOperand(0);
24793 SDValue RHS = N->getOperand(1);
24794
24795 SDValue ExtractHigh;
24796 SDValue ExtractLow;
24797 SDValue TruncHigh;
24798 SDValue TruncLow;
24799 SDLoc DL(N);
24800
24801 // Check the operands are trunc and extract_high.
24802 if (isEssentiallyExtractHighSubvector(LHS) &&
24803 RHS.getOpcode() == ISD::TRUNCATE) {
24804 TruncHigh = RHS;
24805 if (LHS.getOpcode() == ISD::BITCAST)
24806 ExtractHigh = LHS.getOperand(0);
24807 else
24808 ExtractHigh = LHS;
24809 } else if (isEssentiallyExtractHighSubvector(RHS) &&
24810 LHS.getOpcode() == ISD::TRUNCATE) {
24811 TruncHigh = LHS;
24812 if (LHS.getOpcode() == ISD::BITCAST)
24813 ExtractHigh = RHS.getOperand(0);
24814 else
24815 ExtractHigh = RHS;
24816 } else
24817 return SDValue();
24818
24819 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
24820 // with uzp1.
24821 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
24822 SDValue TruncHighOp = TruncHigh.getOperand(0);
24823 EVT TruncHighOpVT = TruncHighOp.getValueType();
24824 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
24825 DAG.isSplatValue(TruncHighOp, false))
24826 return SDValue();
24827
24828 // Check there is another extract_high with the same source vector.
24829 // For example,
24830 //
24831 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
24832 // t12: v4i16 = truncate t11
24833 // t31: v4i32 = AArch64ISD::SMULL t18, t12
24834 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
24835 // t16: v4i16 = truncate t15
24836 // t30: v4i32 = AArch64ISD::SMULL t23, t16
24837 //
24838 // This dagcombine assumes the two extract_highs use the same source vector
24839 // in order to detect the pair of mulls. If they use different source
24840 // vectors, this code will not work.
24841 bool HasFoundMULLow = true;
24842 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
24843 if (ExtractHighSrcVec->use_size() != 2)
24844 HasFoundMULLow = false;
24845
24846 // Find ExtractLow.
24847 for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) {
24848 if (User == ExtractHigh.getNode())
24849 continue;
24850
24851 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
24853 HasFoundMULLow = false;
24854 break;
24855 }
24856
24857 ExtractLow.setNode(User);
24858 }
24859
24860 if (!ExtractLow || !ExtractLow->hasOneUse())
24861 HasFoundMULLow = false;
24862
24863 // Check ExtractLow's user.
24864 if (HasFoundMULLow) {
24865 SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin();
24866 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
24867 HasFoundMULLow = false;
24868 } else {
24869 if (ExtractLowUser->getOperand(0) == ExtractLow) {
24870 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
24871 TruncLow = ExtractLowUser->getOperand(1);
24872 else
24873 HasFoundMULLow = false;
24874 } else {
24875 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
24876 TruncLow = ExtractLowUser->getOperand(0);
24877 else
24878 HasFoundMULLow = false;
24879 }
24880 }
24881 }
24882
24883 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
24884 // with uzp1.
24885 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
24886 EVT TruncHighVT = TruncHigh.getValueType();
24887 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
24888 SDValue TruncLowOp =
24889 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
24890 EVT TruncLowOpVT = TruncLowOp.getValueType();
24891 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
24892 DAG.isSplatValue(TruncLowOp, false)))
24893 return SDValue();
24894
24895 // Create uzp1, extract_high and extract_low.
24896 if (TruncHighOpVT != UZP1VT)
24897 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
24898 if (TruncLowOpVT != UZP1VT)
24899 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
24900
24901 SDValue UZP1 =
24902 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
24903 SDValue HighIdxCst =
24904 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
24905 SDValue NewTruncHigh =
24906 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
24907 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
24908
24909 if (HasFoundMULLow) {
24910 EVT TruncLowVT = TruncLow.getValueType();
24911 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
24912 UZP1, ExtractLow.getOperand(1));
24913 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
24914 }
24915
24916 return SDValue(N, 0);
24917}
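// Sketch of the rewrite above: when the low and high halves of one source
// vector feed a pair of mull nodes through two separate truncates, both
// truncates are served by a single UZP1 (which narrows the low and high
// halves in one go) followed by extract_subvector nodes, e.g. ending up as a
// uzp1 feeding an smull/smull2 pair rather than two independent truncations.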
24918
24919 static SDValue performMULLCombine(SDNode *N,
24920 TargetLowering::DAGCombinerInfo &DCI,
24921 SelectionDAG &DAG) {
24922 if (SDValue Val =
24923 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
24924 return Val;
24925
24926 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
24927 return Val;
24928
24929 return SDValue();
24930}
24931
24932static SDValue
24933 performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24934 SelectionDAG &DAG) {
24935 // Let's do below transform.
24936 //
24937 // t34: v4i32 = AArch64ISD::UADDLV t2
24938 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
24939 // t7: i64 = zero_extend t35
24940 // t20: v1i64 = scalar_to_vector t7
24941 // ==>
24942 // t34: v4i32 = AArch64ISD::UADDLV t2
24943 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
24944 // t40: v1i64 = AArch64ISD::NVCAST t39
24945 if (DCI.isBeforeLegalizeOps())
24946 return SDValue();
24947
24948 EVT VT = N->getValueType(0);
24949 if (VT != MVT::v1i64)
24950 return SDValue();
24951
24952 SDValue ZEXT = N->getOperand(0);
24953 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
24954 return SDValue();
24955
24956 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
24957 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
24958 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
24959 return SDValue();
24960
24961 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
24962 return SDValue();
24963
24964 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
24965 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
24966 UADDLV.getValueType() != MVT::v4i32 ||
24967 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
24968 return SDValue();
24969
24970 // Let's generate new sequence with AArch64ISD::NVCAST.
24971 SDLoc DL(N);
24972 SDValue EXTRACT_SUBVEC =
24973 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
24974 DAG.getConstant(0, DL, MVT::i64));
24975 SDValue NVCAST =
24976 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
24977
24978 return NVCAST;
24979}
24980
24981 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
24982 DAGCombinerInfo &DCI) const {
24983 SelectionDAG &DAG = DCI.DAG;
24984 switch (N->getOpcode()) {
24985 default:
24986 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
24987 break;
24988 case ISD::VECREDUCE_AND:
24989 case ISD::VECREDUCE_OR:
24990 case ISD::VECREDUCE_XOR:
24991 return performVecReduceBitwiseCombine(N, DCI, DAG);
24992 case ISD::ADD:
24993 case ISD::SUB:
24994 return performAddSubCombine(N, DCI);
24995 case ISD::BUILD_VECTOR:
24996 return performBuildVectorCombine(N, DCI, DAG);
24997 case ISD::TRUNCATE:
24998 return performTruncateCombine(N, DAG);
24999 case AArch64ISD::ANDS:
25000 return performFlagSettingCombine(N, DCI, ISD::AND);
25001 case AArch64ISD::ADC:
25002 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
25003 return R;
25004 return foldADCToCINC(N, DAG);
25005 case AArch64ISD::SBC:
25006 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
25007 case AArch64ISD::ADCS:
25008 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
25009 return R;
25010 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
25011 case AArch64ISD::SBCS:
25012 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
25013 return R;
25014 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
25015 case AArch64ISD::BICi: {
25016 APInt DemandedBits =
25017 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
25018 APInt DemandedElts =
25019 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
25020
25022 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
25023 return SDValue();
25024
25025 break;
25026 }
25027 case ISD::XOR:
25028 return performXorCombine(N, DAG, DCI, Subtarget);
25029 case ISD::MUL:
25030 return performMulCombine(N, DAG, DCI, Subtarget);
25031 case ISD::SINT_TO_FP:
25032 case ISD::UINT_TO_FP:
25033 return performIntToFpCombine(N, DAG, Subtarget);
25034 case ISD::FP_TO_SINT:
25035 case ISD::FP_TO_UINT:
25036 case ISD::FP_TO_SINT_SAT:
25037 case ISD::FP_TO_UINT_SAT:
25038 return performFpToIntCombine(N, DAG, DCI, Subtarget);
25039 case ISD::OR:
25040 return performORCombine(N, DCI, Subtarget, *this);
25041 case ISD::AND:
25042 return performANDCombine(N, DCI);
25043 case ISD::FADD:
25044 return performFADDCombine(N, DCI);
25045 case ISD::INTRINSIC_WO_CHAIN:
25046 return performIntrinsicCombine(N, DCI, Subtarget);
25047 case ISD::ANY_EXTEND:
25048 case ISD::ZERO_EXTEND:
25049 case ISD::SIGN_EXTEND:
25050 return performExtendCombine(N, DCI, DAG);
25051 case ISD::SIGN_EXTEND_INREG:
25052 return performSignExtendInRegCombine(N, DCI, DAG);
25053 case ISD::CONCAT_VECTORS:
25054 return performConcatVectorsCombine(N, DCI, DAG);
25055 case ISD::EXTRACT_SUBVECTOR:
25056 return performExtractSubvectorCombine(N, DCI, DAG);
25057 case ISD::INSERT_SUBVECTOR:
25058 return performInsertSubvectorCombine(N, DCI, DAG);
25059 case ISD::SELECT:
25060 return performSelectCombine(N, DCI);
25061 case ISD::VSELECT:
25062 return performVSelectCombine(N, DCI.DAG);
25063 case ISD::SETCC:
25064 return performSETCCCombine(N, DCI, DAG);
25065 case ISD::LOAD:
25066 return performLOADCombine(N, DCI, DAG, Subtarget);
25067 case ISD::STORE:
25068 return performSTORECombine(N, DCI, DAG, Subtarget);
25069 case ISD::MSTORE:
25070 return performMSTORECombine(N, DCI, DAG, Subtarget);
25071 case ISD::MGATHER:
25072 case ISD::MSCATTER:
25073 return performMaskedGatherScatterCombine(N, DCI, DAG);
25074 case ISD::FP_EXTEND:
25075 return performFPExtendCombine(N, DAG, DCI, Subtarget);
25076 case AArch64ISD::BRCOND:
25077 return performBRCONDCombine(N, DCI, DAG);
25078 case AArch64ISD::TBNZ:
25079 case AArch64ISD::TBZ:
25080 return performTBZCombine(N, DCI, DAG);
25081 case AArch64ISD::CSEL:
25082 return performCSELCombine(N, DCI, DAG);
25083 case AArch64ISD::DUP:
25088 return performDUPCombine(N, DCI);
25089 case AArch64ISD::DUPLANE128:
25090 return performDupLane128Combine(N, DAG);
25091 case AArch64ISD::NVCAST:
25092 return performNVCASTCombine(N, DAG);
25093 case AArch64ISD::SPLICE:
25094 return performSpliceCombine(N, DAG);
25095 case AArch64ISD::UUNPKLO:
25096 case AArch64ISD::UUNPKHI:
25097 return performUnpackCombine(N, DAG, Subtarget);
25098 case AArch64ISD::UZP1:
25099 case AArch64ISD::UZP2:
25100 return performUzpCombine(N, DAG, Subtarget);
25101 case AArch64ISD::SETCC_MERGE_ZERO:
25102 return performSetccMergeZeroCombine(N, DCI);
25119 return performGLD1Combine(N, DAG);
25120 case AArch64ISD::VASHR:
25121 case AArch64ISD::VLSHR:
25122 return performVectorShiftCombine(N, *this, DCI);
25123 case AArch64ISD::SUNPKLO:
25124 return performSunpkloCombine(N, DAG);
25125 case AArch64ISD::BSP:
25126 return performBSPExpandForSVE(N, DAG, Subtarget);
25127 case ISD::INSERT_VECTOR_ELT:
25128 return performInsertVectorEltCombine(N, DCI);
25129 case ISD::EXTRACT_VECTOR_ELT:
25130 return performExtractVectorEltCombine(N, DCI, Subtarget);
25131 case ISD::VECREDUCE_ADD:
25132 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
25133 case AArch64ISD::UADDV:
25134 return performUADDVCombine(N, DAG);
25135 case AArch64ISD::SMULL:
25136 case AArch64ISD::UMULL:
25137 case AArch64ISD::PMULL:
25138 return performMULLCombine(N, DCI, DAG);
25139 case ISD::INTRINSIC_VOID:
25140 case ISD::INTRINSIC_W_CHAIN:
25141 switch (N->getConstantOperandVal(1)) {
25142 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
25143 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
25144 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
25145 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
25146 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
25147 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
25148 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
25149 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
25150 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
25151 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
25152 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
25153 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
25154 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
25155 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
25156 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
25157 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
25159 case Intrinsic::aarch64_neon_ld2:
25160 case Intrinsic::aarch64_neon_ld3:
25161 case Intrinsic::aarch64_neon_ld4:
25162 case Intrinsic::aarch64_neon_ld1x2:
25163 case Intrinsic::aarch64_neon_ld1x3:
25164 case Intrinsic::aarch64_neon_ld1x4:
25165 case Intrinsic::aarch64_neon_ld2lane:
25166 case Intrinsic::aarch64_neon_ld3lane:
25167 case Intrinsic::aarch64_neon_ld4lane:
25168 case Intrinsic::aarch64_neon_ld2r:
25169 case Intrinsic::aarch64_neon_ld3r:
25170 case Intrinsic::aarch64_neon_ld4r:
25171 case Intrinsic::aarch64_neon_st2:
25172 case Intrinsic::aarch64_neon_st3:
25173 case Intrinsic::aarch64_neon_st4:
25174 case Intrinsic::aarch64_neon_st1x2:
25175 case Intrinsic::aarch64_neon_st1x3:
25176 case Intrinsic::aarch64_neon_st1x4:
25177 case Intrinsic::aarch64_neon_st2lane:
25178 case Intrinsic::aarch64_neon_st3lane:
25179 case Intrinsic::aarch64_neon_st4lane:
25180 return performNEONPostLDSTCombine(N, DCI, DAG);
25181 case Intrinsic::aarch64_sve_ldnt1:
25182 return performLDNT1Combine(N, DAG);
25183 case Intrinsic::aarch64_sve_ld1rq:
25184 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
25185 case Intrinsic::aarch64_sve_ld1ro:
25186 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
25187 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
25189 case Intrinsic::aarch64_sve_ldnt1_gather:
25191 case Intrinsic::aarch64_sve_ldnt1_gather_index:
25192 return performGatherLoadCombine(N, DAG,
25194 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
25196 case Intrinsic::aarch64_sve_ld1:
25198 case Intrinsic::aarch64_sve_ldnf1:
25200 case Intrinsic::aarch64_sve_ldff1:
25202 case Intrinsic::aarch64_sve_st1:
25203 return performST1Combine(N, DAG);
25204 case Intrinsic::aarch64_sve_stnt1:
25205 return performSTNT1Combine(N, DAG);
25206 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
25208 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
25210 case Intrinsic::aarch64_sve_stnt1_scatter:
25212 case Intrinsic::aarch64_sve_stnt1_scatter_index:
25214 case Intrinsic::aarch64_sve_ld1_gather:
25216 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
25217 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
25219 case Intrinsic::aarch64_sve_ld1q_gather_index:
25220 return performGatherLoadCombine(N, DAG,
25222 case Intrinsic::aarch64_sve_ld1_gather_index:
25223 return performGatherLoadCombine(N, DAG,
25225 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
25227 /*OnlyPackedOffsets=*/false);
25228 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
25230 /*OnlyPackedOffsets=*/false);
25231 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
25232 return performGatherLoadCombine(N, DAG,
25234 /*OnlyPackedOffsets=*/false);
25235 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
25236 return performGatherLoadCombine(N, DAG,
25238 /*OnlyPackedOffsets=*/false);
25239 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
25241 case Intrinsic::aarch64_sve_ldff1_gather:
25243 case Intrinsic::aarch64_sve_ldff1_gather_index:
25244 return performGatherLoadCombine(N, DAG,
25246 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
25247 return performGatherLoadCombine(N, DAG,
25249 /*OnlyPackedOffsets=*/false);
25250 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
25251 return performGatherLoadCombine(N, DAG,
25253 /*OnlyPackedOffsets=*/false);
25254 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
25255 return performGatherLoadCombine(N, DAG,
25257 /*OnlyPackedOffsets=*/false);
25258 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
25259 return performGatherLoadCombine(N, DAG,
25261 /*OnlyPackedOffsets=*/false);
25262 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
25263 return performGatherLoadCombine(N, DAG,
25265 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
25266 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
25268 case Intrinsic::aarch64_sve_st1q_scatter_index:
25270 case Intrinsic::aarch64_sve_st1_scatter:
25272 case Intrinsic::aarch64_sve_st1_scatter_index:
25274 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
25276 /*OnlyPackedOffsets=*/false);
25277 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
25279 /*OnlyPackedOffsets=*/false);
25280 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
25281 return performScatterStoreCombine(N, DAG,
25283 /*OnlyPackedOffsets=*/false);
25284 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
25285 return performScatterStoreCombine(N, DAG,
25287 /*OnlyPackedOffsets=*/false);
25288 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
25290 case Intrinsic::aarch64_rndr:
25291 case Intrinsic::aarch64_rndrrs: {
25292 unsigned IntrinsicID = N->getConstantOperandVal(1);
25293 auto Register =
25294 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
25295 : AArch64SysReg::RNDRRS);
25296 SDLoc DL(N);
25297 SDValue A = DAG.getNode(
25298 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
25299 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
25300 SDValue B = DAG.getNode(
25301 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
25302 DAG.getConstant(0, DL, MVT::i32),
25303 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
25304 return DAG.getMergeValues(
25305 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
25306 }
25307 case Intrinsic::aarch64_sme_ldr_zt:
25309 DAG.getVTList(MVT::Other), N->getOperand(0),
25310 N->getOperand(2), N->getOperand(3));
25311 case Intrinsic::aarch64_sme_str_zt:
25312 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
25313 DAG.getVTList(MVT::Other), N->getOperand(0),
25314 N->getOperand(2), N->getOperand(3));
25315 default:
25316 break;
25317 }
25318 break;
25319 case ISD::GlobalAddress:
25320 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
25321 case ISD::CTLZ:
25322 return performCTLZCombine(N, DAG, Subtarget);
25323 case ISD::SCALAR_TO_VECTOR:
25324 return performScalarToVectorCombine(N, DCI, DAG);
25325 }
25326 return SDValue();
25327}
25328
25329// Check if the return value is used as only a return value, as otherwise
25330// we can't perform a tail-call. In particular, we need to check for
25331// target ISD nodes that are returns and any other "odd" constructs
25332// that the generic analysis code won't necessarily catch.
25333bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
25334 SDValue &Chain) const {
25335 if (N->getNumValues() != 1)
25336 return false;
25337 if (!N->hasNUsesOfValue(1, 0))
25338 return false;
25339
25340 SDValue TCChain = Chain;
25341 SDNode *Copy = *N->use_begin();
25342 if (Copy->getOpcode() == ISD::CopyToReg) {
25343 // If the copy has a glue operand, we conservatively assume it isn't safe to
25344 // perform a tail call.
25345 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
25346 MVT::Glue)
25347 return false;
25348 TCChain = Copy->getOperand(0);
25349 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
25350 return false;
25351
25352 bool HasRet = false;
25353 for (SDNode *Node : Copy->uses()) {
25354 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
25355 return false;
25356 HasRet = true;
25357 }
25358
25359 if (!HasRet)
25360 return false;
25361
25362 Chain = TCChain;
25363 return true;
25364}
25365
25366 // Return whether an instruction can potentially be optimized to a tail
25367// call. This will cause the optimizers to attempt to move, or duplicate,
25368// return instructions to help enable tail call optimizations for this
25369// instruction.
25370bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
25371 return CI->isTailCall();
25372}
25373
25374bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
25375 Register Offset, bool IsPre,
25376 MachineRegisterInfo &MRI) const {
25377 auto CstOffset = getIConstantVRegVal(Offset, MRI);
25378 if (!CstOffset || CstOffset->isZero())
25379 return false;
25380
25381 // All of the indexed addressing mode instructions take a signed 9 bit
25382 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
25383 // encodes the sign/indexing direction.
25384 return isInt<9>(CstOffset->getSExtValue());
25385}
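// The signed 9-bit immediate gives the pre/post-indexed forms a byte offset
// range of [-256, 255]; e.g. "ldr x0, [x1, #-256]!" is encodable, while an
// increment of 256 or more cannot use the writeback addressing modes.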
25386
25387bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
25388 SDValue &Base,
25389 SDValue &Offset,
25390 SelectionDAG &DAG) const {
25391 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
25392 return false;
25393
25394 // Non-null if there is exactly one user of the loaded value (ignoring chain).
25395 SDNode *ValOnlyUser = nullptr;
25396 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
25397 ++UI) {
25398 if (UI.getUse().getResNo() == 1)
25399 continue; // Ignore chain.
25400 if (ValOnlyUser == nullptr)
25401 ValOnlyUser = *UI;
25402 else {
25403 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
25404 break;
25405 }
25406 }
25407
25408 auto IsUndefOrZero = [](SDValue V) {
25409 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
25410 };
25411
25412 // If the only user of the value is a scalable vector splat, it is
25413 // preferable to do a replicating load (ld1r*).
25414 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
25415 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
25416 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
25417 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
25418 return false;
25419
25420 Base = Op->getOperand(0);
25421 // All of the indexed addressing mode instructions take a signed
25422 // 9 bit immediate offset.
25423 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
25424 int64_t RHSC = RHS->getSExtValue();
25425 if (Op->getOpcode() == ISD::SUB)
25426 RHSC = -(uint64_t)RHSC;
25427 if (!isInt<9>(RHSC))
25428 return false;
25429 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
25430 // when dealing with subtraction.
25431 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
25432 return true;
25433 }
25434 return false;
25435}
25436
25437bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
25438 SDValue &Offset,
25440 SelectionDAG &DAG) const {
25441 EVT VT;
25442 SDValue Ptr;
25443 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
25444 VT = LD->getMemoryVT();
25445 Ptr = LD->getBasePtr();
25446 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
25447 VT = ST->getMemoryVT();
25448 Ptr = ST->getBasePtr();
25449 } else
25450 return false;
25451
25452 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
25453 return false;
25454 AM = ISD::PRE_INC;
25455 return true;
25456}
25457
25458bool AArch64TargetLowering::getPostIndexedAddressParts(
25460 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
25461 EVT VT;
25462 SDValue Ptr;
25463 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
25464 VT = LD->getMemoryVT();
25465 Ptr = LD->getBasePtr();
25466 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
25467 VT = ST->getMemoryVT();
25468 Ptr = ST->getBasePtr();
25469 } else
25470 return false;
25471
25472 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
25473 return false;
25474 // Post-indexing updates the base, so it's not a valid transform
25475 // if that's not the same as the load's pointer.
25476 if (Ptr != Base)
25477 return false;
25478 AM = ISD::POST_INC;
25479 return true;
25480}
25481
25482 static void replaceBoolVectorBitcast(SDNode *N,
25483 SmallVectorImpl<SDValue> &Results,
25484 SelectionDAG &DAG) {
25485 SDLoc DL(N);
25486 SDValue Op = N->getOperand(0);
25487 EVT VT = N->getValueType(0);
25488 [[maybe_unused]] EVT SrcVT = Op.getValueType();
25489 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25490 "Must be bool vector.");
25491
25492 // Special handling for Clang's __builtin_convertvector. For vectors with <8
25493 // elements, it adds a vector concatenation with undef(s). If we encounter
25494 // this here, we can skip the concat.
25495 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
25496 bool AllUndef = true;
25497 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
25498 AllUndef &= Op.getOperand(I).isUndef();
25499
25500 if (AllUndef)
25501 Op = Op.getOperand(0);
25502 }
25503
25504 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
25505 if (VectorBits)
25506 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
25507}
25508
25509 static void CustomNonLegalBITCASTResults(SDNode *N,
25510 SmallVectorImpl<SDValue> &Results,
25511 SelectionDAG &DAG, EVT ExtendVT,
25512 EVT CastVT) {
25513 SDLoc DL(N);
25514 SDValue Op = N->getOperand(0);
25515 EVT VT = N->getValueType(0);
25516
25517 // Use SCALAR_TO_VECTOR for lane zero
25518 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
25519 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
25520 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
25521 Results.push_back(
25522 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
25523}
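// For instance, ReplaceBITCASTResults below uses this helper for
// "v2i16 (bitcast i32 x)": x is placed in lane 0 of a v2i32 via
// SCALAR_TO_VECTOR, the vector is bitcast to v4i16, and the low v2i16 half is
// extracted as the result.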
25524
25525void AArch64TargetLowering::ReplaceBITCASTResults(
25527 SDLoc DL(N);
25528 SDValue Op = N->getOperand(0);
25529 EVT VT = N->getValueType(0);
25530 EVT SrcVT = Op.getValueType();
25531
25532 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
25533 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
25534 return;
25535 }
25536
25537 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
25538 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
25539 return;
25540 }
25541
25542 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
25543 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
25544 return;
25545 }
25546
25547 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
25548 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
25549 "Expected fp->int bitcast!");
25550
25551 // Bitcasting between unpacked vector types of different element counts is
25552 // not a NOP because the live elements are laid out differently.
25553 // 01234567
25554 // e.g. nxv2i32 = XX??XX??
25555 // nxv4f16 = X?X?X?X?
25556 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
25557 return;
25558
25559 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
25560 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
25561 return;
25562 }
25563
25564 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25565 !VT.isVector())
25566 return replaceBoolVectorBitcast(N, Results, DAG);
25567
25568 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
25569 return;
25570
25571 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
25572 DAG.getUNDEF(MVT::i32), Op);
25573 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
25574 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
25575}
25576
25577 static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
25578 SelectionDAG &DAG,
25579 const AArch64Subtarget *Subtarget) {
25580 EVT VT = N->getValueType(0);
25581 if (!VT.is256BitVector() ||
25582 (VT.getScalarType().isFloatingPoint() &&
25583 !N->getFlags().hasAllowReassociation()) ||
25584 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
25585 VT.getScalarType() == MVT::bf16)
25586 return;
25587
25588 SDValue X = N->getOperand(0);
25589 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
25590 if (!Shuf) {
25591 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
25592 X = N->getOperand(1);
25593 if (!Shuf)
25594 return;
25595 }
25596
25597 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
25598 return;
25599
25600 // Check the mask is 1,0,3,2,5,4,...
25601 ArrayRef<int> Mask = Shuf->getMask();
25602 for (int I = 0, E = Mask.size(); I < E; I++)
25603 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
25604 return;
25605
25606 SDLoc DL(N);
25607 auto LoHi = DAG.SplitVector(X, DL);
25608 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
25609 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
25610 LoHi.first, LoHi.second);
25611
25612 // Shuffle the elements back into order.
25613 SmallVector<int> NMask;
25614 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
25615 NMask.push_back(I);
25616 NMask.push_back(I);
25617 }
25618 Results.push_back(
25619 DAG.getVectorShuffle(VT, DL,
25620 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
25621 DAG.getUNDEF(LoHi.first.getValueType())),
25622 DAG.getUNDEF(VT), NMask));
25623}
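// In other words, "(f)add x, (shufflevector x, undef, <1,0,3,2,...>)" on a
// 256-bit vector becomes a single ADDP (pairwise add) of the two halves of x,
// with a final shuffle replicating each pairwise sum into both lanes of the
// original pair.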
25624
25625 static void ReplaceReductionResults(SDNode *N,
25626 SmallVectorImpl<SDValue> &Results,
25627 SelectionDAG &DAG, unsigned InterOp,
25628 unsigned AcrossOp) {
25629 EVT LoVT, HiVT;
25630 SDValue Lo, Hi;
25631 SDLoc dl(N);
25632 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
25633 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25634 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
25635 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
25636 Results.push_back(SplitVal);
25637}
25638
25639void AArch64TargetLowering::ReplaceExtractSubVectorResults(
25641 SDValue In = N->getOperand(0);
25642 EVT InVT = In.getValueType();
25643
25644 // Common code will handle these just fine.
25645 if (!InVT.isScalableVector() || !InVT.isInteger())
25646 return;
25647
25648 SDLoc DL(N);
25649 EVT VT = N->getValueType(0);
25650
25651 // The following checks bail if this is not a halving operation.
25652
25653 ElementCount ResEC = VT.getVectorElementCount();
25654
25655 if (InVT.getVectorElementCount() != (ResEC * 2))
25656 return;
25657
25658 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
25659 if (!CIndex)
25660 return;
25661
25662 unsigned Index = CIndex->getZExtValue();
25663 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
25664 return;
25665
25666 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
25667 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
25668
25669 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
25670 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
25671}
25672
25673// Create an even/odd pair of X registers holding integer value V.
25674 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
25675 SDLoc dl(V.getNode());
25676 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
25677 if (DAG.getDataLayout().isBigEndian())
25678 std::swap (VLo, VHi);
25679 SDValue RegClass =
25680 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
25681 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
25682 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
25683 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
25684 return SDValue(
25685 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
25686}
25687
25688 static void ReplaceCMP_SWAP_128Results(SDNode *N,
25689 SmallVectorImpl<SDValue> &Results,
25690 SelectionDAG &DAG,
25691 const AArch64Subtarget *Subtarget) {
25692 assert(N->getValueType(0) == MVT::i128 &&
25693 "AtomicCmpSwap on types less than 128 should be legal");
25694
25695 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25696 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
25697 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
25698 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
25699 SDValue Ops[] = {
25700 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
25701 createGPRPairNode(DAG, N->getOperand(3)), // Store value
25702 N->getOperand(1), // Ptr
25703 N->getOperand(0), // Chain in
25704 };
25705
25706 unsigned Opcode;
25707 switch (MemOp->getMergedOrdering()) {
25709 Opcode = AArch64::CASPX;
25710 break;
25712 Opcode = AArch64::CASPAX;
25713 break;
25715 Opcode = AArch64::CASPLX;
25716 break;
25719 Opcode = AArch64::CASPALX;
25720 break;
25721 default:
25722 llvm_unreachable("Unexpected ordering!");
25723 }
25724
25725 MachineSDNode *CmpSwap = DAG.getMachineNode(
25726 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
25727 DAG.setNodeMemRefs(CmpSwap, {MemOp});
25728
25729 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
25730 if (DAG.getDataLayout().isBigEndian())
25731 std::swap(SubReg1, SubReg2);
25732 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
25733 SDValue(CmpSwap, 0));
25734 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
25735 SDValue(CmpSwap, 0));
25736 Results.push_back(
25737 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25738 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
25739 return;
25740 }
25741
25742 unsigned Opcode;
25743 switch (MemOp->getMergedOrdering()) {
25745 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
25746 break;
25748 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
25749 break;
25751 Opcode = AArch64::CMP_SWAP_128_RELEASE;
25752 break;
25755 Opcode = AArch64::CMP_SWAP_128;
25756 break;
25757 default:
25758 llvm_unreachable("Unexpected ordering!");
25759 }
25760
25761 SDLoc DL(N);
25762 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
25763 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
25764 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
25765 New.first, New.second, N->getOperand(0)};
25766 SDNode *CmpSwap = DAG.getMachineNode(
25767 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
25768 Ops);
25769 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
25770
25771 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25772 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
25773 Results.push_back(SDValue(CmpSwap, 3));
25774}
25775
25776static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
25777 AtomicOrdering Ordering) {
25778 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
25779 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
25780 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
25781 // ATOMIC_LOAD_CLR at any point.
25782 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
25783 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
25784 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
25785 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
25786
25787 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25788 // The operand will need to be XORed in a separate step.
25789 switch (Ordering) {
25791 return AArch64::LDCLRP;
25792 break;
25794 return AArch64::LDCLRPA;
25795 break;
25797 return AArch64::LDCLRPL;
25798 break;
25801 return AArch64::LDCLRPAL;
25802 break;
25803 default:
25804 llvm_unreachable("Unexpected ordering!");
25805 }
25806 }
25807
25808 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
25809 switch (Ordering) {
25811 return AArch64::LDSETP;
25812 break;
25814 return AArch64::LDSETPA;
25815 break;
25817 return AArch64::LDSETPL;
25818 break;
25821 return AArch64::LDSETPAL;
25822 break;
25823 default:
25824 llvm_unreachable("Unexpected ordering!");
25825 }
25826 }
25827
25828 if (ISDOpcode == ISD::ATOMIC_SWAP) {
25829 switch (Ordering) {
25831 return AArch64::SWPP;
25832 break;
25834 return AArch64::SWPPA;
25835 break;
25837 return AArch64::SWPPL;
25838 break;
25841 return AArch64::SWPPAL;
25842 break;
25843 default:
25844 llvm_unreachable("Unexpected ordering!");
25845 }
25846 }
25847
25848 llvm_unreachable("Unexpected ISDOpcode!");
25849}
25850
25853 SelectionDAG &DAG,
25854 const AArch64Subtarget *Subtarget) {
25855 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
25856 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
25857 // rather than the CASP instructions, because CASP has register classes for
25858 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
25859 // to present them as single operands. LSE128 instructions use the GPR64
25860 // register class (because the pair does not have to be sequential), like
25861 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
25862
25863 assert(N->getValueType(0) == MVT::i128 &&
25864 "AtomicLoadXXX on types less than 128 should be legal");
25865
25866 if (!Subtarget->hasLSE128())
25867 return;
25868
25869 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25870 const SDValue &Chain = N->getOperand(0);
25871 const SDValue &Ptr = N->getOperand(1);
25872 const SDValue &Val128 = N->getOperand(2);
25873 std::pair<SDValue, SDValue> Val2x64 =
25874 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
25875
25876 const unsigned ISDOpcode = N->getOpcode();
25877 const unsigned MachineOpcode =
25878 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
25879
25880 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25881 SDLoc dl(Val128);
25882 Val2x64.first =
25883 DAG.getNode(ISD::XOR, dl, MVT::i64,
25884 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
25885 Val2x64.second =
25886 DAG.getNode(ISD::XOR, dl, MVT::i64,
25887 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
25888 }
25889
25890 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
25891 if (DAG.getDataLayout().isBigEndian())
25892 std::swap(Ops[0], Ops[1]);
25893
25894 MachineSDNode *AtomicInst =
25895 DAG.getMachineNode(MachineOpcode, SDLoc(N),
25896 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
25897
25898 DAG.setNodeMemRefs(AtomicInst, {MemOp});
25899
25900 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
25901 if (DAG.getDataLayout().isBigEndian())
25902 std::swap(Lo, Hi);
25903
25904 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25905 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
25906}
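// Note the XOR with -1 above: LDCLRP atomically performs "memory &= ~operand",
// so a 128-bit atomicrmw 'and' is implemented by inverting both 64-bit halves
// of the value before issuing the LDCLRP.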
25907
25908void AArch64TargetLowering::ReplaceNodeResults(
25910 switch (N->getOpcode()) {
25911 default:
25912 llvm_unreachable("Don't know how to custom expand this");
25913 case ISD::BITCAST:
25914 ReplaceBITCASTResults(N, Results, DAG);
25915 return;
25916 case ISD::VECREDUCE_ADD:
25917 case ISD::VECREDUCE_SMAX:
25918 case ISD::VECREDUCE_SMIN:
25919 case ISD::VECREDUCE_UMAX:
25920 case ISD::VECREDUCE_UMIN:
25921 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
25922 return;
25923 case ISD::ADD:
25924 case ISD::FADD:
25925 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
25926 return;
25927
25928 case ISD::CTPOP:
25929 case ISD::PARITY:
25930 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
25931 Results.push_back(Result);
25932 return;
25933 case AArch64ISD::SADDV:
25934 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
25935 return;
25936 case AArch64ISD::UADDV:
25937 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
25938 return;
25939 case AArch64ISD::SMINV:
25940 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
25941 return;
25942 case AArch64ISD::UMINV:
25943 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
25944 return;
25945 case AArch64ISD::SMAXV:
25946 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
25947 return;
25948 case AArch64ISD::UMAXV:
25949 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
25950 return;
25951 case ISD::MULHS:
25953 Results.push_back(
25954 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
25955 return;
25956 case ISD::MULHU:
25958 Results.push_back(
25959 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
25960 return;
25961 case ISD::FP_TO_UINT:
25962 case ISD::FP_TO_SINT:
25965 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
25966 // Let normal code take care of it by not adding anything to Results.
25967 return;
25968 case ISD::ATOMIC_CMP_SWAP:
25969 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
25970 return;
25971 case ISD::ATOMIC_LOAD_CLR:
25972 assert(N->getValueType(0) != MVT::i128 &&
25973 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
25974 break;
25975 case ISD::ATOMIC_LOAD_AND:
25976 case ISD::ATOMIC_LOAD_OR:
25977 case ISD::ATOMIC_SWAP: {
25978 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
25979 "Expected 128-bit atomicrmw.");
25980 // These need custom type legalisation so we go directly to instruction.
25981 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
25982 return;
25983 }
25984 case ISD::ATOMIC_LOAD:
25985 case ISD::LOAD: {
25986 MemSDNode *LoadNode = cast<MemSDNode>(N);
25987 EVT MemVT = LoadNode->getMemoryVT();
25988 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
25989 // targets.
25990 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
25991 MemVT.getSizeInBits() == 256u &&
25992 (MemVT.getScalarSizeInBits() == 8u ||
25993 MemVT.getScalarSizeInBits() == 16u ||
25994 MemVT.getScalarSizeInBits() == 32u ||
25995 MemVT.getScalarSizeInBits() == 64u)) {
25996
25997 SDValue Result = DAG.getMemIntrinsicNode(
25998 AArch64ISD::LDNP, SDLoc(N),
25999 DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
26000 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
26001 MVT::Other}),
26002 {LoadNode->getChain(), LoadNode->getBasePtr()},
26003 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
26004
26005 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
26006 Result.getValue(0), Result.getValue(1));
26007 Results.append({Pair, Result.getValue(2) /* Chain */});
26008 return;
26009 }
26010
26011 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
26012 LoadNode->getMemoryVT() != MVT::i128) {
26013 // Loads that are neither volatile nor atomic, or that are not i128, need no
26014 // custom expansion here; they are handled later by AArch64's load/store optimizer.
26015 return;
26016 }
26017
26018 if (SDValue(N, 0).getValueType() == MVT::i128) {
26019 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
26020 bool isLoadAcquire =
26022 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
26023
26024 if (isLoadAcquire)
26025 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
26026
26028 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
26029 {LoadNode->getChain(), LoadNode->getBasePtr()},
26030 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
26031
26032 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
26033
26034 SDValue Pair =
26035 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
26036 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
26037 Results.append({Pair, Result.getValue(2) /* Chain */});
26038 }
26039 return;
26040 }
26041 case ISD::EXTRACT_SUBVECTOR:
26042 ReplaceExtractSubVectorResults(N, Results, DAG);
26043 return;
26044 case ISD::INSERT_SUBVECTOR:
26045 case ISD::CONCAT_VECTORS:
26046 // Custom lowering has been requested for INSERT_SUBVECTOR and
26047 // CONCAT_VECTORS -- but delegate to common code for result type
26048 // legalisation
26049 return;
26050 case ISD::INTRINSIC_WO_CHAIN: {
26051 EVT VT = N->getValueType(0);
26052
26053 Intrinsic::ID IntID =
26054 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
26055 switch (IntID) {
26056 default:
26057 return;
26058 case Intrinsic::aarch64_sve_clasta_n: {
26059 assert((VT == MVT::i8 || VT == MVT::i16) &&
26060 "custom lowering for unexpected type");
26061 SDLoc DL(N);
26062 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
26063 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
26064 N->getOperand(1), Op2, N->getOperand(3));
26065 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
26066 return;
26067 }
26068 case Intrinsic::aarch64_sve_clastb_n: {
26069 assert((VT == MVT::i8 || VT == MVT::i16) &&
26070 "custom lowering for unexpected type");
26071 SDLoc DL(N);
26072 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
26073 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
26074 N->getOperand(1), Op2, N->getOperand(3));
26075 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
26076 return;
26077 }
26078 case Intrinsic::aarch64_sve_lasta: {
26079 assert((VT == MVT::i8 || VT == MVT::i16) &&
26080 "custom lowering for unexpected type");
26081 SDLoc DL(N);
26082 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
26083 N->getOperand(1), N->getOperand(2));
26084 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
26085 return;
26086 }
26087 case Intrinsic::aarch64_sve_lastb: {
26088 assert((VT == MVT::i8 || VT == MVT::i16) &&
26089 "custom lowering for unexpected type");
26090 SDLoc DL(N);
26091 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
26092 N->getOperand(1), N->getOperand(2));
26093 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
26094 return;
26095 }
26096 case Intrinsic::get_active_lane_mask: {
26097 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
26098 return;
26099
26100 // NOTE: Only trivial type promotion is supported.
26101 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
26102 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
26103 return;
26104
26105 SDLoc DL(N);
26106 auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
26107 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
26108 return;
26109 }
26110 }
26111 }
26112 case ISD::READ_REGISTER: {
26113 SDLoc DL(N);
26114 assert(N->getValueType(0) == MVT::i128 &&
26115 "READ_REGISTER custom lowering is only for 128-bit sysregs");
26116 SDValue Chain = N->getOperand(0);
26117 SDValue SysRegName = N->getOperand(1);
26118
26119 SDValue Result = DAG.getNode(
26120 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
26121 Chain, SysRegName);
26122
26123 // Sysregs are not endian. Result.getValue(0) always contains the lower half
26124 // of the 128-bit System Register value.
26125 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
26126 Result.getValue(0), Result.getValue(1));
26127 Results.push_back(Pair);
26128 Results.push_back(Result.getValue(2)); // Chain
26129 return;
26130 }
26131 }
26132}
26133
26134 bool AArch64TargetLowering::useLoadStackGuardNode() const {
26135 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
26136 return false;
26137 return true;
26138}
26139
26140unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
26141 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
26142 // reciprocal if there are three or more FDIVs.
26143 return 3;
26144}
26145
26148 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
26149 // v4i16, v2i32 instead of promoting them.
26150 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
26151 VT == MVT::v1f32)
26152 return TypeWidenVector;
26153
26155}
26156
26157// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
26158// provided the address is 16-byte aligned.
26160 if (!Subtarget->hasLSE2())
26161 return false;
26162
26163 if (auto LI = dyn_cast<LoadInst>(I))
26164 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
26165 LI->getAlign() >= Align(16);
26166
26167 if (auto SI = dyn_cast<StoreInst>(I))
26168 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
26169 SI->getAlign() >= Align(16);
26170
26171 return false;
26172}
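// For example, when the conditions above hold, an IR load such as
//   %v = load atomic i128, ptr %p monotonic, align 16
// can be selected to a single LDP of two X registers instead of being
// expanded to a CAS or LL/SC sequence.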
26173
26175 if (!Subtarget->hasLSE128())
26176 return false;
26177
26178 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
26179 // will clobber the two registers.
26180 if (const auto *SI = dyn_cast<StoreInst>(I))
26181 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
26182 SI->getAlign() >= Align(16) &&
26183 (SI->getOrdering() == AtomicOrdering::Release ||
26184 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
26185
26186 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
26187 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
26188 RMW->getAlign() >= Align(16) &&
26189 (RMW->getOperation() == AtomicRMWInst::Xchg ||
26190 RMW->getOperation() == AtomicRMWInst::And ||
26191 RMW->getOperation() == AtomicRMWInst::Or);
26192
26193 return false;
26194}
26195
26197 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
26198 return false;
26199
26200 if (auto LI = dyn_cast<LoadInst>(I))
26201 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
26202 LI->getAlign() >= Align(16) &&
26203 LI->getOrdering() == AtomicOrdering::Acquire;
26204
26205 if (auto SI = dyn_cast<StoreInst>(I))
26206 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
26207 SI->getAlign() >= Align(16) &&
26208 SI->getOrdering() == AtomicOrdering::Release;
26209
26210 return false;
26211}
26212
26214 const Instruction *I) const {
26216 return false;
26218 return false;
26220 return true;
26221 return false;
26222}
26223
26225 const Instruction *I) const {
26226 // Store-Release instructions only provide seq_cst guarantees when paired with
26227 // Load-Acquire instructions. MSVC CRT does not use these instructions to
26228 // implement seq_cst loads and stores, so we need additional explicit fences
26229 // after memory writes.
26230 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26231 return false;
26232
26233 switch (I->getOpcode()) {
26234 default:
26235 return false;
26236 case Instruction::AtomicCmpXchg:
26237     return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
26238            AtomicOrdering::SequentiallyConsistent;
26239   case Instruction::AtomicRMW:
26240     return cast<AtomicRMWInst>(I)->getOrdering() ==
26241            AtomicOrdering::SequentiallyConsistent;
26242   case Instruction::Store:
26243     return cast<StoreInst>(I)->getOrdering() ==
26244            AtomicOrdering::SequentiallyConsistent;
26245 }
26246}
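// When the hook above returns true, an explicit trailing fence is emitted
// after the seq_cst write, so the ordering does not rely on every reader
// using a Load-Acquire.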
26247
26248// Loads and stores less than 128 bits are already atomic; ones above that
26249// are doomed anyway, so defer to the default libcall and blame the OS when
26250// things go wrong.
26253 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
26254 if (Size != 128)
26256 if (isOpSuitableForRCPC3(SI))
26258 if (isOpSuitableForLSE128(SI))
26260 if (isOpSuitableForLDPSTP(SI))
26263}
26264
26265// Loads and stores less than 128 bits are already atomic; ones above that
26266// are doomed anyway, so defer to the default libcall and blame the OS when
26267// things go wrong.
26270 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
26271
26272 if (Size != 128)
26274 if (isOpSuitableForRCPC3(LI))
26276 // No LSE128 loads
26277 if (isOpSuitableForLDPSTP(LI))
26279
26280 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
26281 // implement atomicrmw without spilling. If the target address is also on the
26282 // stack and close enough to the spill slot, this can lead to a situation
26283 // where the monitor always gets cleared and the atomic operation can never
26284 // succeed. So at -O0 lower this operation to a CAS loop.
26285 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
26287
26288 // Using CAS for an atomic load has a better chance of succeeding under high
26289 // contention situations. So use it if available.
26290 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
26292}
26293
26294// The "default" for integer RMW operations is to expand to an LL/SC loop.
26295// However, with the LSE instructions (or outline-atomics mode, which provides
26296// library routines in place of the LSE instructions), we can directly emit many
26297// operations instead.
26298//
26299// Floating-point operations are always emitted to a cmpxchg loop, because they
26300// may trigger a trap which aborts an LLSC sequence.
26303 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
26304 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
26305
26306 if (AI->isFloatingPointOperation())
26308
26309 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
26313 if (CanUseLSE128)
26315
26316 // Nand is not supported in LSE.
26317 // Leave 128 bits to LLSC or CmpXChg.
26318 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
26319 if (Subtarget->hasLSE())
26321 if (Subtarget->outlineAtomics()) {
26322       // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
26323 // Don't outline them unless
26324 // (1) high level <atomic> support approved:
26325 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
26326 // (2) low level libgcc and compiler-rt support implemented by:
26327 // min/max outline atomics helpers
26328 if (AI->getOperation() != AtomicRMWInst::Min &&
26333 }
26334 }
26335 }
26336
26337 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
26338 // implement atomicrmw without spilling. If the target address is also on the
26339 // stack and close enough to the spill slot, this can lead to a situation
26340 // where the monitor always gets cleared and the atomic operation can never
26341 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
26342 // we have a single CAS instruction that can replace the loop.
26344 Subtarget->hasLSE())
26346
26348}
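// As a concrete example, with LSE a 32-bit "atomicrmw add ... seq_cst" is left
// alone here and typically selects to a single LDADDAL, whereas without LSE
// (and without outline-atomics) it is expanded to an LDAXR/STLXR loop.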
26349
26352 AtomicCmpXchgInst *AI) const {
26353 // If subtarget has LSE, leave cmpxchg intact for codegen.
26354 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
26356 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
26357 // implement cmpxchg without spilling. If the address being exchanged is also
26358 // on the stack and close enough to the spill slot, this can lead to a
26359 // situation where the monitor always gets cleared and the atomic operation
26360 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
26361 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
26363
26364 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
26365 // it.
26367 if (Size > 64)
26369
26371}
26372
26374 Type *ValueTy, Value *Addr,
26375 AtomicOrdering Ord) const {
26376 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
26377 bool IsAcquire = isAcquireOrStronger(Ord);
26378
26379   // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp and
26380   // ldaxp intrinsics must return {i64, i64} and we have to recombine them into a
26381 // single i128 here.
26382 if (ValueTy->getPrimitiveSizeInBits() == 128) {
26384 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
26386
26387 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
26388
26389 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
26390 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
26391 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
26392 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
26393 return Builder.CreateOr(
26394 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
26395 }
26396
26397 Type *Tys[] = { Addr->getType() };
26399 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
26400 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
26401
26402 const DataLayout &DL = M->getDataLayout();
26403 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
26404 CallInst *CI = Builder.CreateCall(Ldxr, Addr);
26405 CI->addParamAttr(
26406 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
26407 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
26408
26409 return Builder.CreateBitCast(Trunc, ValueTy);
26410}
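// For a 128-bit acquire load the code above produces IR along the lines of:
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
//   %lo   = extractvalue { i64, i64 } %lohi, 0
//   %hi   = extractvalue { i64, i64 } %lohi, 1
//   %val  = (zext %lo) | ((zext %hi) << 64)
// while narrower types go through a single ldxr/ldaxr call whose i64 result
// is truncated back to the requested width.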
26411
26413 IRBuilderBase &Builder) const {
26414 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
26415 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
26416}
26417
26419 Value *Val, Value *Addr,
26420 AtomicOrdering Ord) const {
26421 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
26422 bool IsRelease = isReleaseOrStronger(Ord);
26423
26424 // Since the intrinsics must have legal type, the i128 intrinsics take two
26425 // parameters: "i64, i64". We must marshal Val into the appropriate form
26426 // before the call.
26427 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
26429 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
26431 Type *Int64Ty = Type::getInt64Ty(M->getContext());
26432
26433 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
26434 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
26435 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
26436 }
26437
26439 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
26440 Type *Tys[] = { Addr->getType() };
26441 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
26442
26443 const DataLayout &DL = M->getDataLayout();
26444 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
26445 Val = Builder.CreateBitCast(Val, IntValTy);
26446
26447 CallInst *CI = Builder.CreateCall(
26448 Stxr, {Builder.CreateZExtOrBitCast(
26449 Val, Stxr->getFunctionType()->getParamType(0)),
26450 Addr});
26451 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
26452 Attribute::ElementType, Val->getType()));
26453 return CI;
26454}
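// The 128-bit release case above emits, roughly:
//   %lo = trunc i128 %val to i64
//   %hi = trunc i128 (%val >> 64) to i64
//   %ok = call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, ptr %addr)
// and the narrower cases call stxr/stlxr with the value zero-extended or
// bitcast to the intrinsic's i64 parameter.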
26455
26457 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
26458 const DataLayout &DL) const {
26459 if (!Ty->isArrayTy()) {
26460 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
26461 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
26462 }
26463
26464   // All non-aggregate members of the type must have the same type
26465 SmallVector<EVT> ValueVTs;
26466 ComputeValueVTs(*this, DL, Ty, ValueVTs);
26467 return all_equal(ValueVTs);
26468}
26469
26470bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
26471 EVT) const {
26472 return false;
26473}
26474
26475static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
26476 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
26477 Function *ThreadPointerFunc =
26478 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
26479 return IRB.CreatePointerCast(
26480 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
26481 Offset),
26482 IRB.getPtrTy(0));
26483}
26484
26486 // Android provides a fixed TLS slot for the stack cookie. See the definition
26487 // of TLS_SLOT_STACK_GUARD in
26488 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
26489 if (Subtarget->isTargetAndroid())
26490 return UseTlsOffset(IRB, 0x28);
26491
26492 // Fuchsia is similar.
26493 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
26494 if (Subtarget->isTargetFuchsia())
26495 return UseTlsOffset(IRB, -0x10);
26496
26498}
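// In effect the stack cookie is then loaded with something like
//   mrs x8, TPIDR_EL0
//   ldr x8, [x8, #0x28]   // Android; Fuchsia uses the #-0x10 slot
// rather than through a __stack_chk_guard global.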
26499
26501   // The MSVC CRT provides functionality for stack protection.
26502 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
26503     // The MSVC CRT has a global variable holding the security cookie.
26504 M.getOrInsertGlobal("__security_cookie",
26505 PointerType::getUnqual(M.getContext()));
26506
26507     // The MSVC CRT has a function to validate the security cookie.
26508 FunctionCallee SecurityCheckCookie =
26509 M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
26510 Type::getVoidTy(M.getContext()),
26511 PointerType::getUnqual(M.getContext()));
26512 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
26513 F->setCallingConv(CallingConv::Win64);
26514 F->addParamAttr(0, Attribute::AttrKind::InReg);
26515 }
26516 return;
26517 }
26519}
26520
26522   // The MSVC CRT has a global variable holding the security cookie.
26523 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26524 return M.getGlobalVariable("__security_cookie");
26526}
26527
26529   // The MSVC CRT has a function to validate the security cookie.
26530 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26531 return M.getFunction(Subtarget->getSecurityCheckCookieName());
26533}
26534
26535Value *
26537 // Android provides a fixed TLS slot for the SafeStack pointer. See the
26538 // definition of TLS_SLOT_SAFESTACK in
26539 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
26540 if (Subtarget->isTargetAndroid())
26541 return UseTlsOffset(IRB, 0x48);
26542
26543 // Fuchsia is similar.
26544 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
26545 if (Subtarget->isTargetFuchsia())
26546 return UseTlsOffset(IRB, -0x8);
26547
26549}
26550
26552 const Instruction &AndI) const {
26553 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
26554 // this is likely to be fold the and/cmp/br into a single tbz instruction. It
26555 // may be beneficial to sink in other cases, but we would have to check that
26556 // the cmp would not get folded into the br to form a cbz for these to be
26557 // beneficial.
26558 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
26559 if (!Mask)
26560 return false;
26561 return Mask->getValue().isPowerOf2();
26562}
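// For example, for "if ((x & 8) == 0)" sinking the 'and' next to the compare
// lets instruction selection emit a single TBZ/TBNZ on bit 3, whereas a
// non-power-of-two mask would still need a separate AND plus CBZ/CBNZ.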
26563
26567 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
26568 SelectionDAG &DAG) const {
26569   // Does the baseline recommend not performing the fold by default?
26571 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
26572 return false;
26573 // Else, if this is a vector shift, prefer 'shl'.
26574 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
26575}
26576
26579 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
26581 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
26584 ExpansionFactor);
26585}
26586
26588   // Update IsSplitCSR in AArch64FunctionInfo.
26589 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
26590 AFI->setIsSplitCSR(true);
26591}
26592
26594 MachineBasicBlock *Entry,
26595 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
26596 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
26597 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
26598 if (!IStart)
26599 return;
26600
26601 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
26602 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
26603 MachineBasicBlock::iterator MBBI = Entry->begin();
26604 for (const MCPhysReg *I = IStart; *I; ++I) {
26605 const TargetRegisterClass *RC = nullptr;
26606 if (AArch64::GPR64RegClass.contains(*I))
26607 RC = &AArch64::GPR64RegClass;
26608 else if (AArch64::FPR64RegClass.contains(*I))
26609 RC = &AArch64::FPR64RegClass;
26610 else
26611 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
26612
26613 Register NewVR = MRI->createVirtualRegister(RC);
26614 // Create copy from CSR to a virtual register.
26615 // FIXME: this currently does not emit CFI pseudo-instructions, it works
26616 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
26617 // nounwind. If we want to generalize this later, we may need to emit
26618 // CFI pseudo-instructions.
26619 assert(Entry->getParent()->getFunction().hasFnAttribute(
26620 Attribute::NoUnwind) &&
26621 "Function should be nounwind in insertCopiesSplitCSR!");
26622 Entry->addLiveIn(*I);
26623 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
26624 .addReg(*I);
26625
26626 // Insert the copy-back instructions right before the terminator.
26627 for (auto *Exit : Exits)
26628 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
26629 TII->get(TargetOpcode::COPY), *I)
26630 .addReg(NewVR);
26631 }
26632}
26633
26635 // Integer division on AArch64 is expensive. However, when aggressively
26636 // optimizing for code size, we prefer to use a div instruction, as it is
26637 // usually smaller than the alternative sequence.
26638 // The exception to this is vector division. Since AArch64 doesn't have vector
26639 // integer division, leaving the division as-is is a loss even in terms of
26640 // size, because it will have to be scalarized, while the alternative code
26641 // sequence can be performed in vector form.
26642 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
26643 return OptSize && !VT.isVector();
26644}
26645
26647 // We want inc-of-add for scalars and sub-of-not for vectors.
26648 return VT.isScalarInteger();
26649}
26650
26652 EVT VT) const {
26653 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
26654 // legalize.
26655 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
26656 return false;
26657 if (FPVT == MVT::v8bf16)
26658 return false;
26659 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
26660}
26661
26665 const TargetInstrInfo *TII) const {
26666 assert(MBBI->isCall() && MBBI->getCFIType() &&
26667 "Invalid call instruction for a KCFI check");
26668
26669 switch (MBBI->getOpcode()) {
26670 case AArch64::BLR:
26671 case AArch64::BLRNoIP:
26672 case AArch64::TCRETURNri:
26673 case AArch64::TCRETURNrix16x17:
26674 case AArch64::TCRETURNrix17:
26675 case AArch64::TCRETURNrinotx16:
26676 break;
26677 default:
26678 llvm_unreachable("Unexpected CFI call opcode");
26679 }
26680
26681 MachineOperand &Target = MBBI->getOperand(0);
26682 assert(Target.isReg() && "Invalid target operand for an indirect call");
26683 Target.setIsRenamable(false);
26684
26685 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
26686 .addReg(Target.getReg())
26687 .addImm(MBBI->getCFIType())
26688 .getInstr();
26689}
26690
26692 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
26693}
26694
26695unsigned
26697 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
26698 return getPointerTy(DL).getSizeInBits();
26699
26700 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
26701}
26702
26703void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
26704 MachineFrameInfo &MFI = MF.getFrameInfo();
26705 // If we have any vulnerable SVE stack objects then the stack protector
26706 // needs to be placed at the top of the SVE stack area, as the SVE locals
26707 // are placed above the other locals, so we allocate it as if it were a
26708 // scalable vector.
26709 // FIXME: It may be worthwhile having a specific interface for this rather
26710 // than doing it here in finalizeLowering.
26711 if (MFI.hasStackProtectorIndex()) {
26712 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
26718 break;
26719 }
26720 }
26721 }
26724}
26725
26726// Unlike X86, we let frame lowering assign offsets to all catch objects.
26728 return false;
26729}
26730
26731bool AArch64TargetLowering::shouldLocalize(
26732 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
26733 auto &MF = *MI.getMF();
26734 auto &MRI = MF.getRegInfo();
26735 auto maxUses = [](unsigned RematCost) {
26736 // A cost of 1 means remats are basically free.
26737 if (RematCost == 1)
26738 return std::numeric_limits<unsigned>::max();
26739 if (RematCost == 2)
26740 return 2U;
26741
26742 // Remat is too expensive, only sink if there's one user.
26743 if (RematCost > 2)
26744 return 1U;
26745 llvm_unreachable("Unexpected remat cost");
26746 };
26747
26748 unsigned Opc = MI.getOpcode();
26749 switch (Opc) {
26750 case TargetOpcode::G_GLOBAL_VALUE: {
26751 // On Darwin, TLS global vars get selected into function calls, which
26752     // we don't want localized, as they can get moved into the middle of
26753     // another call sequence.
26754 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
26755 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
26756 return false;
26757 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
26758 }
26759 case TargetOpcode::G_FCONSTANT:
26760 case TargetOpcode::G_CONSTANT: {
26761 const ConstantInt *CI;
26762 unsigned AdditionalCost = 0;
26763
26764 if (Opc == TargetOpcode::G_CONSTANT)
26765 CI = MI.getOperand(1).getCImm();
26766 else {
26767 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
26768 // We try to estimate cost of 32/64b fpimms, as they'll likely be
26769 // materialized as integers.
26770 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
26771 break;
26772 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
26773 bool OptForSize =
26776 OptForSize))
26777 return true; // Constant should be cheap.
26778 CI =
26779 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
26780 // FP materialization also costs an extra move, from gpr to fpr.
26781 AdditionalCost = 1;
26782 }
26783 APInt Imm = CI->getValue();
26786 assert(Cost.isValid() && "Expected a valid imm cost");
26787
26788 unsigned RematCost = *Cost.getValue();
26789 RematCost += AdditionalCost;
26790 Register Reg = MI.getOperand(0).getReg();
26791 unsigned MaxUses = maxUses(RematCost);
26792 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
26793 if (MaxUses == std::numeric_limits<unsigned>::max())
26794 --MaxUses;
26795 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
26796 }
26797 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
26798 // localizable.
26799 case AArch64::ADRP:
26800 case AArch64::G_ADD_LOW:
26801 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
26802 case TargetOpcode::G_PTR_ADD:
26803 return true;
26804 default:
26805 break;
26806 }
26808}
26809
26811 // Fallback for scalable vectors.
26812 // Note that if EnableSVEGISel is true, we allow scalable vector types for
26813 // all instructions, regardless of whether they are actually supported.
26814 if (!EnableSVEGISel) {
26815 if (Inst.getType()->isScalableTy()) {
26816 return true;
26817 }
26818
26819 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
26820 if (Inst.getOperand(i)->getType()->isScalableTy())
26821 return true;
26822
26823 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
26824 if (AI->getAllocatedType()->isScalableTy())
26825 return true;
26826 }
26827 }
26828
26829 // Checks to allow the use of SME instructions
26830 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
26831 auto CallerAttrs = SMEAttrs(*Inst.getFunction());
26832 auto CalleeAttrs = SMEAttrs(*Base);
26833 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
26834 CallerAttrs.requiresLazySave(CalleeAttrs) ||
26835 CallerAttrs.requiresPreservingZT0(CalleeAttrs))
26836 return true;
26837 }
26838 return false;
26839}
26840
26841// Return the largest legal scalable vector type that matches VT's element type.
26845 "Expected legal fixed length vector!");
26846 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26847 default:
26848 llvm_unreachable("unexpected element type for SVE container");
26849 case MVT::i8:
26850 return EVT(MVT::nxv16i8);
26851 case MVT::i16:
26852 return EVT(MVT::nxv8i16);
26853 case MVT::i32:
26854 return EVT(MVT::nxv4i32);
26855 case MVT::i64:
26856 return EVT(MVT::nxv2i64);
26857 case MVT::bf16:
26858 return EVT(MVT::nxv8bf16);
26859 case MVT::f16:
26860 return EVT(MVT::nxv8f16);
26861 case MVT::f32:
26862 return EVT(MVT::nxv4f32);
26863 case MVT::f64:
26864 return EVT(MVT::nxv2f64);
26865 }
26866}
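// For example, fixed length vectors such as v8f32 and v32i8 map to the packed
// containers nxv4f32 and nxv16i8 respectively, regardless of how wide the
// fixed length type itself is.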
26867
26868// Return a PTRUE with active lanes corresponding to the extent of VT.
26870 EVT VT) {
26873 "Expected legal fixed length vector!");
26874
26875 std::optional<unsigned> PgPattern =
26877 assert(PgPattern && "Unexpected element count for SVE predicate");
26878
26879 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
26880 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
26881 // variants of instructions when available.
26882 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
26883 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
26884 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
26885 if (MaxSVESize && MinSVESize == MaxSVESize &&
26886 MaxSVESize == VT.getSizeInBits())
26887 PgPattern = AArch64SVEPredPattern::all;
26888
26889 MVT MaskVT;
26890 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26891 default:
26892 llvm_unreachable("unexpected element type for SVE predicate");
26893 case MVT::i8:
26894 MaskVT = MVT::nxv16i1;
26895 break;
26896 case MVT::i16:
26897 case MVT::f16:
26898 case MVT::bf16:
26899 MaskVT = MVT::nxv8i1;
26900 break;
26901 case MVT::i32:
26902 case MVT::f32:
26903 MaskVT = MVT::nxv4i1;
26904 break;
26905 case MVT::i64:
26906 case MVT::f64:
26907 MaskVT = MVT::nxv2i1;
26908 break;
26909 }
26910
26911 return getPTrue(DAG, DL, MaskVT, *PgPattern);
26912}
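// For example, lowering a v8i32 operation uses an nxv4i1 predicate built as a
// PTRUE with the VL8 pattern, so only the first eight 32-bit lanes are active
// even if the hardware vector length is larger than 256 bits.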
26913
26915 EVT VT) {
26917 "Expected legal scalable vector!");
26918 auto PredTy = VT.changeVectorElementType(MVT::i1);
26919 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
26920}
26921
26923 if (VT.isFixedLengthVector())
26924 return getPredicateForFixedLengthVector(DAG, DL, VT);
26925
26926 return getPredicateForScalableVector(DAG, DL, VT);
26927}
26928
26929// Grow V to consume an entire SVE register.
26931 assert(VT.isScalableVector() &&
26932 "Expected to convert into a scalable vector!");
26933 assert(V.getValueType().isFixedLengthVector() &&
26934 "Expected a fixed length vector operand!");
26935 SDLoc DL(V);
26936 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26937 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
26938}
26939
26940// Shrink V so it's just big enough to maintain a VT's worth of data.
26943 "Expected to convert into a fixed length vector!");
26944 assert(V.getValueType().isScalableVector() &&
26945 "Expected a scalable vector operand!");
26946 SDLoc DL(V);
26947 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26948 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
26949}
26950
26951// Convert all fixed length vector loads larger than NEON to masked_loads.
26952SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
26953 SDValue Op, SelectionDAG &DAG) const {
26954 auto Load = cast<LoadSDNode>(Op);
26955
26956 SDLoc DL(Op);
26957 EVT VT = Op.getValueType();
26958 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26959 EVT LoadVT = ContainerVT;
26960 EVT MemVT = Load->getMemoryVT();
26961
26962 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26963
26964 if (VT.isFloatingPoint()) {
26965 LoadVT = ContainerVT.changeTypeToInteger();
26966 MemVT = MemVT.changeTypeToInteger();
26967 }
26968
26969 SDValue NewLoad = DAG.getMaskedLoad(
26970 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
26971 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
26972 Load->getAddressingMode(), Load->getExtensionType());
26973
26974 SDValue Result = NewLoad;
26975 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
26976 EVT ExtendVT = ContainerVT.changeVectorElementType(
26977 Load->getMemoryVT().getVectorElementType());
26978
26979 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
26981 Pg, Result, DAG.getUNDEF(ContainerVT));
26982 } else if (VT.isFloatingPoint()) {
26983 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
26984 }
26985
26986 Result = convertFromScalableVector(DAG, VT, Result);
26987 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
26988 return DAG.getMergeValues(MergedValues, DL);
26989}
26990
26992 SelectionDAG &DAG) {
26993 SDLoc DL(Mask);
26994 EVT InVT = Mask.getValueType();
26995 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26996
26997 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
26998
26999 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27000 return Pg;
27001
27002 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
27003 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
27004
27006 {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
27007}
27008
27009// Convert all fixed length vector masked loads larger than NEON to SVE masked loads.
27010SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
27011 SDValue Op, SelectionDAG &DAG) const {
27012 auto Load = cast<MaskedLoadSDNode>(Op);
27013
27014 SDLoc DL(Op);
27015 EVT VT = Op.getValueType();
27016 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27017
27018 SDValue Mask = Load->getMask();
27019 // If this is an extending load and the mask type is not the same as
27020   // the load's type, then we have to extend the mask type.
27021 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
27022 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
27023 "Incorrect mask type");
27024 Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask);
27025 }
27027
27028 SDValue PassThru;
27029 bool IsPassThruZeroOrUndef = false;
27030
27031 if (Load->getPassThru()->isUndef()) {
27032 PassThru = DAG.getUNDEF(ContainerVT);
27033 IsPassThruZeroOrUndef = true;
27034 } else {
27035 if (ContainerVT.isInteger())
27036 PassThru = DAG.getConstant(0, DL, ContainerVT);
27037 else
27038 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
27039 if (isZerosVector(Load->getPassThru().getNode()))
27040 IsPassThruZeroOrUndef = true;
27041 }
27042
27043 SDValue NewLoad = DAG.getMaskedLoad(
27044 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
27045 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
27046 Load->getAddressingMode(), Load->getExtensionType());
27047
27048 SDValue Result = NewLoad;
27049 if (!IsPassThruZeroOrUndef) {
27050 SDValue OldPassThru =
27051 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
27052 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
27053 }
27054
27055 Result = convertFromScalableVector(DAG, VT, Result);
27056 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
27057 return DAG.getMergeValues(MergedValues, DL);
27058}
27059
27060// Convert all fixed length vector stores larger than NEON to masked_stores.
27061SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
27062 SDValue Op, SelectionDAG &DAG) const {
27063 auto Store = cast<StoreSDNode>(Op);
27064
27065 SDLoc DL(Op);
27066 EVT VT = Store->getValue().getValueType();
27067 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27068 EVT MemVT = Store->getMemoryVT();
27069
27070 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27071 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
27072
27073 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
27074 EVT TruncVT = ContainerVT.changeVectorElementType(
27075 Store->getMemoryVT().getVectorElementType());
27076 MemVT = MemVT.changeTypeToInteger();
27077 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
27078 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
27079 DAG.getUNDEF(TruncVT));
27080 NewValue =
27081 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
27082 } else if (VT.isFloatingPoint()) {
27083 MemVT = MemVT.changeTypeToInteger();
27084 NewValue =
27085 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
27086 }
27087
27088 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
27089 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
27090 Store->getMemOperand(), Store->getAddressingMode(),
27091 Store->isTruncatingStore());
27092}
27093
27094SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
27095 SDValue Op, SelectionDAG &DAG) const {
27096 auto *Store = cast<MaskedStoreSDNode>(Op);
27097
27098 SDLoc DL(Op);
27099 EVT VT = Store->getValue().getValueType();
27100 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27101
27102 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
27104
27105 return DAG.getMaskedStore(
27106 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
27107 Mask, Store->getMemoryVT(), Store->getMemOperand(),
27108 Store->getAddressingMode(), Store->isTruncatingStore());
27109}
27110
27111SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
27112 SDValue Op, SelectionDAG &DAG) const {
27113 SDLoc dl(Op);
27114 EVT VT = Op.getValueType();
27115 EVT EltVT = VT.getVectorElementType();
27116
27117 bool Signed = Op.getOpcode() == ISD::SDIV;
27118 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
27119
27120 bool Negated;
27121 uint64_t SplatVal;
27122 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
27123 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27124 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
27125 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
27126
27127 SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
27128 SDValue Res =
27129 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
27130 if (Negated)
27131 Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
27132 DAG.getConstant(0, dl, ContainerVT), Res);
27133
27134 return convertFromScalableVector(DAG, VT, Res);
27135 }
27136
27137 // Scalable vector i32/i64 DIV is supported.
27138 if (EltVT == MVT::i32 || EltVT == MVT::i64)
27139 return LowerToPredicatedOp(Op, DAG, PredOpcode);
27140
27141 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
27142 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
27143 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
27144 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27145
27146 // If the wider type is legal: extend, op, and truncate.
27147 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
27148 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
27149 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
27150 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
27151 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
27152 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
27153 }
27154
27155 auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
27156 &ExtendOpcode](SDValue Op) {
27157 SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
27158 SDValue IdxHalf =
27159 DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
27160 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
27161 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
27162 return std::pair<SDValue, SDValue>(
27163 {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
27164 DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
27165 };
27166
27167 // If wider type is not legal: split, extend, op, trunc and concat.
27168 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
27169 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
27170 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
27171 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
27172 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
27173 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
27174 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
27175}
27176
27177SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
27178 SDValue Op, SelectionDAG &DAG) const {
27179 EVT VT = Op.getValueType();
27180 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27181
27182 SDLoc DL(Op);
27183 SDValue Val = Op.getOperand(0);
27184 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
27185 Val = convertToScalableVector(DAG, ContainerVT, Val);
27186
27187 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
27188 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
27189
27190 // Repeatedly unpack Val until the result is of the desired element type.
27191 switch (ContainerVT.getSimpleVT().SimpleTy) {
27192 default:
27193 llvm_unreachable("unimplemented container type");
27194 case MVT::nxv16i8:
27195 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
27196 if (VT.getVectorElementType() == MVT::i16)
27197 break;
27198 [[fallthrough]];
27199 case MVT::nxv8i16:
27200 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
27201 if (VT.getVectorElementType() == MVT::i32)
27202 break;
27203 [[fallthrough]];
27204 case MVT::nxv4i32:
27205 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
27206 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
27207 break;
27208 }
27209
27210 return convertFromScalableVector(DAG, VT, Val);
27211}
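// For example, sign extending v4i16 to v4i64 performs two unpacks:
//   nxv8i16 --SUNPKLO--> nxv4i32 --SUNPKLO--> nxv2i64
// with the fixed length result extracted from the low lanes afterwards.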
27212
27213SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
27214 SDValue Op, SelectionDAG &DAG) const {
27215 EVT VT = Op.getValueType();
27216 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27217
27218 SDLoc DL(Op);
27219 SDValue Val = Op.getOperand(0);
27220 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
27221 Val = convertToScalableVector(DAG, ContainerVT, Val);
27222
27223 // Repeatedly truncate Val until the result is of the desired element type.
27224 switch (ContainerVT.getSimpleVT().SimpleTy) {
27225 default:
27226 llvm_unreachable("unimplemented container type");
27227 case MVT::nxv2i64:
27228 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
27229 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
27230 if (VT.getVectorElementType() == MVT::i32)
27231 break;
27232 [[fallthrough]];
27233 case MVT::nxv4i32:
27234 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
27235 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
27236 if (VT.getVectorElementType() == MVT::i16)
27237 break;
27238 [[fallthrough]];
27239 case MVT::nxv8i16:
27240 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
27241 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
27242 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
27243 break;
27244 }
27245
27246 return convertFromScalableVector(DAG, VT, Val);
27247}
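// For example, truncating v2i64 to v2i8 performs three bitcast+UZP1 steps
// (nxv4i32, nxv8i16, nxv16i8), each one discarding the high half of every
// element.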
27248
27249SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
27250 SDValue Op, SelectionDAG &DAG) const {
27251 EVT VT = Op.getValueType();
27252 EVT InVT = Op.getOperand(0).getValueType();
27253 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
27254
27255 SDLoc DL(Op);
27256 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27257 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
27258
27259 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
27260}
27261
27262SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
27263 SDValue Op, SelectionDAG &DAG) const {
27264 EVT VT = Op.getValueType();
27265 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27266
27267 SDLoc DL(Op);
27268 EVT InVT = Op.getOperand(0).getValueType();
27269 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27270 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
27271
27272 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
27273 Op.getOperand(1), Op.getOperand(2));
27274
27275 return convertFromScalableVector(DAG, VT, ScalableRes);
27276}
27277
27278// Convert vector operation 'Op' to an equivalent predicated operation whereby
27279// the original operation's type is used to construct a suitable predicate.
27280// NOTE: The results for inactive lanes are undefined.
27281SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
27282 SelectionDAG &DAG,
27283 unsigned NewOp) const {
27284 EVT VT = Op.getValueType();
27285 SDLoc DL(Op);
27286 auto Pg = getPredicateForVector(DAG, DL, VT);
27287
27288 if (VT.isFixedLengthVector()) {
27289 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
27290 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27291
27292 // Create list of operands by converting existing ones to scalable types.
27294 for (const SDValue &V : Op->op_values()) {
27295 if (isa<CondCodeSDNode>(V)) {
27296 Operands.push_back(V);
27297 continue;
27298 }
27299
27300 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
27301 EVT VTArg = VTNode->getVT().getVectorElementType();
27302 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
27303 Operands.push_back(DAG.getValueType(NewVTArg));
27304 continue;
27305 }
27306
27307 assert(isTypeLegal(V.getValueType()) &&
27308 "Expected only legal fixed-width types");
27309 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
27310 }
27311
27312 if (isMergePassthruOpcode(NewOp))
27313 Operands.push_back(DAG.getUNDEF(ContainerVT));
27314
27315 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
27316 return convertFromScalableVector(DAG, VT, ScalableRes);
27317 }
27318
27319 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
27320
27322 for (const SDValue &V : Op->op_values()) {
27323 assert((!V.getValueType().isVector() ||
27324 V.getValueType().isScalableVector()) &&
27325 "Only scalable vectors are supported!");
27326 Operands.push_back(V);
27327 }
27328
27329 if (isMergePassthruOpcode(NewOp))
27330 Operands.push_back(DAG.getUNDEF(VT));
27331
27332 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
27333}
27334
27335// If a fixed length vector operation has no side effects when applied to
27336// undefined elements, we can safely use scalable vectors to perform the same
27337// operation without needing to worry about predication.
27338SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
27339 SelectionDAG &DAG) const {
27340 EVT VT = Op.getValueType();
27342 "Only expected to lower fixed length vector operation!");
27343 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27344
27345 // Create list of operands by converting existing ones to scalable types.
27347 for (const SDValue &V : Op->op_values()) {
27348 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
27349
27350 // Pass through non-vector operands.
27351 if (!V.getValueType().isVector()) {
27352 Ops.push_back(V);
27353 continue;
27354 }
27355
27356 // "cast" fixed length vector to a scalable vector.
27357 assert(V.getValueType().isFixedLengthVector() &&
27358 isTypeLegal(V.getValueType()) &&
27359 "Only fixed length vectors are supported!");
27360 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
27361 }
27362
27363 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
27364 return convertFromScalableVector(DAG, VT, ScalableRes);
27365}
27366
27367SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
27368 SelectionDAG &DAG) const {
27369 SDLoc DL(ScalarOp);
27370 SDValue AccOp = ScalarOp.getOperand(0);
27371 SDValue VecOp = ScalarOp.getOperand(1);
27372 EVT SrcVT = VecOp.getValueType();
27373 EVT ResVT = SrcVT.getVectorElementType();
27374
27375 EVT ContainerVT = SrcVT;
27376 if (SrcVT.isFixedLengthVector()) {
27377 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
27378 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
27379 }
27380
27381 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
27382 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
27383
27384   // Convert the scalar accumulator to a scalable vector by inserting it at lane 0.
27385 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
27386 DAG.getUNDEF(ContainerVT), AccOp, Zero);
27387
27388 // Perform reduction.
27389 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
27390 Pg, AccOp, VecOp);
27391
27392 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
27393}
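// FADDA is the strictly ordered SVE reduction: it folds the elements into the
// accumulator one lane at a time, matching the non-reassociating semantics of
// VECREDUCE_SEQ_FADD.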
27394
27395SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
27396 SelectionDAG &DAG) const {
27397 SDLoc DL(ReduceOp);
27398 SDValue Op = ReduceOp.getOperand(0);
27399 EVT OpVT = Op.getValueType();
27400 EVT VT = ReduceOp.getValueType();
27401
27402 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
27403 return SDValue();
27404
27405 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
27406
27407 switch (ReduceOp.getOpcode()) {
27408 default:
27409 return SDValue();
27410 case ISD::VECREDUCE_OR:
27411 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
27412 // The predicate can be 'Op' because
27413 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
27414 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
27415 else
27416 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
27417 case ISD::VECREDUCE_AND: {
27418 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
27419 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
27420 }
27421 case ISD::VECREDUCE_XOR: {
27422 SDValue ID =
27423 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
27424 if (OpVT == MVT::nxv1i1) {
27425 // Emulate a CNTP on .Q using .D and a different governing predicate.
27426 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
27427 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
27428 }
27429 SDValue Cntp =
27430 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
27431 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
27432 }
27433 }
27434
27435 return SDValue();
27436}
27437
27438SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
27439 SDValue ScalarOp,
27440 SelectionDAG &DAG) const {
27441 SDLoc DL(ScalarOp);
27442 SDValue VecOp = ScalarOp.getOperand(0);
27443 EVT SrcVT = VecOp.getValueType();
27444
27446 SrcVT,
27447 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
27448 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
27449 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
27450 }
27451
27452 // UADDV always returns an i64 result.
27453 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
27454 SrcVT.getVectorElementType();
27455 EVT RdxVT = SrcVT;
27456 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
27457 RdxVT = getPackedSVEVectorVT(ResVT);
27458
27459 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
27460 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
27461 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
27462 Rdx, DAG.getConstant(0, DL, MVT::i64));
27463
27464   // The VEC_REDUCE nodes expect a result the size of the vector's element type.
27465 if (ResVT != ScalarOp.getValueType())
27466 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
27467
27468 return Res;
27469}
27470
27471SDValue
27472AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
27473 SelectionDAG &DAG) const {
27474 EVT VT = Op.getValueType();
27475 SDLoc DL(Op);
27476
27477 EVT InVT = Op.getOperand(1).getValueType();
27478 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27479 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
27480 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
27481
27482   // Convert the mask to a predicate (NOTE: We don't need to worry about
27483 // inactive lanes since VSELECT is safe when given undefined elements).
27484 EVT MaskVT = Op.getOperand(0).getValueType();
27485 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
27486 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
27488 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
27489
27490 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
27491 Mask, Op1, Op2);
27492
27493 return convertFromScalableVector(DAG, VT, ScalableRes);
27494}
27495
27496SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
27497 SDValue Op, SelectionDAG &DAG) const {
27498 SDLoc DL(Op);
27499 EVT InVT = Op.getOperand(0).getValueType();
27500 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27501
27502 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
27503 "Only expected to lower fixed length vector operation!");
27504 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
27505 "Expected integer result of the same bit length as the inputs!");
27506
27507 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
27508 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
27509 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
27510
27511 EVT CmpVT = Pg.getValueType();
27512 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
27513 {Pg, Op1, Op2, Op.getOperand(2)});
27514
27515 EVT PromoteVT = ContainerVT.changeTypeToInteger();
27516 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
27517 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
27518}
27519
27520SDValue
27521AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
27522 SelectionDAG &DAG) const {
27523 SDLoc DL(Op);
27524 auto SrcOp = Op.getOperand(0);
27525 EVT VT = Op.getValueType();
27526 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27527 EVT ContainerSrcVT =
27528 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
27529
27530 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
27531 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
27532 return convertFromScalableVector(DAG, VT, Op);
27533}
27534
27535SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
27536 SDValue Op, SelectionDAG &DAG) const {
27537 SDLoc DL(Op);
27538 unsigned NumOperands = Op->getNumOperands();
27539
27540 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
27541 "Unexpected number of operands in CONCAT_VECTORS");
27542
27543 auto SrcOp1 = Op.getOperand(0);
27544 auto SrcOp2 = Op.getOperand(1);
27545 EVT VT = Op.getValueType();
27546 EVT SrcVT = SrcOp1.getValueType();
27547
27548 if (NumOperands > 2) {
27550 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
27551 for (unsigned I = 0; I < NumOperands; I += 2)
27552 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
27553 Op->getOperand(I), Op->getOperand(I + 1)));
27554
27555 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
27556 }
27557
27558 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27559
27561 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
27562 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
27563
27564 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
27565
27566 return convertFromScalableVector(DAG, VT, Op);
27567}
27568
27569SDValue
27570AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
27571 SelectionDAG &DAG) const {
27572 EVT VT = Op.getValueType();
27573 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27574
27575 SDLoc DL(Op);
27576 SDValue Val = Op.getOperand(0);
27577 SDValue Pg = getPredicateForVector(DAG, DL, VT);
27578 EVT SrcVT = Val.getValueType();
27579 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27580 EVT ExtendVT = ContainerVT.changeVectorElementType(
27581 SrcVT.getVectorElementType());
27582
27583 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27584 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
27585
27586 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
27587 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
27588 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
27589 Pg, Val, DAG.getUNDEF(ContainerVT));
27590
27591 return convertFromScalableVector(DAG, VT, Val);
27592}
27593
27594SDValue
27595AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
27596 SelectionDAG &DAG) const {
27597 EVT VT = Op.getValueType();
27598 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27599
27600 SDLoc DL(Op);
27601 SDValue Val = Op.getOperand(0);
27602 EVT SrcVT = Val.getValueType();
27603 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27604 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
27606 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
27607
27608 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27609 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
27610 Op.getOperand(1), DAG.getUNDEF(RoundVT));
27611 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
27612 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27613
27614 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27615 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27616}
27617
27618SDValue
27619AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
27620 SelectionDAG &DAG) const {
27621 EVT VT = Op.getValueType();
27622 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27623
27624 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
27625 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
27627
27628 SDLoc DL(Op);
27629 SDValue Val = Op.getOperand(0);
27630 EVT SrcVT = Val.getValueType();
27631 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27632 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27633
27634 if (VT.bitsGE(SrcVT)) {
27636
27637 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
27638 VT.changeTypeToInteger(), Val);
27639
27640     // It is safe to use an operand wider than specified because promoting the
27641     // value changes nothing from an arithmetic point of view.
27642 Val =
27643 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
27644 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27645 DAG.getUNDEF(ContainerDstVT));
27646 return convertFromScalableVector(DAG, VT, Val);
27647 } else {
27648 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
27649 ContainerDstVT.getVectorElementType());
27651
27652 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27653 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27654 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
27655 Val = convertFromScalableVector(DAG, SrcVT, Val);
27656
27657 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27658 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27659 }
27660}
27661
27662SDValue
27663AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
27664 SelectionDAG &DAG) const {
27665 SDLoc DL(Op);
27666 EVT OpVT = Op.getValueType();
27667 assert(OpVT.isScalableVector() &&
27668 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
27669 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
27670 Op.getOperand(1));
27671 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
27672 Op.getOperand(1));
27673 return DAG.getMergeValues({Even, Odd}, DL);
27674}
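// For example, with interleaved inputs Op0 = <a0 b0 a1 b1> and
// Op1 = <a2 b2 a3 b3>, UZP1 collects the even lanes <a0 a1 a2 a3> and UZP2 the
// odd lanes <b0 b1 b2 b3>, which is exactly a two-way deinterleave.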
27675
27676SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
27677 SelectionDAG &DAG) const {
27678 SDLoc DL(Op);
27679 EVT OpVT = Op.getValueType();
27680 assert(OpVT.isScalableVector() &&
27681 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
27682
27683 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
27684 Op.getOperand(1));
27685 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
27686 Op.getOperand(1));
27687 return DAG.getMergeValues({Lo, Hi}, DL);
27688}
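// Conversely, ZIP1/ZIP2 interleave the two inputs, e.g.
//   ZIP1(<a0 a1 a2 a3>, <b0 b1 b2 b3>) = <a0 b0 a1 b1>
//   ZIP2(<a0 a1 a2 a3>, <b0 b1 b2 b3>) = <a2 b2 a3 b3>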
27689
27690SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
27691 SelectionDAG &DAG) const {
27692 // FIXME: Maybe share some code with LowerMGather/Scatter?
27693 MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
27694 SDLoc DL(HG);
27695 SDValue Chain = HG->getChain();
27696 SDValue Inc = HG->getInc();
27697 SDValue Mask = HG->getMask();
27698 SDValue Ptr = HG->getBasePtr();
27699 SDValue Index = HG->getIndex();
27700 SDValue Scale = HG->getScale();
27701 SDValue IntID = HG->getIntID();
27702
27703 // The Intrinsic ID determines the type of update operation.
27704 [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
27705 // Right now, we only support 'add' as an update.
27706 assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
27707 "Unexpected histogram update operation");
27708
27709 EVT IncVT = Inc.getValueType();
27710 EVT IndexVT = Index.getValueType();
27711 EVT MemVT = EVT::getVectorVT(*DAG.getContext(), IncVT,
27712 IndexVT.getVectorElementCount());
27713 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
27714 SDValue PassThru = DAG.getSplatVector(MemVT, DL, Zero);
27715 SDValue IncSplat = DAG.getSplatVector(MemVT, DL, Inc);
27716 SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
27717
27718 MachineMemOperand *MMO = HG->getMemOperand();
27719 // Create an MMO for the gather, without load|store flags.
27720 MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
27721 MMO->getPointerInfo(), MachineMemOperand::MOLoad, MMO->getSize(),
27722 MMO->getAlign(), MMO->getAAInfo());
27723 ISD::MemIndexType IndexType = HG->getIndexType();
27724 SDValue Gather =
27725 DAG.getMaskedGather(DAG.getVTList(MemVT, MVT::Other), MemVT, DL, Ops,
27726 GMMO, IndexType, ISD::NON_EXTLOAD);
27727
27728 SDValue GChain = Gather.getValue(1);
27729
27730 // Perform the histcnt, multiply by inc, add to bucket data.
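// That is, NewBucketData[i] = GatheredBucketData[i] + Inc * HistCnt[i]; the
// updated values are written back by the masked scatter below.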
27731 SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncVT);
27732 SDValue HistCnt =
27733 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
27734 SDValue Mul = DAG.getNode(ISD::MUL, DL, MemVT, HistCnt, IncSplat);
27735 SDValue Add = DAG.getNode(ISD::ADD, DL, MemVT, Gather, Mul);
27736
27737 // Create an MMO for the scatter, without load|store flags.
27738 MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
27739 MMO->getPointerInfo(), MachineMemOperand::MOStore, MMO->getSize(),
27740 MMO->getAlign(), MMO->getAAInfo());
27741
27742 SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
27743 SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
27744 ScatterOps, SMMO, IndexType, false);
27745 return Scatter;
27746}
27747
27748SDValue
27749AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
27750 SelectionDAG &DAG) const {
27751 EVT VT = Op.getValueType();
27752 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27753
27754 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
27755 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
27756 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
27757
27758 SDLoc DL(Op);
27759 SDValue Val = Op.getOperand(0);
27760 EVT SrcVT = Val.getValueType();
27761 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27762 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27763
27764 if (VT.bitsGT(SrcVT)) {
27765 EVT CvtVT = ContainerDstVT.changeVectorElementType(
27766 ContainerSrcVT.getVectorElementType());
27767 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27768
27769 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27770 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
27771
27772 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
27773 Val = getSVESafeBitCast(CvtVT, Val, DAG);
27774 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27775 DAG.getUNDEF(ContainerDstVT));
27776 return convertFromScalableVector(DAG, VT, Val);
27777 } else {
27778 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
27779 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
27780
27781 // Safe to use a larger than specified result since an fp_to_int where the
27782 // result doesn't fit into the destination is undefined.
27783 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27784 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27785 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27786
27787 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
27788 }
27789}
27790
27791 static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
27792 ArrayRef<int> ShuffleMask, EVT VT,
27793 EVT ContainerVT, SelectionDAG &DAG) {
27794 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
27795 SDLoc DL(Op);
27796 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
27797 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
27798 bool IsSingleOp =
27799 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
27800
27801 if (!Subtarget.isNeonAvailable() && !MinSVESize)
27802 MinSVESize = 128;
27803
27804 // Bail out on two-operand shuffles if SVE2 is unavailable or not all
27805 // index values can be represented.
27806 if (!IsSingleOp && !Subtarget.hasSVE2())
27807 return SDValue();
27808
27809 EVT VTOp1 = Op.getOperand(0).getValueType();
27810 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
27811 unsigned IndexLen = MinSVESize / BitsPerElt;
27812 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
27813 uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
27814 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
27815 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
27816 bool MinMaxEqual = (MinSVESize == MaxSVESize);
27817 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
27818 "Incorrectly legalised shuffle operation");
27819
27821 // If MinSVESize is not equal to MaxSVESize then we need to know which
27822 // TBL mask element needs adjustment.
27823 SmallVector<SDValue, 8> AddRuntimeVLMask;
27824
27825 // Bail out for 8-bit element types, because with a 2048-bit SVE register
27826 // size 8 bits are only sufficient to index into the first source vector.
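// For example, with i8 elements and an unknown register size, lane 0 of the
// second operand would need TBL index 256 on a 2048-bit implementation, which
// does not fit in an 8-bit mask element.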
27827 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
27828 return SDValue();
27829
27830 for (int Index : ShuffleMask) {
27831 // Handle poison index values.
27832 if (Index < 0)
27833 Index = 0;
27834 // If the mask refers to elements in the second operand, then we have to
27835 // offset the index by the number of elements in a vector. If this number
27836 // is not known at compile-time, we need to maintain a mask with 'VL' values
27837 // to add at runtime.
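// For example, with v8i16 operands and a known 256-bit register (IndexLen of
// 16), a mask value of 9 (lane 1 of the second operand) becomes 9 + (16 - 8),
// i.e. 17.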
27838 if ((unsigned)Index >= ElementsPerVectorReg) {
27839 if (MinMaxEqual) {
27840 Index += IndexLen - ElementsPerVectorReg;
27841 } else {
27842 Index = Index - ElementsPerVectorReg;
27843 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
27844 }
27845 } else if (!MinMaxEqual)
27846 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27847 // For 8-bit elements with 1024-bit SVE registers, MaxOffset equals 255 and
27848 // might point to the last element of the second operand of the
27849 // shufflevector, so we reject this transform.
27850 if ((unsigned)Index >= MaxOffset)
27851 return SDValue();
27852 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
27853 }
27854
27855 // Pad the mask with out-of-range indices so the extra lanes are zeroed,
27856 // whereas padding with index zero would duplicate the first lane into them.
27857 // Note that for i8 elements an "out-of-range" index can still be a valid
27858 // index for a 2048-bit vector register.
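// For i16 elements, for instance, the padding value is 0xFFFF, which no
// supported register size can index, so those lanes always read as zero.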
27859 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
27860 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
27861 if (!MinMaxEqual)
27862 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27863 }
27864
27865 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
27866 SDValue VecMask =
27867 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27868 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
27869
27870 SDValue Shuffle;
27871 if (IsSingleOp)
27872 Shuffle =
27873 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27874 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
27875 Op1, SVEMask);
27876 else if (Subtarget.hasSVE2()) {
27877 if (!MinMaxEqual) {
27878 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
27879 SDValue VScale = (BitsPerElt == 64)
27880 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
27881 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
27882 SDValue VecMask =
27883 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27884 SDValue MulByMask = DAG.getNode(
27885 ISD::MUL, DL, MaskType,
27886 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
27887 DAG.getBuildVector(MaskType, DL,
27888 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
27889 SDValue UpdatedVecMask =
27890 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
27891 SVEMask = convertToScalableVector(
27892 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
27893 }
27894 Shuffle =
27895 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27896 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
27897 Op1, Op2, SVEMask);
27898 }
27899 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
27900 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
27901}
27902
27903SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
27904 SDValue Op, SelectionDAG &DAG) const {
27905 EVT VT = Op.getValueType();
27906 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27907
27908 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
27909 auto ShuffleMask = SVN->getMask();
27910
27911 SDLoc DL(Op);
27912 SDValue Op1 = Op.getOperand(0);
27913 SDValue Op2 = Op.getOperand(1);
27914
27915 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27916 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
27917 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
27918
27919 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
27920 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
27921 return MVT::i32;
27922 return ScalarTy;
27923 };
27924
27925 if (SVN->isSplat()) {
27926 unsigned Lane = std::max(0, SVN->getSplatIndex());
27927 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27928 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27929 DAG.getConstant(Lane, DL, MVT::i64));
27930 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
27931 return convertFromScalableVector(DAG, VT, Op);
27932 }
27933
27934 bool ReverseEXT = false;
27935 unsigned Imm;
27936 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
27937 Imm == VT.getVectorNumElements() - 1) {
27938 if (ReverseEXT)
27939 std::swap(Op1, Op2);
27940 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27941 SDValue Scalar = DAG.getNode(
27942 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27943 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
27944 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
27945 return convertFromScalableVector(DAG, VT, Op);
27946 }
27947
27948 unsigned EltSize = VT.getScalarSizeInBits();
27949 for (unsigned LaneSize : {64U, 32U, 16U}) {
27950 if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), LaneSize)) {
27951 EVT NewVT =
27952 getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
27953 unsigned RevOp;
27954 if (EltSize == 8)
27955 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
27956 else if (EltSize == 16)
27957 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
27958 else
27959 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
27960
27961 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27962 Op = LowerToPredicatedOp(Op, DAG, RevOp);
27963 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27964 return convertFromScalableVector(DAG, VT, Op);
27965 }
27966 }
27967
27968 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
27969 isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
27970 if (!VT.isFloatingPoint())
27971 return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27972
27973 EVT NewVT = getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), 64));
27974 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27975 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27976 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27977 return convertFromScalableVector(DAG, VT, Op);
27978 }
27979
27980 unsigned WhichResult;
27981 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
27982 WhichResult == 0)
27983 return convertFromScalableVector(
27984 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
27985
27986 if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
27987 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27988 return convertFromScalableVector(
27989 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
27990 }
27991
27992 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
27993 return convertFromScalableVector(
27994 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
27995
27996 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
27997 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27998 return convertFromScalableVector(
27999 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
28000 }
28001
28002 // Functions like isZIPMask return true when a ISD::VECTOR_SHUFFLE's mask
28003 // represents the same logical operation as performed by a ZIP instruction. In
28004 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
28005 // equivalent to an AArch64 instruction. There's the extra component of
28006 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
28007 // only operated on 64/128-bit vector types that have a direct mapping to a
28008 // target register and so an exact mapping is implied.
28009 // However, when using SVE for fixed length vectors, most legal vector types
28010 // are actually sub-vectors of a larger SVE register. When mapping
28011 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
28012 // how the mask's indices translate. Specifically, when the mapping requires
28013 // an exact meaning for a specific vector index (e.g. Index X is the last
28014 // vector element in the register) then such mappings are often only safe when
28015 // the exact SVE register size is known. The main exception to this is when
28016 // indices are logically relative to the first element of either
28017 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
28018 // when converting from fixed-length to scalable vector types (i.e. the start
28019 // of a fixed length vector is always the start of a scalable vector).
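// For example, the ZIP1 and TRN lowerings above only rely on positions
// relative to the start of each operand, whereas VECTOR_REVERSE, ZIP2 and the
// UZP patterns need to know where the fixed-length data ends, so they are only
// used below when the exact register size is known to match VT.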
28020 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
28021 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
28022 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
28023 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
28024 Op2.isUndef()) {
28025 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
28026 return convertFromScalableVector(DAG, VT, Op);
28027 }
28028
28029 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
28030 WhichResult != 0)
28031 return convertFromScalableVector(
28032 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
28033
28034 if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
28035 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
28036 return convertFromScalableVector(
28037 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
28038 }
28039
28040 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
28041 return convertFromScalableVector(
28042 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
28043
28044 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
28045 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
28046 return convertFromScalableVector(
28047 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
28048 }
28049 }
28050
28051 // Avoid producing a TBL instruction if we don't know the minimal SVE register
28052 // size, unless NEON is not available, in which case we can assume the minimal
28053 // SVE register size is 128 bits.
28054 if (MinSVESize || !Subtarget->isNeonAvailable())
28055 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
28056 DAG);
28057
28058 return SDValue();
28059}
28060
28061SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
28062 SelectionDAG &DAG) const {
28063 SDLoc DL(Op);
28064 EVT InVT = Op.getValueType();
28065
28066 assert(VT.isScalableVector() && isTypeLegal(VT) &&
28067 InVT.isScalableVector() && isTypeLegal(InVT) &&
28068 "Only expect to cast between legal scalable vector types!");
28069 assert(VT.getVectorElementType() != MVT::i1 &&
28070 InVT.getVectorElementType() != MVT::i1 &&
28071 "For predicate bitcasts, use getSVEPredicateBitCast");
28072
28073 if (InVT == VT)
28074 return Op;
28075
28076 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
28077 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
28078
28079 // Safe bitcasting between unpacked vector types of different element counts
28080 // is currently unsupported because the following is missing the necessary
28081 // work to ensure the result's elements live where they're supposed to within
28082 // an SVE register.
28083 // 01234567
28084 // e.g. nxv2i32 = XX??XX??
28085 // nxv4f16 = X?X?X?X?
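// For example, casting nxv2f32 to nxv2i64 first reinterprets the unpacked
// nxv2f32 as the packed nxv4f32, then bitcasts that to nxv2i64.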
28086 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
28087 VT == PackedVT || InVT == PackedInVT) &&
28088 "Unexpected bitcast!");
28089
28090 // Pack input if required.
28091 if (InVT != PackedInVT)
28092 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
28093
28094 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
28095
28096 // Unpack result if required.
28097 if (VT != PackedVT)
28098 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
28099
28100 return Op;
28101}
28102
28103 bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
28104 SDValue N) const {
28105 return ::isAllActivePredicate(DAG, N);
28106}
28107
28108 EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
28109 return ::getPromotedVTForPredicate(VT);
28110}
28111
28112bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
28113 SDValue Op, const APInt &OriginalDemandedBits,
28114 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
28115 unsigned Depth) const {
28116
28117 unsigned Opc = Op.getOpcode();
28118 switch (Opc) {
28119 case AArch64ISD::VSHL: {
28120 // Match (VSHL (VLSHR Val X) X)
28121 SDValue ShiftL = Op;
28122 SDValue ShiftR = Op->getOperand(0);
28123 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
28124 return false;
28125
28126 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
28127 return false;
28128
28129 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
28130 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
28131
28132 // Other cases can be handled as well, but this is not
28133 // implemented.
28134 if (ShiftRBits != ShiftLBits)
28135 return false;
28136
28137 unsigned ScalarSize = Op.getScalarValueSizeInBits();
28138 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
28139
28140 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
28141 APInt UnusedBits = ~OriginalDemandedBits;
28142
28143 if ((ZeroBits & UnusedBits) != ZeroBits)
28144 return false;
28145
28146 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
28147 // used - simplify to just Val.
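// For example, with v8i16 operands and a shift amount of 8, the VLSHR/VSHL
// pair clears the low 8 bits of each lane; if none of those bits are demanded
// the shifts can be dropped.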
28148 return TLO.CombineTo(Op, ShiftR->getOperand(0));
28149 }
28150 case AArch64ISD::BICi: {
28151 // Fold BICi if all destination bits already known to be zeroed
28152 SDValue Op0 = Op.getOperand(0);
28153 KnownBits KnownOp0 =
28154 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
28155 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
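// For example, BICi(x, 0xff, 8) clears bits 8..15 of every lane; if those
// bits are already known to be zero in Op0, the BICi is a no-op.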
28156 uint64_t BitsToClear = Op->getConstantOperandVal(1)
28157 << Op->getConstantOperandVal(2);
28158 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
28159 if (APInt(Known.getBitWidth(), BitsToClear)
28160 .isSubsetOf(AlreadyZeroedBitsToClear))
28161 return TLO.CombineTo(Op, Op0);
28162
28163 Known = KnownOp0 &
28164 KnownBits::makeConstant(APInt(Known.getBitWidth(), ~BitsToClear));
28165
28166 return false;
28167 }
28168 case ISD::INTRINSIC_WO_CHAIN: {
28169 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
28170 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
28171 if (!MaxSVEVectorSizeInBits)
28172 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
28173 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
28174 // The SVE count intrinsics don't support the multiplier immediate so we
28175 // don't have to account for that here. The value returned may be slightly
28176 // over the true required bits, as this is based on the "ALL" pattern. The
28177 // other patterns are also exposed by these intrinsics, but they all
28178 // return a value that's strictly less than "ALL".
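// For example, CNTH with a 2048-bit maximum register size can return at most
// 128, so everything above the low 8 bits is known to be zero.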
28179 unsigned RequiredBits = llvm::bit_width(MaxElements);
28180 unsigned BitWidth = Known.Zero.getBitWidth();
28181 if (RequiredBits < BitWidth)
28182 Known.Zero.setHighBits(BitWidth - RequiredBits);
28183 return false;
28184 }
28185 }
28186 }
28187
28188 return TargetLowering::SimplifyDemandedBitsForTargetNode(
28189 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
28190}
28191
28192bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
28193 return Op.getOpcode() == AArch64ISD::DUP ||
28194 Op.getOpcode() == AArch64ISD::MOVI ||
28195 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
28196 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
28197 TargetLowering::isTargetCanonicalConstantNode(Op);
28198}
28199
28200 bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
28201 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
28202 Subtarget->hasComplxNum();
28203}
28204
28205 bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
28206 ComplexDeinterleavingOperation Operation, Type *Ty) const {
28207 auto *VTy = dyn_cast<VectorType>(Ty);
28208 if (!VTy)
28209 return false;
28210
28211 // If the vector is scalable, SVE is enabled, implying support for complex
28212 // numbers. Otherwise, we need to ensure complex number support is available.
28213 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
28214 return false;
28215
28216 auto *ScalarTy = VTy->getScalarType();
28217 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
28218
28219 // We can only process vectors that have a bit size of 128 or higher (with an
28220 // additional 64 bits for Neon). Additionally, these vectors must have a
28221 // power-of-2 size, as we later split them into the smallest supported size
28222 // and merge them back together after applying the complex operation.
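// For example, a fixed <16 x half> (256 bits) is accepted and later split
// into two 128-bit halves, whereas a 96-bit <3 x float> is rejected.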
28223 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
28224 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
28225 !llvm::isPowerOf2_32(VTyWidth))
28226 return false;
28227
28228 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
28229 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
28230 return 8 <= ScalarWidth && ScalarWidth <= 64;
28231 }
28232
28233 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
28234 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
28235}
28236
28237 Value *AArch64TargetLowering::createComplexDeinterleavingIR(
28238 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
28239 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
28240 Value *Accumulator) const {
28241 VectorType *Ty = cast<VectorType>(InputA->getType());
28242 bool IsScalable = Ty->isScalableTy();
28243 bool IsInt = Ty->getElementType()->isIntegerTy();
28244
28245 unsigned TyWidth =
28246 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
28247
28248 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
28249 "Vector type must be either 64 or a power of 2 that is at least 128");
28250
28251 if (TyWidth > 128) {
28252 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
28253 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
28254 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
28255 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
28256 auto *UpperSplitA =
28257 B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
28258 auto *UpperSplitB =
28259 B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
28260 Value *LowerSplitAcc = nullptr;
28261 Value *UpperSplitAcc = nullptr;
28262 if (Accumulator) {
28263 LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
28264 UpperSplitAcc =
28265 B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
28266 }
28267 auto *LowerSplitInt = createComplexDeinterleavingIR(
28268 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
28269 auto *UpperSplitInt = createComplexDeinterleavingIR(
28270 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
28271
28272 auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
28273 B.getInt64(0));
28274 return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
28275 }
28276
28277 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
28278 if (Accumulator == nullptr)
28279 Accumulator = Constant::getNullValue(Ty);
28280
28281 if (IsScalable) {
28282 if (IsInt)
28283 return B.CreateIntrinsic(
28284 Intrinsic::aarch64_sve_cmla_x, Ty,
28285 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
28286
28287 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
28288 return B.CreateIntrinsic(
28289 Intrinsic::aarch64_sve_fcmla, Ty,
28290 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
28291 }
28292
28293 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
28294 Intrinsic::aarch64_neon_vcmla_rot90,
28295 Intrinsic::aarch64_neon_vcmla_rot180,
28296 Intrinsic::aarch64_neon_vcmla_rot270};
28297
28298
28299 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
28300 {Accumulator, InputA, InputB});
28301 }
28302
28303 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
28304 if (IsScalable) {
28305 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
28306 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
28307 if (IsInt)
28308 return B.CreateIntrinsic(
28309 Intrinsic::aarch64_sve_cadd_x, Ty,
28310 {InputA, InputB, B.getInt32((int)Rotation * 90)});
28311
28312 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
28313 return B.CreateIntrinsic(
28314 Intrinsic::aarch64_sve_fcadd, Ty,
28315 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
28316 }
28317 return nullptr;
28318 }
28319
28320 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
28321 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
28322 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
28323 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
28324 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
28325
28326 if (IntId == Intrinsic::not_intrinsic)
28327 return nullptr;
28328
28329 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
28330 }
28331
28332 return nullptr;
28333}
28334
28335bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
28336 unsigned Opc = N->getOpcode();
28337 if (ISD::isExtOpcode(Opc)) {
28338 if (any_of(N->uses(),
28339 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
28340 return false;
28341 }
28342 return true;
28343}
28344
28345unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
28346 return Subtarget->getMinimumJumpTableEntries();
28347}
28348
28349 MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
28350 CallingConv::ID CC,
28351 EVT VT) const {
28352 bool NonUnitFixedLengthVector =
28353 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
28354 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
28355 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
28356
28357 EVT VT1;
28358 MVT RegisterVT;
28359 unsigned NumIntermediates;
28360 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
28361 RegisterVT);
28362 return RegisterVT;
28363}
28364
28365 unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
28366 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
28367 bool NonUnitFixedLengthVector =
28368 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
28369 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
28370 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
28371
28372 EVT VT1;
28373 MVT VT2;
28374 unsigned NumIntermediates;
28375 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
28376 NumIntermediates, VT2);
28377}
28378
28379 unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
28380 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
28381 unsigned &NumIntermediates, MVT &RegisterVT) const {
28382 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
28383 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
28384 if (!RegisterVT.isFixedLengthVector() ||
28385 RegisterVT.getFixedSizeInBits() <= 128)
28386 return NumRegs;
28387
28388 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
28389 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
28390 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
28391
28392 // A size mismatch here implies either type promotion or widening and would
28393 // have resulted in scalarisation if larger vectors had not been available.
28394 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
28395 EVT EltTy = VT.getVectorElementType();
28396 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
28397 if (!isTypeLegal(NewVT))
28398 NewVT = EltTy;
28399
28400 IntermediateVT = NewVT;
28401 NumIntermediates = VT.getVectorNumElements();
28402 RegisterVT = getRegisterType(Context, NewVT);
28403 return NumIntermediates;
28404 }
28405
28406 // SVE VLS support does not introduce a new ABI so we should use NEON sized
28407 // types for vector arguments and returns.
28408
28409 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
28410 NumIntermediates *= NumSubRegs;
28411 NumRegs *= NumSubRegs;
28412
28413 switch (RegisterVT.getVectorElementType().SimpleTy) {
28414 default:
28415 llvm_unreachable("unexpected element type for vector");
28416 case MVT::i8:
28417 IntermediateVT = RegisterVT = MVT::v16i8;
28418 break;
28419 case MVT::i16:
28420 IntermediateVT = RegisterVT = MVT::v8i16;
28421 break;
28422 case MVT::i32:
28423 IntermediateVT = RegisterVT = MVT::v4i32;
28424 break;
28425 case MVT::i64:
28426 IntermediateVT = RegisterVT = MVT::v2i64;
28427 break;
28428 case MVT::f16:
28429 IntermediateVT = RegisterVT = MVT::v8f16;
28430 break;
28431 case MVT::f32:
28432 IntermediateVT = RegisterVT = MVT::v4f32;
28433 break;
28434 case MVT::f64:
28435 IntermediateVT = RegisterVT = MVT::v2f64;
28436 break;
28437 case MVT::bf16:
28438 IntermediateVT = RegisterVT = MVT::v8bf16;
28439 break;
28440 }
28441
28442 return NumRegs;
28443}
28444
28445 bool AArch64TargetLowering::hasInlineStackProbe(
28446 const MachineFunction &MF) const {
28447 return !Subtarget->isTargetWindows() &&
28448 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
28449}
28450
28451#ifndef NDEBUG
28452 void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
28453 switch (N->getOpcode()) {
28454 default:
28455 break;
28456 case AArch64ISD::SUNPKLO:
28457 case AArch64ISD::SUNPKHI:
28458 case AArch64ISD::UUNPKLO:
28459 case AArch64ISD::UUNPKHI: {
28460 assert(N->getNumValues() == 1 && "Expected one result!");
28461 assert(N->getNumOperands() == 1 && "Expected one operand!");
28462 EVT VT = N->getValueType(0);
28463 EVT OpVT = N->getOperand(0).getValueType();
28464 assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
28465 VT.isInteger() && "Expected integer vectors!");
28466 assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
28467 "Expected vectors of equal size!");
28468 // TODO: Enable assert once bogus creations have been fixed.
28469 // assert(OpVT.getVectorElementCount() == VT.getVectorElementCount()*2 &&
28470 // "Expected result vector with half the lanes of its input!");
28471 break;
28472 }
28473 case AArch64ISD::TRN1:
28474 case AArch64ISD::TRN2:
28475 case AArch64ISD::UZP1:
28476 case AArch64ISD::UZP2:
28477 case AArch64ISD::ZIP1:
28478 case AArch64ISD::ZIP2: {
28479 assert(N->getNumValues() == 1 && "Expected one result!");
28480 assert(N->getNumOperands() == 2 && "Expected two operands!");
28481 EVT VT = N->getValueType(0);
28482 EVT Op0VT = N->getOperand(0).getValueType();
28483 EVT Op1VT = N->getOperand(1).getValueType();
28484 assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
28485 "Expected vectors!");
28486 // TODO: Enable assert once bogus creations have been fixed.
28487 // assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
28488 break;
28489 }
28490 }
28491}
28492#endif
unsigned const MachineRegisterInfo * MRI
static MCRegister MatchRegisterName(StringRef Name)
static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, uint64_t &Imm)
static bool isIntImmediate(const SDNode *N, uint64_t &Imm)
isIntImmediate - This method tests to see if the node is a constant operand.
static void CustomNonLegalBITCASTResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, EVT ExtendVT, EVT CastVT)
static bool isConcatMask(ArrayRef< int > Mask, EVT VT, bool SplitLHS)
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG)
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG)
can be transformed to: not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) (and (not (setCA (cmp A))...
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert)
changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC usable with the vector...
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isSingletonEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG)
static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex)
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG)
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.
static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG)
static SDValue performLastTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue GenerateTBL(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performDUPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< PredicateConstraint > parsePredicateConstraint(StringRef Constraint)
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static void analyzeCallOperands(const AArch64TargetLowering &TLI, const AArch64Subtarget *Subtarget, const TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo)
static std::optional< unsigned > IsSVECntIntrinsic(SDValue S)
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo)
Check whether or not Op is a SET_CC operation, either a generic or an AArch64 lowered one.
static bool isLegalArithImmed(uint64_t C)
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT)
static ScalableVectorType * getSVEContainerIRType(FixedVectorType *VTy)
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG)
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend)
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG)
static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG)
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static bool isZeroingInactiveLanes(SDValue Op)
static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG)
static SDValue tryCombineMULLWithUZP1(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG)
static bool isZerosVector(const SDNode *N)
isZerosVector - Check whether SDNode N is a zero-filled vector.
static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth=0)
static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG)
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG)
Get rid of unnecessary NVCASTs (that don't change the type).
static const TargetRegisterClass * getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT)
static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, bool Invert)
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, SDLoc DL, unsigned BitWidth)
static bool isPredicateCCSettingOp(SDValue N)
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG)
static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType)
static SDValue performSVEAndCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG)
static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, ArrayRef< int > ShuffleMask, EVT VT, EVT ContainerVT, SelectionDAG &DAG)
static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static MVT getSVEContainerType(EVT ContentTy)
static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG)
static bool isMergePassthruOpcode(unsigned Opc)
static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG, SDLoc DL, bool &IsMLA)
static SDValue performFADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Target-specific DAG combine function for NEON load/store intrinsics to merge base address updates.
static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG)
static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG)
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget, const AArch64TargetLowering &TLI)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with the compare-mask instruct...
static bool isCheapToExtend(const SDValue &N)
static cl::opt< bool > EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true))
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG)
static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, unsigned ScalarSizeInBytes)
Check if the value of OffsetInBytes can be used as an immediate for the gather load/prefetch and scat...
static bool isUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of "vector_shuffle v,...
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
#define LCALLNAME4(A, B)
static unsigned getDUPLANEOp(EVT EltType)
static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM)
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT, SelectionDAG &DAG, unsigned &ShiftValue, SDValue &RShOperand)
static bool isExtendOrShiftOperand(SDValue N)
static bool isLanes1toNKnownZero(SDValue Op)
static bool setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI)
Set the IntrinsicInfo for the aarch64_sve_st<N> intrinsics.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG)
static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, SelectionDAG &DAG)
static std::tuple< SDValue, SDValue > extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG)
static EVT getPackedSVEVectorVT(EVT VT)
static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performFlagSettingCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned GenericOpcode)
static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performCSELCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static void ReplaceReductionResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp)
static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, int AddConstant, int CompConstant)
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static bool isCMP(SDValue Op)
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static Function * getStructuredLoadFunction(Module *M, unsigned Factor, bool Scalable, Type *LDVTy, Type *PtrTy)
static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG)
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, SelectionDAG &DAG, bool UnpredOp=false, bool SwapOperands=false)
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad)
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate)
Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain of CCMP/CFCMP ops.
static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated)
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian)
static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, unsigned Opcode, SelectionDAG &DAG)
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG)
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static bool isINSMask(ArrayRef< int > M, int NumInputElements, bool &DstIsLeft, int &Anomaly)
static const MCPhysReg GPRArgRegs[]
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits)
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG)
static SDValue performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPassedInFPR(EVT VT)
static unsigned getIntrinsicID(const SDNode *N)
static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert)
static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG)
static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG)
static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool canEmitConjunction(const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, unsigned Depth=0)
Returns true if Val is a tree of AND/OR/SETCC operations that can be expressed as a conjunction.
static bool isWideDUPMask(ArrayRef< int > M, EVT VT, unsigned BlockSize, unsigned &DupLaneOp)
Check if a vector shuffle corresponds to a DUP instructions with a larger element width than the vect...
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static cl::opt< bool > EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden, cl::desc("Combine ext and trunc to TBL"), cl::init(true))
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts)
static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue removeRedundantInsertVectorElt(SDNode *N)
static std::optional< AArch64CC::CondCode > getCSETCondCode(SDValue Op)
static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsEqual)
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG)
Legalize the gather prefetch (scalar + vector addressing mode) when the offset vector is an unpacked ...
static bool isNegatedInteger(SDValue Op)
static SDValue performFirstTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static bool isLoadOrMultipleLoads(SDValue B, SmallVector< LoadSDNode * > &Loads)
static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc)
static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16)
static Function * getStructuredStoreFunction(Module *M, unsigned Factor, bool Scalable, Type *STVTy, Type *PtrTy)
static SDValue performVectorShiftCombine(SDNode *N, const AArch64TargetLowering &TLI, TargetLowering::DAGCombinerInfo &DCI)
Optimize a vector shift instruction and its operand if shifted out bits are not used.
static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG)
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, unsigned ScalarSizeInBytes)
Combines a node carrying the intrinsic aarch64_sve_prf<T>_gather_scalar_offset into a node that uses ...
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of a scalar to a vector store by scalar stores of the scalar value.
unsigned getSignExtendedGatherOpcode(unsigned Opcode)
static bool isOrXorChain(SDValue N, unsigned &Num, SmallVector< std::pair< SDValue, SDValue >, 16 > &WorkList)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC)
changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 CC
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd)
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG)
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc)
static unsigned getCmpOperandFoldingProfit(SDValue Op)
Returns how profitable it is to fold a comparison's operand's shift and/or extension operations.
static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG)
#define MAKE_CASE(V)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG)
Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern into sext/zext(buildvecto...
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
static Value * createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op, FixedVectorType *ZExtTy, FixedVectorType *DstTy, bool IsLittleEndian)
static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG)
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG)
static const MCPhysReg FPRArgRegs[]
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static void replaceBoolVectorBitcast(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG)
static SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern)
static bool isEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseEXT, unsigned &Imm)
static std::optional< ReducedGprConstraint > parseReducedGprConstraint(StringRef Constraint)
static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue performSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG)
Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) making use of the vector SExt/ZE...
static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point multiply by power of two into floating-point to fixed-point conversion.
static EVT calculatePreExtendType(SDValue Extend)
Calculates what the pre-extend type is, based on the extension operation node provided by Extend.
static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG)
static EVT getPromotedVTForPredicate(EVT VT)
static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
Convert a DAG fp condition code to an AArch64 CC.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG)
static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal)
static SDValue performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG)
static Value * UseTlsOffset(IRBuilderBase &IRB, unsigned Offset)
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG)
WidenVector - Given a value in the V64 register class, produce the equivalent value in the V128 regis...
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, AArch64CC::CondCode Cond)
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info)
cl::opt< bool > EnableAArch64ELFLocalDynamicTLSGeneration("aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false))
static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG)
static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG)
static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG)
static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG)
static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG)
static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode, AtomicOrdering Ordering)
static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
cl::opt< bool > EnableSVEGISel("aarch64-enable-gisel-sve", cl::Hidden, cl::desc("Enable / disable SVE scalable vectors in Global ISel"), cl::init(false))
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp)
Target-specific DAG combine function for post-increment LD1 (lane) and post-increment LD1R.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
std::pair< SDValue, uint64_t > lookThroughSignExtension(SDValue Val)
bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL)
static SDValue performMSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG)
static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, SDLoc DL, SelectionDAG &DAG)
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG)
static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG)
static bool isSplatShuffle(Value *V)
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT)
static SDValue performAddCombineForShiftedOperands(SDNode *N, SelectionDAG &DAG)
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V)
static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, bool IsSigned)
static bool isPackedVectorType(EVT VT, SelectionDAG &DAG)
Returns true if VT's elements occupy the lowest bit positions of its associated register class withou...
static bool isTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of "vector_shuffle v,...
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static SDValue tryCombineWhileLo(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMaskedGatherScatterCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG)
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, bool IsSignaling)
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performBuildVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64TargetLowering &TLI)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG)
static unsigned getSMCondition(const SMEAttrs &CallerAttrs, const SMEAttrs &CalleeAttrs)
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static bool isCMN(SDValue Op, ISD::CondCode CC)
static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps)
static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG)
static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG)
Perform the scalar expression combine in the form of: CSEL(c, 1, cc) + b => CSINC(b+c,...
static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static std::optional< uint64_t > getConstantLaneNumOfExtractHalfOperand(SDValue &Op)
static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, SelectionDAG &DAG, unsigned &NumSubLoads)
static SDValue performLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, StoreSDNode *Store)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static cl::opt< unsigned > MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, cl::desc("Maximum of xors"))
#define LCALLNAME5(A, B)
static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performMULLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static const MVT MVT_CC
Value type used for condition codes.
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performReinterpretCastCombine(SDNode *N)
SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N)
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &dl)
static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Simplify Addr given that the top byte of it is ignored by HW during address translation.
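A conceptual sketch only of the Top Byte Ignore (TBI) property this brief relies on, not the DAG combine itself; the helper name and mask constant are illustrative (bits [63:56] are the architecturally ignored tag byte for data addresses):

#include <cstdint>

// With TBI enabled, hardware ignores bits [63:56] of a data address, so
// clearing the tag byte in software does not change the location accessed.
static inline uint64_t stripTopByte(uint64_t Addr) {
  return Addr & 0x00FFFFFFFFFFFFFFULL;
}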
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG)
static bool isAllInactivePredicate(SDValue N)
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDLoc DL, SelectionDAG &DAG)
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth, unsigned NumElts, bool IsLittleEndian, SmallVectorImpl< int > &Mask)
static bool isWideTypeMask(ArrayRef< int > M, EVT VT, SmallVectorImpl< int > &NewMask)
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z, SelectionDAG &DAG)
static SDValue performANDSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static const TargetRegisterClass * getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, SDNode *AndNode, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex, unsigned CC)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
@ Generic
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static bool isConstant(const MachineInstr &MI)
static const LLT S1
static const LLT F32
amdgpu AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
@ Scaled
@ OP_VEXT3
@ OP_VTRNR
@ OP_VDUP1
@ OP_VZIPR
@ OP_VUZPR
@ OP_VREV
@ OP_VZIPL
@ OP_VTRNL
@ OP_COPY
@ OP_VEXT1
@ OP_VDUP0
@ OP_VEXT2
@ OP_VUZPL
@ OP_VDUP3
@ OP_VDUP2
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
return RetTy
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
Symbol * Sym
Definition: ELF_riscv.cpp:479
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static Function * getFunction(Constant *C)
Definition: Evaluator.cpp:236
static bool isSigned(unsigned int Opcode)
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
#define im(i)
Hexagon Common GEP
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
lazy value info
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
uint64_t IntrinsicInst * II
This file defines ARC utility functions which are used by various parts of the compiler.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
const char LLVMTargetMachineRef TM
static bool getVal(MDTuple *MD, const char *Key, uint64_t &Val)
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallSet class.
This file defines the SmallVector class.
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static const int BlockSize
Definition: TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
This defines the Use class.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static constexpr int Concat[]
Value * RHS
Value * LHS
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
void setVarArgsStackOffset(unsigned Offset)
void setTailCallReservedStack(unsigned bytes)
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setHasSwiftAsyncContext(bool HasContext)
void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym)
void setArgumentStackToRestore(unsigned bytes)
void setHasStreamingModeChanges(bool HasChanges)
bool isLegalAddressingMode(unsigned NumBytes, int64_t Offset, unsigned Scale) const
void UpdateCustomCalleeSavedRegs(MachineFunction &MF) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getMinimumJumpTableEntries() const
const AArch64InstrInfo * getInstrInfo() const override
const char * getSecurityCheckCookieName() const
unsigned getMaximumJumpTableSize() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned classifyGlobalFunctionReference(const GlobalValue *GV, const TargetMachine &TM) const
Align getPrefLoopAlignment() const
Align getPrefFunctionAlignment() const
unsigned getMaxBytesForLoopAlignment() const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
const Triple & getTargetTriple() const
bool isCallingConvWin64(CallingConv::ID CC) const
const char * getChkStkName() const
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
bool useSVEForFixedLengthVectors() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isStreaming() const
Returns true if the function has a streaming body.
bool isXRegisterReserved(size_t i) const
unsigned getMaxSVEVectorSizeInBits() const
unsigned getMinSVEVectorSizeInBits() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
bool hasCustomCallingConv() const
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
MachineBasicBlock * EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const override
Return the prefered common base offset.
bool shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert a trailing fence without reducing the ordering f...
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
MachineBasicBlock * EmitInitTPIDR2Object(MachineInstr &MI, MachineBasicBlock *BB) const
MachineBasicBlock * EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool shouldRemoveRedundantExtend(SDValue Op) const override
Return true (the default) if it is profitable to remove a sext_inreg(x) where the sext is redundant,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
bool optimizeExtendOrTruncateConversion(Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override
Try to optimize extending or truncating conversion instructions (like zext, trunc,...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool hasInlineStackProbe(const MachineFunction &MF) const override
True if stack clash protection is enabled for this function.
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) and add (add x, 1), y. The variant with two add's is IR...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool isOpSuitableForLSE128(const Instruction *I) const
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower an interleaved load into a ldN intrinsic.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isLegalAddScalableImmediate(int64_t) const override
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
MachineBasicBlock * EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool isOpSuitableForRCPC3(const Instruction *I) const
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower an interleaved store into a stN intrinsic.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
MachineBasicBlock * EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const
MachineBasicBlock * EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
Control the following reassociation of operands: (op (op x, c1), y) -> (op (op x, y),...
void verifyTargetSDNode(const SDNode *N) const override
Check the given SDNode. Aborts if it is invalid.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
MachineBasicBlock * EmitAllocateZABuffer(MachineInstr &MI, MachineBasicBlock *BB) const
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI, LoadInst *LI) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Value * getIRStackGuard(IRBuilderBase &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II, StoreInst *SI) const override
Lower an interleave intrinsic to a target specific store intrinsic.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, SDValue PStateSM=SDValue()) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
Class for arbitrary precision integers.
Definition: APInt.h:77
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:213
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:428
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:208
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1499
static void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition: APInt.cpp:1860
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1371
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:350
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1447
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:188
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:308
APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1898
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1145
APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1905
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1597
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:198
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1718
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:806
bool isMask(unsigned numBits) const
Definition: APInt.h:467
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:313
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1236
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:419
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:285
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:275
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1216
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:368
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1521
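A small, hedged illustration of a few of the APInt operations listed above; this is a standalone sketch, not code from this file, and the helper name is invented:

#include "llvm/ADT/APInt.h"
using namespace llvm;

// Returns true because 0x00FF0000 is a contiguous run of 8 set bits: shifting
// out the trailing zeros (countr_zero) leaves 0xFF, which isMask(8) accepts.
static bool isShiftedByteMaskExample() {
  APInt Imm(64, 0x00FF0000ULL);
  unsigned Shift = Imm.countr_zero(); // 16 trailing zero bits
  return Imm.lshr(Shift).isMask(8);   // 0xFF -> true
}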
an instruction to allocate memory on the stack
Definition: Instructions.h:60
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:494
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:695
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:725
@ Or
*p = old | v
Definition: Instructions.h:719
@ And
*p = old & v
Definition: Instructions.h:715
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:723
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:729
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:727
@ Nand
*p = ~(old & v)
Definition: Instructions.h:717
bool isFloatingPointOperation() const
Definition: Instructions.h:863
BinOp getOperation() const
Definition: Instructions.h:786
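A minimal sketch of how the AtomicRMWInst::BinOp values above are typically inspected; the helper name is made up for illustration:

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Returns true for the signed/unsigned min/max read-modify-write operations.
static bool isMinMaxRMW(const AtomicRMWInst &RMW) {
  switch (RMW.getOperation()) {
  case AtomicRMWInst::Min:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::UMax:
    return true;
  default:
    return false;
  }
}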
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:94
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:209
A "pseudo-class" with methods for operating on BUILD_VECTORs.
ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
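Typical use of the splat query above; a hedged sketch with an invented helper name, not code from this file:

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Returns true if N is a BUILD_VECTOR whose defined lanes all splat -1.
static bool isAllOnesSplatBuildVector(SDNode *N) {
  auto *BVN = dyn_cast<BuildVectorSDNode>(N);
  if (!BVN)
    return false;
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  return BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                              HasAnyUndefs) &&
         SplatBits.isAllOnes();
}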
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
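A hedged sketch of how the CCState helpers above are commonly driven when checking whether a return can be lowered; the function name is an assumption and the concrete CCAssignFn is left to the caller:

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// Analyze the outgoing return values against AssignFn and report whether the
// return can be performed without demoting it to an sret parameter.
static bool canLowerReturnSketch(CallingConv::ID CallConv, bool IsVarArg,
                                 MachineFunction &MF,
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
                                 CCAssignFn *AssignFn) {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs,
                 MF.getFunction().getContext());
  return CCInfo.CheckReturn(Outs, AssignFn);
}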
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
unsigned arg_size() const
Definition: InstrTypes.h:1408
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1594
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:206
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:146
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1399
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:238
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
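A small usage sketch for the DataLayout queries listed above; assumed typical usage with an invented helper name, not code from this file:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Size in bytes that an object of type Ty occupies, including tail padding.
static uint64_t allocBytes(const Module &M, Type *Ty) {
  const DataLayout &DL = M.getDataLayout();
  return DL.getTypeAllocSize(Ty).getFixedValue();
}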
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:314
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * getInteger(FixedVectorType *VTy)
Definition: DerivedTypes.h:551
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:168
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:698
bool empty() const
Definition: Function.h:822
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:207
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:695
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition: Function.h:274
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1934
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:350
arg_iterator arg_end()
Definition: Function.h:840
arg_iterator arg_begin()
Definition: Function.h:831
size_t size() const
Definition: Function.h:821
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:690
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
Definition: GlobalValue.h:263
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:529
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition: Globals.cpp:124
Type * getValueType() const
Definition: GlobalValue.h:296
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:91
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2134
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1034
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2469
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1876
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2520
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1042
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:536
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2167
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains NumElts copies of V (a broadcast/splat).
Definition: IRBuilder.cpp:1193
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2513
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:463
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:933
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2064
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2119
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1434
ConstantInt * getInt8(uint8_t C)
Get a constant 8-bit value.
Definition: IRBuilder.h:473
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2078
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:171
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1863
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:488
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2124
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1413
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2018
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2491
LLVMContext & getContext() const
Definition: IRBuilder.h:173
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2114
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2004
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1494
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:566
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2409
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:513
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2663
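An illustrative sketch tying together a few of the IRBuilder calls listed above; the helper name and the i32-to-i64 packing are invented for the example:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Zero-extend two i32 values and pack them into one i64 (Hi in bits [63:32]).
static Value *packPair(IRBuilderBase &Builder, Value *Lo, Value *Hi) {
  Type *I64 = Builder.getInt64Ty();
  Value *LoExt = Builder.CreateZExt(Lo, I64, "lo.ext");
  Value *HiExt = Builder.CreateZExt(Hi, I64, "hi.ext");
  Value *HiShl = Builder.CreateShl(HiExt, Builder.getInt64(32), "hi.shl");
  return Builder.CreateOr(LoExt, HiShl, "pack");
}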
This instruction inserts a single (scalar) element into a VectorType value.
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:66
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:74
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:55
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
Definition: LowLevelType.h:203
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:173
Value * getPointerOperand()
Definition: Instructions.h:252
Type * getPointerOperandType() const
Definition: Instructions.h:255
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
@ INVALID_SIMPLE_VALUE_TYPE
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:230
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
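A brief sketch using the MVT helpers listed above; the function is hypothetical and the header path is assumed for recent LLVM layouts:

#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

// Build the 128-bit vector type for a fixed-width scalar element type,
// e.g. i32 -> v4i32, f16 -> v8f16.
static MVT get128BitVectorOf(MVT EltVT) {
  unsigned NumElts = 128 / EltVT.getFixedSizeInBits();
  return MVT::getVectorVT(EltVT, NumElts);
}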
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
MachineInstr * remove_instr(MachineInstr *I)
Remove the possibly bundled instruction from the instruction list without deleting it.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
int getStackProtectorIndex() const
Return the index for the stack protector object.
int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
void RemoveStackObject(int ObjectIdx)
Remove or mark dead a statically sized stack object.
int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
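A minimal sketch for the stack-object creation calls listed above; the helper is illustrative only, not code from this file:

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

// Reserve an 8-byte, 8-byte-aligned spill slot and return its frame index.
static int createSpillSlotSketch(MachineFunction &MF) {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.CreateSpillStackObject(/*Size=*/8, Align(8));
}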
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
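A hedged sketch of the usual BuildMI/MachineInstrBuilder pattern behind the addReg/addImm helpers listed above; the opcode is left as a parameter because the concrete instruction is target-specific, and the helper name is invented:

#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// Emit "Dst = Opc Src, Imm" before MBBI, chaining operands onto the builder.
static void emitRegImmOp(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
                         const TargetInstrInfo &TII, unsigned Opc,
                         Register Dst, Register Src, int64_t Imm) {
  BuildMI(MBB, MBBI, DL, TII.get(Opc), Dst)
      .addReg(Src)
      .addImm(Imm);
}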
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
MachineOperand class - Representation of each machine instruction operand.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition: MapVector.h:60
This class is used to represent an MGATHER node.
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getIndex() const
const SDValue & getScale() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
const SDValue & getInc() const
const SDValue & getScale() const
const SDValue & getMask() const
const SDValue & getIntID() const
const SDValue & getIndex() const
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition: Module.cpp:701
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
void dump() const
Definition: Pass.cpp:136
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1814
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
void dump() const
Dump this node, for debugging.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Return true if the node represents an undefined (UNDEF) value.
void setFlags(SDNodeFlags NewFlags)
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
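As a reminder of how these SDValue accessors are typically combined, a small pattern-matching sketch (illustrative only; N is any SDValue):
static bool isLowByteMask(SDValue N) {
  // Matches (and X, 0xff): check the opcode, then read operand 1 as a constant.
  return N.getOpcode() == ISD::AND &&
         N.getOperand(1).getOpcode() == ISD::Constant &&
         N.getConstantOperandVal(1) == 0xff;
}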
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasZAState() const
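A minimal usage sketch, assuming F is a const Function & of interest (illustrative only):
SMEAttrs Attrs(F);
if (Attrs.hasStreamingInterface() || Attrs.hasStreamingBody()) {
  // At least part of F executes in streaming SVE mode.
}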
Class to represent scalable SIMD vectors.
Definition: DerivedTypes.h:586
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:713
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:227
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:736
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:488
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
SDNode * isConstantIntBuildVectorOrConstantInt(SDValue N) const
Test whether the given value is a constant int or similar node.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:492
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:746
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:842
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:486
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:494
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:673
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSplatVector(EVT VT, const SDLoc &DL, SDValue Op)
Definition: SelectionDAG.h:876
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:487
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:787
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
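For example, a signed maximum can be phrased through this helper (A, B, DL and DAG are assumed to be in scope; illustrative only):
SDValue Max = DAG.getSelectCC(DL, A, B, A, B, ISD::SETGT);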
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
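A minimal sketch of the usual build pattern (DAG, DL, VT and X are assumed to be in scope; illustrative only):
SDValue One = DAG.getConstant(1, DL, VT);
SDValue Or  = DAG.getNode(ISD::OR, DL, VT, X, One); // builds (or X, 1)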
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:690
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:782
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:481
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:813
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:859
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
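A small sketch of how this is commonly used to justify a narrowing transform (Op and DAG are assumed; illustrative only):
// Prove the top 32 bits of a 64-bit value are zero before narrowing it.
APInt HighMask = APInt::getHighBitsSet(64, 32);
bool HighHalfZero = DAG.MaskedValueIsZero(Op, HighMask);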
LLVMContext * getContext() const
Definition: SelectionDAG.h:499
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:753
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:568
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
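A short classification sketch using these static helpers (SVI is an assumed ShuffleVectorInst *; illustrative only):
ArrayRef<int> Mask = SVI->getShuffleMask();
int NumSrcElts = Mask.size();
bool SingleSrc = ShuffleVectorInst::isSingleSourceMask(Mask, NumSrcElts);
bool Reverse = ShuffleVectorInst::isReverseMask(Mask, NumSrcElts);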
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void push_back(const T &Elt)
Definition: SmallVector.h:426
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
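A sketch of the usual visited-set plus worklist pattern built from these containers (N is an assumed const SDNode *; illustrative only):
SmallPtrSet<const SDNode *, 16> Visited;
SmallVector<const SDNode *, 8> Worklist;
if (Visited.insert(N).second) // .second is true only when N was newly inserted
  Worklist.push_back(N);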
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
An instruction for storing to memory.
Definition: Instructions.h:289
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:463
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition: StringRef.h:677
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
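A small parsing sketch combining slice() and getAsInteger() (the register-name format is made up for the example):
static bool parseXRegister(StringRef Name, unsigned &Idx) {
  // Accepts names of the form "x<N>"; note getAsInteger() returns false on success.
  return Name.size() > 1 && Name[0] == 'x' &&
         !Name.slice(1, Name.size()).getAsInteger(10, Idx);
}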
StringRef save(const char *S)
Definition: StringSaver.h:30
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
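Typical usage, with made-up string/value pairs (Name is an assumed StringRef):
unsigned Kind = StringSwitch<unsigned>(Name)
                    .Case("add", 0)
                    .Case("sub", 1)
                    .Default(~0u);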
Class to represent struct types.
Definition: DerivedTypes.h:216
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
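A one-line construction sketch, assuming Ctx is an LLVMContext & (illustrative only):
StructType *PairTy =
    StructType::get(Ctx, {Type::getInt64Ty(Ctx), Type::getInt64Ty(Ctx)}); // literal { i64, i64 }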
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
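These hooks are normally invoked from a target's TargetLowering constructor; the calls below show the general shape only (the register class, types and actions chosen are illustrative, not this file's actual tables):
addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
setOperationAction(ISD::CTPOP, MVT::i32, Custom);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);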
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool shouldExpandBuildVectorWithShuffles(EVT, unsigned DefinedValues) const
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return the upper limit for the number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
virtual bool useLoadStackGuardNode() const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:667
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition: Triple.h:634
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static Type * getHalfTy(LLVMContext &C)
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:252
static Type * getBFloatTy(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
@ FloatTyID
32-bit floating point type
Definition: Type.h:58
@ DoubleTyID
64-bit floating point type
Definition: Type.h:59
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
static IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isScalableTy() const
Return true if this is a type whose size is a known multiple of vscale.
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition: Type.h:377
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1795
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:182
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:507
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:454
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
Definition: DerivedTypes.h:472
Type * getElementType() const
Definition: DerivedTypes.h:436
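A construction sketch covering both fixed and scalable vector types, assuming Ctx is an LLVMContext & (illustrative only):
auto *V4F32 = VectorType::get(Type::getFloatTy(Ctx), ElementCount::getFixed(4)); // <4 x float>
auto *NxV4I32 = ScalableVectorType::get(Type::getInt32Ty(Ctx), 4);               // <vscale x 4 x i32>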
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:254
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define UINT64_MAX
Definition: DataTypes.h:77
#define INT64_MAX
Definition: DataTypes.h:71
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
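A minimal sketch of the condition-code inversion helper (illustrative only):
AArch64CC::CondCode CC = AArch64CC::EQ;
AArch64CC::CondCode InvCC = AArch64CC::getInvertedCondCode(CC); // yields AArch64CC::NE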
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ NVCAST
Natural vector cast.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
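A short sketch of the usual check-then-encode pairing (Imm is an assumed uint64_t):
if (AArch64_AM::isLogicalImmediate(Imm, 64)) {
  // Enc is the N:immr:imms bitmask encoding consumed by AND/ORR/EOR (immediate) forms.
  uint64_t Enc = AArch64_AM::encodeLogicalImmediate(Imm, 64);
}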
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
const uint64_t ReservedFPControlBits
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
Key
PAL metadata keys.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ Entry
Definition: COFF.h:811
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
Definition: CallingConv.h:265
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
Definition: CallingConv.h:221
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
Definition: CallingConv.h:224
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserve most registers.
Definition: CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:241
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:238
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ PreserveNone
Used for runtime calls that preserve no general registers.
Definition: CallingConv.h:90
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
Definition: CallingConv.h:159
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
Definition: CallingConv.h:255
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
Definition: CallingConv.h:260
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:764
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1147
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1143
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:484
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
Definition: ISDOpcodes.h:1360
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1391
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:567
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:728
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1176
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1262
@ STRICT_FCEIL
Definition: ISDOpcodes.h:434
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1052
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1042
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:797
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:491
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:444
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:804
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:551
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1376
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
Definition: ISDOpcodes.h:1380
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:702
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
Definition: ISDOpcodes.h:1046
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1390
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:485
@ STRICT_FLOG2
Definition: ISDOpcodes.h:429
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1288
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:917
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1289
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:954
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:418
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1431
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:899
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:788
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:670
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:458
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:628
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:107
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1373
@ WRITE_REGISTER
Definition: ISDOpcodes.h:125
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1242
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1377
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1009
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:772
@ STRICT_LROUND
Definition: ISDOpcodes.h:439
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:944
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1098
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1073
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1077
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the same...
Definition: ISDOpcodes.h:594
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition: ISDOpcodes.h:654
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:741
@ STRICT_FPOWI
Definition: ISDOpcodes.h:420
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1258
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1392
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:635
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1172
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:438
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1385
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:894
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:659
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1037
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:719
@ ATOMIC_LOAD_CLR
Definition: ISDOpcodes.h:1287
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:608
@ PtrAuthGlobalAddress
A ptrauth constant.
Definition: ISDOpcodes.h:90
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1286
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:581
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:443
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:432
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:124
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:543
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:794
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1232
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:870
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:433
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:756
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
Definition: ISDOpcodes.h:1350
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1269
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:986
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1236
@ SMULO
Same as [SU]ADDO, but for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1062
@ STRICT_LRINT
Definition: ISDOpcodes.h:441
@ ConstantPool
Definition: ISDOpcodes.h:82
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:812
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:682
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition: ISDOpcodes.h:599
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:902
@ STRICT_FROUND
Definition: ISDOpcodes.h:436
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:750
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:457
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1393
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:435
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:437
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:100
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1284
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:451
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:473
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:450
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1005
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1285
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:850
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1203
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:164
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:478
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:694
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1229
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:665
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:407
@ STRICT_FLOG10
Definition: ISDOpcodes.h:428
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:532
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_LLRINT
Definition: ISDOpcodes.h:442
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition: ISDOpcodes.h:620
@ STRICT_FEXP2
Definition: ISDOpcodes.h:426
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1283
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:959
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:883
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:112
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ STRICT_LLROUND
Definition: ISDOpcodes.h:440
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:845
@ EXPERIMENTAL_VECTOR_HISTOGRAM
Definition: ISDOpcodes.h:1422
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:431
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:869
@ VECREDUCE_FMINIMUM
Definition: ISDOpcodes.h:1381
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:800
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1167
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1091
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:777
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:501
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:430
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the sa...
Definition: ISDOpcodes.h:588
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:320
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:523
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1625
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
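A minimal sketch (not taken from this file) of how these two CondCode helpers combine; the EVT argument is assumed to be the integer type being compared:

  ISD::CondCode CC = ISD::SETLT;
  ISD::CondCode Inverted = ISD::getSetCCInverse(CC, MVT::i64);  // SETGE, i.e. !(X < Y)
  ISD::CondCode Swapped  = ISD::getSetCCSwappedOperands(CC);    // SETGT, i.e. (Y < X)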
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
Definition: ISDOpcodes.h:1516
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1503
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1554
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1534
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1505
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1484
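A hedged sketch of the usual getDeclaration pattern; M (a Module*), Builder (an IRBuilder<>) and X (a double-typed Value*) are assumed to exist in the caller:

  // Materialize llvm.fabs.f64 and emit a call to it.
  Function *FAbs =
      Intrinsic::getDeclaration(M, Intrinsic::fabs, {Builder.getDoubleTy()});
  Value *Abs = Builder.CreateCall(FAbs, {X});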
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:816
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:875
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
VScaleVal_match m_VScale()
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
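A small assumed example of composing the PatternMatch helpers above; V is a hypothetical llvm::Value* being inspected:

  using namespace llvm::PatternMatch;
  Value *A = nullptr, *B = nullptr;
  // Match a single-use (zext A) * (zext B) and capture the narrow operands.
  bool IsWideningMul =
      match(V, m_OneUse(m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B)))));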
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
@ Define
Register definition.
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition: ObjCARCUtil.h:43
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition: ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool isPackedVectorType(EVT SomeVT)
Definition: VECustomDAG.cpp:22
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:853
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:295
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:255
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:359
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResult)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> or <1,...
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:296
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1522
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:346
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Preserve_None(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:285
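A sketch of how a shifted run of ones is typically decomposed with these MathExtras/bit helpers (0x0FF0 is an arbitrary example constant):

  uint64_t Imm = 0x0FF0;
  if (isShiftedMask_64(Imm)) {
    unsigned Lsb   = llvm::countr_zero(Imm); // 4: position of the lowest set bit
    unsigned Width = llvm::popcount(Imm);    // 8: length of the run of ones
    // (Lsb, Width) is the shape a bitfield-extract style pattern needs.
  }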
unsigned M1(unsigned Val)
Definition: VE.h:376
bool isReleaseOrStronger(AtomicOrdering AO)
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:273
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
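A sketch of classifying a shuffle mask with these AArch64 helpers; the 8-element mask below is an arbitrary example:

  int Mask[] = {0, 8, 1, 9, 2, 10, 3, 11};
  unsigned WhichResult;
  bool IsZip = isZIPMask(Mask, /*NumElts=*/8, WhichResult); // true, WhichResult == 0 (zip1)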
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:260
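A quick illustration of the two fit-in-N-bits predicates above:

  bool FitsU12 = isUIntN(12, 4095); // true:  unsigned 12-bit range is [0, 4095]
  bool FitsS9  = isIntN(9, -257);   // false: signed 9-bit range is [-256, 255]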
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
gep_type_iterator gep_type_begin(const User *GEP)
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2051
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
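A sketch of round-tripping an element count through the two SVE predicate-pattern helpers above:

  if (std::optional<unsigned> Pat = getSVEPredPatternFromNumElements(16)) {
    unsigned Elts = getNumElementsFromSVEPredPattern(*Pat); // 16 (the VL16 pattern)
    (void)Elts;
  }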
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
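A sketch of the Align arithmetic provided by alignTo and commonAlignment:

  Align Base(16);
  Align AtOffset  = commonAlignment(Base, /*Offset=*/8); // Align(8)
  uint64_t Padded = alignTo(/*Size=*/20, Base);          // 32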
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition: STLExtras.h:2039
static const MachineMemOperand::Flags MOStridedAccess
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Default
The result values are uniform if and only if all operands are uniform.
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
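A sketch of createSequentialMask; the call below produces the mask <2, 3, 4, 5, -1, -1>, where the -1 entries are undef sentinels:

  SmallVector<int, 16> Mask =
      createSequentialMask(/*Start=*/2, /*NumInts=*/4, /*NumUndefs=*/2);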
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
@ Enable
Enable colors.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
Helper structure to be able to read SetCC information.
static unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:317
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
uint64_t getScalarStoreSize() const
Definition: ValueTypes.h:387
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:274
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:340
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:112
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:349
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:415
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:203
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition: ValueTypes.h:429
bool isScalableVT() const
Return true if the type is a scalable type.
Definition: ValueTypes.h:183
bool isFixedLengthVector() const
Definition: ValueTypes.h:177
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:282
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:208
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:203
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:173
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:438
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:198
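A sketch tying together a few of the EVT constructors and queries listed above; Ctx is an assumed LLVMContext:

  EVT V4F32   = EVT::getVectorVT(Ctx, MVT::f32, 4);                       // fixed v4f32
  EVT NxV4I32 = EVT::getVectorVT(Ctx, MVT::i32, 4, /*IsScalable=*/true);  // scalable nxv4i32
  EVT V4I32   = V4F32.changeVectorElementTypeToInteger();                 // v4i32
  bool Same   = V4F32.bitsEq(V4I32);                                      // true: both are 128 bits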
Describes a register that needs to be forwarded from the prologue to a musttail call.
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:290
static KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
Definition: KnownBits.cpp:428
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:150
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
static KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
Definition: KnownBits.cpp:370
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:285
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:300
static KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
Definition: KnownBits.cpp:285
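A sketch combining the KnownBits operations listed above on fully known 32-bit constants:

  KnownBits L = KnownBits::makeConstant(APInt(32, 0xF0));
  KnownBits S = KnownBits::makeConstant(APInt(32, 4));
  KnownBits Shifted = KnownBits::shl(L, S); // the known constant 0xF00
  KnownBits Low16   = Shifted.trunc(16);    // still the constant 0xF00, now 16 bits wide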
Structure used to represent pair of argument number after call lowering and register used to transfer...
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
Definition: PredicateInfo.h:74
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::OutputArg, 32 > Outs
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64