LLVM 19.0.0git
AArch64ISelLowering.cpp
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed in
130// the future, once both implementations are based on MGATHER rather than
131// the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// All of the XOR, OR and CMP nodes use ALU ports, and the data dependency
143// becomes the bottleneck after this transform on high-end CPUs. This maximum
144// leaf-node limit ensures the cmp+ccmp transform stays profitable.
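// Illustrative example (not part of the original source): a chain such as
//   ((a ^ b) | (c ^ d) | (e ^ f)) == 0
// can be selected as a cmp followed by ccmp instructions, with each extra
// xor leaf adding another ccmp to the serial flag-dependency chain.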
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148// When this is turned on, we do not fall back to DAG ISel when encountering
149// scalable vector types for any instruction, even if SVE is not yet supported
150// for some instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157/// Value type used for condition codes.
158static const MVT MVT_CC = MVT::i32;
159
160static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
161 AArch64::X3, AArch64::X4, AArch64::X5,
162 AArch64::X6, AArch64::X7};
163static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
164 AArch64::Q3, AArch64::Q4, AArch64::Q5,
165 AArch64::Q6, AArch64::Q7};
166
168
170
171static inline EVT getPackedSVEVectorVT(EVT VT) {
172 switch (VT.getSimpleVT().SimpleTy) {
173 default:
174 llvm_unreachable("unexpected element type for vector");
175 case MVT::i8:
176 return MVT::nxv16i8;
177 case MVT::i16:
178 return MVT::nxv8i16;
179 case MVT::i32:
180 return MVT::nxv4i32;
181 case MVT::i64:
182 return MVT::nxv2i64;
183 case MVT::f16:
184 return MVT::nxv8f16;
185 case MVT::f32:
186 return MVT::nxv4f32;
187 case MVT::f64:
188 return MVT::nxv2f64;
189 case MVT::bf16:
190 return MVT::nxv8bf16;
191 }
192}
193
194// NOTE: Currently there's only a need to return integer vector types. If this
195// changes then just add an extra "type" parameter.
197 switch (EC.getKnownMinValue()) {
198 default:
199 llvm_unreachable("unexpected element count for vector");
200 case 16:
201 return MVT::nxv16i8;
202 case 8:
203 return MVT::nxv8i16;
204 case 4:
205 return MVT::nxv4i32;
206 case 2:
207 return MVT::nxv2i64;
208 }
209}
210
212 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
213 "Expected scalable predicate vector type!");
214 switch (VT.getVectorMinNumElements()) {
215 default:
216 llvm_unreachable("unexpected element count for vector");
217 case 2:
218 return MVT::nxv2i64;
219 case 4:
220 return MVT::nxv4i32;
221 case 8:
222 return MVT::nxv8i16;
223 case 16:
224 return MVT::nxv16i8;
225 }
226}
227
228/// Returns true if VT's elements occupy the lowest bit positions of its
229/// associated register class without any intervening space.
230///
231/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
232/// same register class, but only nxv8f16 can be treated as a packed vector.
233static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
235 "Expected legal vector type!");
236 return VT.isFixedLengthVector() ||
238}
239
240// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
241// predicate and end with a passthru value matching the result type.
242static bool isMergePassthruOpcode(unsigned Opc) {
243 switch (Opc) {
244 default:
245 return false;
275 return true;
276 }
277}
278
279// Returns true if inactive lanes are known to be zeroed by construction.
281 switch (Op.getOpcode()) {
282 default:
283 return false;
284 // We guarantee i1 splat_vectors to zero the other lanes
288 return true;
290 switch (Op.getConstantOperandVal(0)) {
291 default:
292 return false;
293 case Intrinsic::aarch64_sve_ptrue:
294 case Intrinsic::aarch64_sve_pnext:
295 case Intrinsic::aarch64_sve_cmpeq:
296 case Intrinsic::aarch64_sve_cmpne:
297 case Intrinsic::aarch64_sve_cmpge:
298 case Intrinsic::aarch64_sve_cmpgt:
299 case Intrinsic::aarch64_sve_cmphs:
300 case Intrinsic::aarch64_sve_cmphi:
301 case Intrinsic::aarch64_sve_cmpeq_wide:
302 case Intrinsic::aarch64_sve_cmpne_wide:
303 case Intrinsic::aarch64_sve_cmpge_wide:
304 case Intrinsic::aarch64_sve_cmpgt_wide:
305 case Intrinsic::aarch64_sve_cmplt_wide:
306 case Intrinsic::aarch64_sve_cmple_wide:
307 case Intrinsic::aarch64_sve_cmphs_wide:
308 case Intrinsic::aarch64_sve_cmphi_wide:
309 case Intrinsic::aarch64_sve_cmplo_wide:
310 case Intrinsic::aarch64_sve_cmpls_wide:
311 case Intrinsic::aarch64_sve_fcmpeq:
312 case Intrinsic::aarch64_sve_fcmpne:
313 case Intrinsic::aarch64_sve_fcmpge:
314 case Intrinsic::aarch64_sve_fcmpgt:
315 case Intrinsic::aarch64_sve_fcmpuo:
316 case Intrinsic::aarch64_sve_facgt:
317 case Intrinsic::aarch64_sve_facge:
318 case Intrinsic::aarch64_sve_whilege:
319 case Intrinsic::aarch64_sve_whilegt:
320 case Intrinsic::aarch64_sve_whilehi:
321 case Intrinsic::aarch64_sve_whilehs:
322 case Intrinsic::aarch64_sve_whilele:
323 case Intrinsic::aarch64_sve_whilelo:
324 case Intrinsic::aarch64_sve_whilels:
325 case Intrinsic::aarch64_sve_whilelt:
326 case Intrinsic::aarch64_sve_match:
327 case Intrinsic::aarch64_sve_nmatch:
328 case Intrinsic::aarch64_sve_whilege_x2:
329 case Intrinsic::aarch64_sve_whilegt_x2:
330 case Intrinsic::aarch64_sve_whilehi_x2:
331 case Intrinsic::aarch64_sve_whilehs_x2:
332 case Intrinsic::aarch64_sve_whilele_x2:
333 case Intrinsic::aarch64_sve_whilelo_x2:
334 case Intrinsic::aarch64_sve_whilels_x2:
335 case Intrinsic::aarch64_sve_whilelt_x2:
336 return true;
337 }
338 }
339}
340
341static std::tuple<SDValue, SDValue>
343 SDLoc DL(Disc);
344 SDValue AddrDisc;
345 SDValue ConstDisc;
346
347 // If this is a blend, remember the constant and address discriminators.
348 // Otherwise, it's either a constant discriminator, or a non-blended
349 // address discriminator.
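  // Illustrative example (not part of the original source): for a
  // discriminator built as @llvm.ptrauth.blend(i64 %addrdisc, i64 1234),
  // AddrDisc becomes %addrdisc, ConstDisc the constant 1234, and the
  // function returns (TargetConstant 1234, %addrdisc).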
350 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
351 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
352 AddrDisc = Disc->getOperand(1);
353 ConstDisc = Disc->getOperand(2);
354 } else {
355 ConstDisc = Disc;
356 }
357
358 // If the constant discriminator (either the blend RHS, or the entire
359 // discriminator value) isn't a 16-bit constant, bail out, and let the
360 // discriminator be computed separately.
361 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
362 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
363 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
364
365 // If there's no address discriminator, use NoRegister, which we'll later
366 // replace with XZR, or directly use a Z variant of the inst. when available.
367 if (!AddrDisc)
368 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
369
370 return std::make_tuple(
371 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
372 AddrDisc);
373}
374
376 const AArch64Subtarget &STI)
377 : TargetLowering(TM), Subtarget(&STI) {
378 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
379 // we have to make something up. Arbitrarily, choose ZeroOrOne.
381 // When comparing vectors the result sets the different elements in the
382 // vector to all-one or all-zero.
384
385 // Set up the register classes.
386 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
387 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
388
389 if (Subtarget->hasLS64()) {
390 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
391 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
393 }
394
395 if (Subtarget->hasFPARMv8()) {
396 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
397 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
398 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
399 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
400 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
401 }
402
403 if (Subtarget->hasNEON()) {
404 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
405 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
406
407 addDRType(MVT::v2f32);
408 addDRType(MVT::v8i8);
409 addDRType(MVT::v4i16);
410 addDRType(MVT::v2i32);
411 addDRType(MVT::v1i64);
412 addDRType(MVT::v1f64);
413 addDRType(MVT::v4f16);
414 addDRType(MVT::v4bf16);
415
416 addQRType(MVT::v4f32);
417 addQRType(MVT::v2f64);
418 addQRType(MVT::v16i8);
419 addQRType(MVT::v8i16);
420 addQRType(MVT::v4i32);
421 addQRType(MVT::v2i64);
422 addQRType(MVT::v8f16);
423 addQRType(MVT::v8bf16);
424 }
425
426 if (Subtarget->isSVEorStreamingSVEAvailable()) {
427 // Add legal sve predicate types
428 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
429 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
430 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
431 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
432 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
433
434 // Add legal sve data types
435 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
436 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
437 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
438 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
439
440 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
441 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
442 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
443 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
444 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
445 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
446
447 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
448 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
449 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
450
451 if (Subtarget->useSVEForFixedLengthVectors()) {
454 addRegisterClass(VT, &AArch64::ZPRRegClass);
455
458 addRegisterClass(VT, &AArch64::ZPRRegClass);
459 }
460 }
461
462 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
463 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
464 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
465 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
466
467 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
468 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
469 }
470
471 // Compute derived properties from the register classes
473
474 // Provide all sorts of operation actions
513
515
519
523
525
526 // Custom lowering hooks are needed for XOR
527 // to fold it into CSINC/CSINV.
530
531 // Virtually no operation on f128 is legal, but LLVM can't expand them when
532 // there's a valid register class, so we need custom operations in most cases.
557 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
558 // aren't handled.
559
560 // Lowering for many of the conversions is actually specified by the non-f128
561 // type. The LowerXXX function will be trivial when f128 isn't involved.
586 if (Subtarget->hasFPARMv8()) {
589 }
592 if (Subtarget->hasFPARMv8()) {
595 }
598
603
604 // Variable arguments.
609
610 // Variable-sized objects.
613
614 // Lowering Funnel Shifts to EXTR
619
621
622 // Constant pool entries
624
625 // BlockAddress
627
628 // AArch64 lacks both left-rotate and popcount instructions.
634 }
635
636 // AArch64 doesn't have i32 MULH{S|U}.
639
640 // AArch64 doesn't have {U|S}MUL_LOHI.
645
646 if (Subtarget->hasCSSC()) {
650
652
656
659
664
669 } else {
673
676
679 }
680
686 }
693
694 // Custom lower Add/Sub/Mul with overflow.
707
716
725 if (Subtarget->hasFullFP16()) {
728 } else {
731 }
732
733 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
741 setOperationAction(Op, MVT::f16, Promote);
742 setOperationAction(Op, MVT::v4f16, Expand);
743 setOperationAction(Op, MVT::v8f16, Expand);
744 setOperationAction(Op, MVT::bf16, Promote);
745 setOperationAction(Op, MVT::v4bf16, Expand);
746 setOperationAction(Op, MVT::v8bf16, Expand);
747 }
748
749 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
750 for (auto Op : {
754 ISD::FADD,
755 ISD::FSUB,
756 ISD::FMUL,
757 ISD::FDIV,
758 ISD::FMA,
788 })
789 setOperationAction(Op, ScalarVT, Promote);
790
791 for (auto Op : {ISD::FNEG, ISD::FABS})
792 setOperationAction(Op, ScalarVT, Legal);
793
794 // Round-to-integer operations need custom lowering for fp16, as Promote
795 // doesn't work because the result type is integer.
799 setOperationAction(Op, ScalarVT, Custom);
800
801 // promote v4f16 to v4f32 when that is known to be safe.
802 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
803 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
804 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
805 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
806 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
807 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
808 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
809 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
810 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
811 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
812 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
813 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
814
824
825 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
847 };
848
849 if (!Subtarget->hasFullFP16()) {
850 LegalizeNarrowFP(MVT::f16);
851 }
852 LegalizeNarrowFP(MVT::bf16);
855
856 // AArch64 has implementations of a lot of rounding-like FP operations.
857 for (auto Op :
868 for (MVT Ty : {MVT::f32, MVT::f64})
870 if (Subtarget->hasFullFP16())
871 setOperationAction(Op, MVT::f16, Legal);
872 }
873
874 // Basic strict FP operations are legal
877 for (MVT Ty : {MVT::f32, MVT::f64})
879 if (Subtarget->hasFullFP16())
880 setOperationAction(Op, MVT::f16, Legal);
881 }
882
883 // Strict conversion to a larger type is legal
884 for (auto VT : {MVT::f32, MVT::f64})
886
888
894
896 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
899 } else {
902 }
905
906 // Generate outline atomics library calls only if LSE was not specified for
907 // the subtarget
908 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
934#define LCALLNAMES(A, B, N) \
935 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
936 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
937 setLibcallName(A##N##_REL, #B #N "_rel"); \
938 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
939#define LCALLNAME4(A, B) \
940 LCALLNAMES(A, B, 1) \
941 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
942#define LCALLNAME5(A, B) \
943 LCALLNAMES(A, B, 1) \
944 LCALLNAMES(A, B, 2) \
945 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
946 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
947 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
948 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
949 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
950 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
951 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
952#undef LCALLNAMES
953#undef LCALLNAME4
954#undef LCALLNAME5
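  // Illustrative expansion (not part of the original source): the invocation
  // LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp) above registers, for
  // example, RTLIB::OUTLINE_ATOMIC_SWP4_ACQ_REL as "__aarch64_swp4_acq_rel".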
955 }
956
957 if (Subtarget->hasLSE128()) {
958 // Custom lowering because i128 is not legal. Must be replaced by 2x64
959 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
963 }
964
965 // 128-bit loads and stores can be done without expanding
968
969 // Aligned 128-bit loads and stores are single-copy atomic according to the
970 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
971 if (Subtarget->hasLSE2()) {
974 }
975
976 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
977 // custom lowering, as there are no un-paired non-temporal stores and
978 // legalization will break up 256 bit inputs.
980 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
981 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
982 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
987
988 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
989 // custom lowering, as there are no un-paired non-temporal loads and legalization
990 // will break up 256 bit inputs.
991 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
992 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
993 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
994 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
995 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
996 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
997 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
998 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
999
1000 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1002
1003 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1004 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1005 // Issue __sincos_stret if available.
1008 } else {
1011 }
1012
1013 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
1014 // MSVCRT doesn't have powi; fall back to pow
1015 setLibcallName(RTLIB::POWI_F32, nullptr);
1016 setLibcallName(RTLIB::POWI_F64, nullptr);
1017 }
1018
1019 // Make floating-point constants legal for the large code model, so they don't
1020 // become loads from the constant pool.
1021 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1024 }
1025
1026 // AArch64 does not have floating-point extending loads, i1 sign-extending
1027 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
1028 for (MVT VT : MVT::fp_valuetypes()) {
1029 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1030 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1031 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1032 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1033 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1034 }
1035 for (MVT VT : MVT::integer_valuetypes())
1036 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1037
1038 for (MVT WideVT : MVT::fp_valuetypes()) {
1039 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1040 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1041 setTruncStoreAction(WideVT, NarrowVT, Expand);
1042 }
1043 }
1044 }
1045
1046 if (Subtarget->hasFPARMv8()) {
1050 }
1051
1052 // Indexed loads and stores are supported.
1053 for (unsigned im = (unsigned)ISD::PRE_INC;
1055 setIndexedLoadAction(im, MVT::i8, Legal);
1056 setIndexedLoadAction(im, MVT::i16, Legal);
1057 setIndexedLoadAction(im, MVT::i32, Legal);
1058 setIndexedLoadAction(im, MVT::i64, Legal);
1059 setIndexedLoadAction(im, MVT::f64, Legal);
1060 setIndexedLoadAction(im, MVT::f32, Legal);
1061 setIndexedLoadAction(im, MVT::f16, Legal);
1062 setIndexedLoadAction(im, MVT::bf16, Legal);
1063 setIndexedStoreAction(im, MVT::i8, Legal);
1064 setIndexedStoreAction(im, MVT::i16, Legal);
1065 setIndexedStoreAction(im, MVT::i32, Legal);
1066 setIndexedStoreAction(im, MVT::i64, Legal);
1067 setIndexedStoreAction(im, MVT::f64, Legal);
1068 setIndexedStoreAction(im, MVT::f32, Legal);
1069 setIndexedStoreAction(im, MVT::f16, Legal);
1070 setIndexedStoreAction(im, MVT::bf16, Legal);
1071 }
1072
1073 // Trap.
1074 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1077
1078 // We combine OR nodes for bitfield operations.
1080 // Try to create BICs for vector ANDs.
1082
1083 // Vector add and sub nodes may conceal a high-half opportunity.
1084 // Also, try to fold ADD into CSINC/CSINV..
1087
1090
1091 // Try and combine setcc with csel
1093
1095
1102
1104
1106
1108
1112
1114
1116
1118
1120
1124
1126
1127 // In case of strict alignment, avoid an excessive number of byte wide stores.
1130 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1131
1135 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1136
1139
1142 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1143
1145
1147
1148 EnableExtLdPromotion = true;
1149
1150 // Set required alignment.
1152 // Set preferred alignments.
1153
1154 // Don't align loops on Windows. The SEH unwind info generation needs to
1155 // know the exact length of functions before the alignments have been
1156 // expanded.
1157 if (!Subtarget->isTargetWindows())
1161
1162 // Only change the limit for entries in a jump table if specified by
1163 // the subtarget, but not at the command line.
1164 unsigned MaxJT = STI.getMaximumJumpTableSize();
1165 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1167
1169
1171
1173
1174 if (Subtarget->isNeonAvailable()) {
1175 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1176 // silliness like this:
1177 // clang-format off
1178 for (auto Op :
1196 setOperationAction(Op, MVT::v1f64, Expand);
1197 // clang-format on
1198 for (auto Op :
1203 setOperationAction(Op, MVT::v1i64, Expand);
1204
1205 // AArch64 doesn't have direct vector -> f32 conversion instructions for
1206 // elements smaller than i32, so promote the input to i32 first.
1207 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1208 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1209
1210 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1211 // Or a direct i32 -> f16 vector conversion. Set it to Custom, so the
1212 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1215 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1217
1218 if (Subtarget->hasFullFP16()) {
1221
1230 } else {
1231 // when AArch64 doesn't have fullfp16 support, promote the input
1232 // to i32 first.
1233 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1234 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1235 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1236 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1237 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1238 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1239 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1240 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1241 }
1242
1243 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1244 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1251 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1256 }
1257
1258 // Custom handling for some quad-vector types to detect MULL.
1259 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1260 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1261 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1262 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1263 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1264 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1265
1266 // Saturates
1267 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1268 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1273 }
1274
1275 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1276 MVT::v4i32}) {
1283 }
1284
1285 // Vector reductions
1286 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1287 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1288 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1293
1295 }
1296 }
1297 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1298 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1307 }
1312
1314 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1315 // Likewise, narrowing and extending vector loads/stores aren't handled
1316 // directly.
1319
1320 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1323 } else {
1326 }
1329
1332
1333 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1334 setTruncStoreAction(VT, InnerVT, Expand);
1335 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1336 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1337 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1338 }
1339 }
1340
1341 // AArch64 has implementations of a lot of rounding-like FP operations.
1342 for (auto Op :
1347 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1349 if (Subtarget->hasFullFP16())
1350 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1352 }
1353
1354 // LRINT and LLRINT.
1355 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1356 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1358 if (Subtarget->hasFullFP16())
1359 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1361 }
1362
1363 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1364
1369
1373
1374 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1375 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1376 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1377 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1378 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1379 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1380
1381 // ADDP custom lowering
1382 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1384 // FADDP custom lowering
1385 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1387 } else /* !isNeonAvailable */ {
1389 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1391
1392 if (VT.is128BitVector() || VT.is64BitVector()) {
1396 Subtarget->isLittleEndian() ? Legal : Expand);
1397 }
1398 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1399 setTruncStoreAction(VT, InnerVT, Expand);
1400 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1401 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1402 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1403 }
1404 }
1405 }
1406
1407 if (Subtarget->hasSME()) {
1409 }
1410
1411 // FIXME: Move lowering for more nodes here if those are common between
1412 // SVE and SME.
1413 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1414 for (auto VT :
1415 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1420 }
1421 }
1422
1423 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1424 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1465
1471
1480
1485
1486 if (!Subtarget->isLittleEndian())
1488
1489 if (Subtarget->hasSVE2() ||
1490 (Subtarget->hasSME() && Subtarget->isStreaming()))
1491 // For SLI/SRI.
1493 }
1494
1495 // Illegal unpacked integer vector types.
1496 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1499 }
1500
1501 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1502 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1503 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1505
1506 for (auto VT :
1507 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1508 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1510
1511 for (auto VT :
1512 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1520
1524
1525 // There are no legal MVT::nxv16f## based types.
1526 if (VT != MVT::nxv16i1) {
1529 }
1530 }
1531
1532 // NEON doesn't support masked loads/stores, but SME and SVE do.
1533 for (auto VT :
1534 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1535 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1536 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1539 }
1540
1541 // Firstly, exclude all scalable vector extending loads/truncating stores,
1542 // covering both integer and floating-point scalable vectors.
1544 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1545 setTruncStoreAction(VT, InnerVT, Expand);
1546 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1547 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1548 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1549 }
1550 }
1551
1552 // Then, selectively enable those which we directly support.
1553 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1554 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1555 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1556 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1557 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1558 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1559 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1560 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1561 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1562 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1563 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1564 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1565 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1566 }
1567
1568 // SVE supports truncating stores of 64 and 128-bit vectors
1569 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1570 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1571 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1572 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1573 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1574
1575 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1576 MVT::nxv4f32, MVT::nxv2f64}) {
1615
1630
1642
1643 if (!Subtarget->isLittleEndian())
1645 }
1646
1647 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1653
1654 if (!Subtarget->isLittleEndian())
1656 }
1657
1660
1661 // NEON doesn't support integer divides, but SVE does
1662 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1663 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1666 }
1667
1668 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1669 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1670 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1671
1672 // NOTE: Currently this has to happen after computeRegisterProperties rather
1673 // than the preferred option of combining it with the addRegisterClass call.
1674 if (Subtarget->useSVEForFixedLengthVectors()) {
1677 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1678 addTypeForFixedLengthSVE(VT);
1679 }
1682 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1683 addTypeForFixedLengthSVE(VT);
1684 }
1685
1686 // 64-bit results can come from an input bigger than NEON supports.
1687 for (auto VT : {MVT::v8i8, MVT::v4i16})
1690
1691 // 128-bit results imply an input bigger than NEON supports.
1692 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1694 for (auto VT : {MVT::v8f16, MVT::v4f32})
1696
1697 // These operations are not supported on NEON but SVE can do them.
1699 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1700 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1701 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1702 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1703 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1704 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1705 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1706 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1707 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1708 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1709 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1710 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1711 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1712 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1713 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1718
1719 // Int operations with no NEON support.
1720 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1721 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1729 }
1730
1731 // Use SVE for vectors with more than 2 elements.
1732 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1734 }
1735
1736 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1737 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1738 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1739 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1740
1742
1743 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1745 }
1746
1747 // Handle operations that are only available in non-streaming SVE mode.
1748 if (Subtarget->isSVEAvailable()) {
1749 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1750 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1751 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1752 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1753 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1754 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1755 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1758 }
1759
1760 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1761 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1762 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1764
1765 // Histcnt is SVE2 only
1766 if (Subtarget->hasSVE2())
1768 Custom);
1769 }
1770
1771
1772 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1773 // Only required for llvm.aarch64.mops.memset.tag
1775 }
1776
1778
1779 if (Subtarget->hasSVE()) {
1784 }
1785
1786 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1787
1788 IsStrictFPEnabled = true;
1790
1791 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1792 // it, but it's just a wrapper around ldexp.
1793 if (Subtarget->isTargetWindows()) {
1795 if (isOperationExpand(Op, MVT::f32))
1796 setOperationAction(Op, MVT::f32, Promote);
1797 }
1798
1799 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1800 // isn't legal.
1802 if (isOperationExpand(Op, MVT::f16))
1803 setOperationAction(Op, MVT::f16, Promote);
1804
1805 if (Subtarget->isWindowsArm64EC()) {
1806 // FIXME: are there intrinsics we need to exclude from this?
1807 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1808 auto code = static_cast<RTLIB::Libcall>(i);
1809 auto libcallName = getLibcallName(code);
1810 if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1811 setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1812 }
1813 }
1814 }
1815}
1816
1817void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1818 assert(VT.isVector() && "VT should be a vector type");
1819
1820 if (VT.isFloatingPoint()) {
1822 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1823 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1824 }
1825
1826 // Mark vector float intrinsics as expand.
1827 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1838 }
1839
1840 // But we do support custom-lowering for FCOPYSIGN.
1841 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1842 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1843 VT == MVT::v8f16) &&
1844 Subtarget->hasFullFP16()))
1846
1859
1863 for (MVT InnerVT : MVT::all_valuetypes())
1864 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1865
1866 // CNT supports only B element sizes; wider element types then use UADDLP to widen.
1867 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1869
1875
1876 for (unsigned Opcode :
1879 setOperationAction(Opcode, VT, Custom);
1880
1881 if (!VT.isFloatingPoint())
1883
1884 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1885 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1886 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1887 setOperationAction(Opcode, VT, Legal);
1888
1889 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1890 // NEON types.
1891 if (VT.isFloatingPoint() &&
1892 VT.getVectorElementType() != MVT::bf16 &&
1893 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1894 for (unsigned Opcode :
1900 setOperationAction(Opcode, VT, Legal);
1901
1902 // Strict fp extend and trunc are legal
1903 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1905 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1907
1908 // FIXME: We could potentially make use of the vector comparison instructions
1910 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1910 // complications:
1911 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1912 // so we would need to expand when the condition code doesn't match the
1913 // kind of comparison.
1914 // * Some kinds of comparison require more than one FCMXY instruction so
1915 // would need to be expanded instead.
1916 // * The lowering of the non-strict versions involves target-specific ISD
1917 // nodes so we would likely need to add strict versions of all of them and
1918 // handle them appropriately.
1921
1922 if (Subtarget->isLittleEndian()) {
1923 for (unsigned im = (unsigned)ISD::PRE_INC;
1927 }
1928 }
1929
1930 if (Subtarget->hasD128()) {
1933 }
1934}
1935
1937 EVT OpVT) const {
1938 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1939 if (!Subtarget->hasSVE())
1940 return true;
1941
1942 // We can only support legal predicate result types. We can use the SVE
1943 // whilelo instruction for generating fixed-width predicates too.
1944 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1945 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1946 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1947 return true;
1948
1949 // The whilelo instruction only works with i32 or i64 scalar inputs.
1950 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1951 return true;
1952
1953 return false;
1954}
1955
1957 if (!Subtarget->isSVEorStreamingSVEAvailable())
1958 return true;
1959
1960 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
1961 // also support fixed-width predicates.
1962 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
1963 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
1964 VT != MVT::v4i1 && VT != MVT::v2i1;
1965}
1966
1967void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1968 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1969
1970 // By default everything must be expanded.
1971 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1973
1974 if (VT.isFloatingPoint()) {
1984 }
1985
1987 VT == MVT::v1f64 ? Expand : Custom;
1988
1989 // Mark integer truncating stores/extending loads as having custom lowering
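  // Illustrative (not part of the original source): for VT == MVT::v16i16 the
  // loop below visits InnerVT == MVT::v16i8 only, marking its truncating
  // store and extending loads.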
1990 if (VT.isInteger()) {
1991 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1992 while (InnerVT != VT) {
1993 setTruncStoreAction(VT, InnerVT, Default);
1994 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
1995 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
1996 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
1997 InnerVT = InnerVT.changeVectorElementType(
1998 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1999 }
2000 }
2001
2002 // Mark floating-point truncating stores/extending loads as having custom
2003 // lowering
2004 if (VT.isFloatingPoint()) {
2005 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2006 while (InnerVT != VT) {
2007 setTruncStoreAction(VT, InnerVT, Custom);
2008 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2009 InnerVT = InnerVT.changeVectorElementType(
2011 }
2012 }
2013
2014 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2015 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2016
2017 // Lower fixed length vector operations to scalable equivalents.
2022 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2059 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2060 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2062 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2081 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2107}
2108
2109void AArch64TargetLowering::addDRType(MVT VT) {
2110 addRegisterClass(VT, &AArch64::FPR64RegClass);
2111 if (Subtarget->isNeonAvailable())
2112 addTypeForNEON(VT);
2113}
2114
2115void AArch64TargetLowering::addQRType(MVT VT) {
2116 addRegisterClass(VT, &AArch64::FPR128RegClass);
2117 if (Subtarget->isNeonAvailable())
2118 addTypeForNEON(VT);
2119}
2120
2122 LLVMContext &C, EVT VT) const {
2123 if (!VT.isVector())
2124 return MVT::i32;
2125 if (VT.isScalableVector())
2126 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2128}
2129
2130// isIntImmediate - This method tests to see if the node is a constant
2131// operand. If so, Imm will receive the value.
2132static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2133 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2134 Imm = C->getZExtValue();
2135 return true;
2136 }
2137 return false;
2138}
2139
2140// isOpcWithIntImmediate - This method tests to see if the node is a specific
2141// opcode and that it has an immediate integer right operand.
2142// If so, Imm will receive the value.
2143static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2144 uint64_t &Imm) {
2145 return N->getOpcode() == Opc &&
2146 isIntImmediate(N->getOperand(1).getNode(), Imm);
2147}
2148
2149static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2150 const APInt &Demanded,
2152 unsigned NewOpc) {
2153 uint64_t OldImm = Imm, NewImm, Enc;
2154 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2155
2156 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2157 // bimm64.
2158 if (Imm == 0 || Imm == Mask ||
2160 return false;
2161
2162 unsigned EltSize = Size;
2163 uint64_t DemandedBits = Demanded.getZExtValue();
2164
2165 // Clear bits that are not demanded.
2166 Imm &= DemandedBits;
2167
2168 while (true) {
2169 // The goal here is to set the non-demanded bits in a way that minimizes
2170 // the number of transitions between 0 and 1. In order to achieve this goal,
2171 // we set the non-demanded bits to the value of the preceding demanded bits.
2172 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2173 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2174 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2175 // The final result is 0b11000011.
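    // (Illustrative aside, not from the original source: 0b11000011 works
    // because its complement within the element, 0b00111100, is a shifted
    // mask, so the encodability check further down succeeds.)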
2176 uint64_t NonDemandedBits = ~DemandedBits;
2177 uint64_t InvertedImm = ~Imm & DemandedBits;
2178 uint64_t RotatedImm =
2179 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2180 NonDemandedBits;
2181 uint64_t Sum = RotatedImm + NonDemandedBits;
2182 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2183 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2184 NewImm = (Imm | Ones) & Mask;
2185
2186 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2187 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2188 // we halve the element size and continue the search.
2189 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2190 break;
2191
2192 // We cannot shrink the element size any further if it is 2-bits.
2193 if (EltSize == 2)
2194 return false;
2195
2196 EltSize /= 2;
2197 Mask >>= EltSize;
2198 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2199
2200 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2201 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2202 return false;
2203
2204 // Merge the upper and lower halves of Imm and DemandedBits.
2205 Imm |= Hi;
2206 DemandedBits |= DemandedBitsHi;
2207 }
2208
2209 ++NumOptimizedImms;
2210
2211 // Replicate the element across the register width.
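  // Illustrative example (not from the original source): with Size == 64 and
  // a final EltSize of 16, a 16-bit pattern 0x0ff0 is replicated to
  // 0x0ff00ff00ff00ff0.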
2212 while (EltSize < Size) {
2213 NewImm |= NewImm << EltSize;
2214 EltSize *= 2;
2215 }
2216
2217 (void)OldImm;
2218 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2219 "demanded bits should never be altered");
2220 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2221
2222 // Create the new constant immediate node.
2223 EVT VT = Op.getValueType();
2224 SDLoc DL(Op);
2225 SDValue New;
2226
2227 // If the new constant immediate is all-zeros or all-ones, let the target
2228 // independent DAG combine optimize this node.
2229 if (NewImm == 0 || NewImm == OrigMask) {
2230 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2231 TLO.DAG.getConstant(NewImm, DL, VT));
2232 // Otherwise, create a machine node so that target independent DAG combine
2233 // doesn't undo this optimization.
2234 } else {
2236 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2237 New = SDValue(
2238 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2239 }
2240
2241 return TLO.CombineTo(Op, New);
2242}
2243
2245 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2246 TargetLoweringOpt &TLO) const {
2247 // Delay this optimization to as late as possible.
2248 if (!TLO.LegalOps)
2249 return false;
2250
2252 return false;
2253
2254 EVT VT = Op.getValueType();
2255 if (VT.isVector())
2256 return false;
2257
2258 unsigned Size = VT.getSizeInBits();
2259 assert((Size == 32 || Size == 64) &&
2260 "i32 or i64 is expected after legalization.");
2261
2262 // Exit early if we demand all bits.
2263 if (DemandedBits.popcount() == Size)
2264 return false;
2265
2266 unsigned NewOpc;
2267 switch (Op.getOpcode()) {
2268 default:
2269 return false;
2270 case ISD::AND:
2271 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2272 break;
2273 case ISD::OR:
2274 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2275 break;
2276 case ISD::XOR:
2277 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2278 break;
2279 }
2280 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2281 if (!C)
2282 return false;
2283 uint64_t Imm = C->getZExtValue();
2284 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2285}
2286
2287/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2288/// Mask are known to be either zero or one and return them in Known.
2290 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2291 const SelectionDAG &DAG, unsigned Depth) const {
2292 switch (Op.getOpcode()) {
2293 default:
2294 break;
2295 case AArch64ISD::DUP: {
2296 SDValue SrcOp = Op.getOperand(0);
2297 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2298 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2299 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2300 "Expected DUP implicit truncation");
2301 Known = Known.trunc(Op.getScalarValueSizeInBits());
2302 }
2303 break;
2304 }
2305 case AArch64ISD::CSEL: {
2306 KnownBits Known2;
2307 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2308 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2309 Known = Known.intersectWith(Known2);
2310 break;
2311 }
2312 case AArch64ISD::BICi: {
2313 // Compute the bit cleared value.
2314 uint64_t Mask =
2315 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
2316 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2317 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2318 break;
2319 }
2320 case AArch64ISD::VLSHR: {
2321 KnownBits Known2;
2322 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2323 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2324 Known = KnownBits::lshr(Known, Known2);
2325 break;
2326 }
2327 case AArch64ISD::VASHR: {
2328 KnownBits Known2;
2329 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2330 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2331 Known = KnownBits::ashr(Known, Known2);
2332 break;
2333 }
2334 case AArch64ISD::VSHL: {
2335 KnownBits Known2;
2336 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2337 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2338 Known = KnownBits::shl(Known, Known2);
2339 break;
2340 }
2341 case AArch64ISD::MOVI: {
2343 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2344 break;
2345 }
2347 case AArch64ISD::ADDlow: {
2348 if (!Subtarget->isTargetILP32())
2349 break;
2350 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2351 Known.Zero = APInt::getHighBitsSet(64, 32);
2352 break;
2353 }
2355 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2356 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2357 break;
2358 }
2360 Intrinsic::ID IntID =
2361 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2362 switch (IntID) {
2363 default: return;
2364 case Intrinsic::aarch64_ldaxr:
2365 case Intrinsic::aarch64_ldxr: {
2366 unsigned BitWidth = Known.getBitWidth();
2367 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2368 unsigned MemBits = VT.getScalarSizeInBits();
2369 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2370 return;
2371 }
2372 }
2373 break;
2374 }
2376 case ISD::INTRINSIC_VOID: {
2377 unsigned IntNo = Op.getConstantOperandVal(0);
2378 switch (IntNo) {
2379 default:
2380 break;
2381 case Intrinsic::aarch64_neon_uaddlv: {
2382 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2383 unsigned BitWidth = Known.getBitWidth();
2384 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2385 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2386 assert(BitWidth >= Bound && "Unexpected width!");
2388 Known.Zero |= Mask;
2389 }
2390 break;
2391 }
2392 case Intrinsic::aarch64_neon_umaxv:
2393 case Intrinsic::aarch64_neon_uminv: {
2394 // Figure out the datatype of the vector operand. The UMINV instruction
2395 // will zero extend the result, so we can mark as known zero all the
2396 // bits larger than the element datatype. 32-bit or larger doesn't need
2397 // this as those are legal types and will be handled by isel directly.
2398 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2399 unsigned BitWidth = Known.getBitWidth();
2400 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2401 assert(BitWidth >= 8 && "Unexpected width!");
2403 Known.Zero |= Mask;
2404 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2405 assert(BitWidth >= 16 && "Unexpected width!");
2407 Known.Zero |= Mask;
2408 }
2409 break;
2410 } break;
2411 }
2412 }
2413 }
2414}
2415
2417 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2418 unsigned Depth) const {
2419 EVT VT = Op.getValueType();
2420 unsigned VTBits = VT.getScalarSizeInBits();
2421 unsigned Opcode = Op.getOpcode();
2422 switch (Opcode) {
2423 case AArch64ISD::CMEQ:
2424 case AArch64ISD::CMGE:
2425 case AArch64ISD::CMGT:
2426 case AArch64ISD::CMHI:
2427 case AArch64ISD::CMHS:
2428 case AArch64ISD::FCMEQ:
2429 case AArch64ISD::FCMGE:
2430 case AArch64ISD::FCMGT:
2431 case AArch64ISD::CMEQz:
2432 case AArch64ISD::CMGEz:
2433 case AArch64ISD::CMGTz:
2434 case AArch64ISD::CMLEz:
2435 case AArch64ISD::CMLTz:
2436 case AArch64ISD::FCMEQz:
2437 case AArch64ISD::FCMGEz:
2438 case AArch64ISD::FCMGTz:
2439 case AArch64ISD::FCMLEz:
2440 case AArch64ISD::FCMLTz:
2441 // Compares return either 0 or all-ones
2442 return VTBits;
2443 }
2444
2445 return 1;
2446}
2447
2449 EVT) const {
2450 return MVT::i64;
2451}
2452
2454 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2455 unsigned *Fast) const {
2456 if (Subtarget->requiresStrictAlign())
2457 return false;
2458
2459 if (Fast) {
2460 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2461 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2462 // See comments in performSTORECombine() for more details about
2463 // these conditions.
2464
2465 // Code that uses clang vector extensions can mark that it
2466 // wants unaligned accesses to be treated as fast by
2467 // underspecifying alignment to be 1 or 2.
2468 Alignment <= 2 ||
2469
2470 // Disregard v2i64. Memcpy lowering produces those and splitting
2471 // them regresses performance on micro-benchmarks and olden/bh.
2472 VT == MVT::v2i64;
2473 }
2474 return true;
2475}
2476
2477// Same as above but handling LLTs instead.
2479 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2480 unsigned *Fast) const {
2481 if (Subtarget->requiresStrictAlign())
2482 return false;
2483
2484 if (Fast) {
2485 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2486 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2487 Ty.getSizeInBytes() != 16 ||
2488 // See comments in performSTORECombine() for more details about
2489 // these conditions.
2490
2491 // Code that uses clang vector extensions can mark that it
2492 // wants unaligned accesses to be treated as fast by
2493 // underspecifying alignment to be 1 or 2.
2494 Alignment <= 2 ||
2495
2496 // Disregard v2i64. Memcpy lowering produces those and splitting
2497 // them regresses performance on micro-benchmarks and olden/bh.
2498 Ty == LLT::fixed_vector(2, 64);
2499 }
2500 return true;
2501}
2502
2503FastISel *
2505 const TargetLibraryInfo *libInfo) const {
2506 return AArch64::createFastISel(funcInfo, libInfo);
2507}
2508
2509const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2510#define MAKE_CASE(V) \
2511 case V: \
2512 return #V;
2513 switch ((AArch64ISD::NodeType)Opcode) {
2515 break;
2839 }
2840#undef MAKE_CASE
2841 return nullptr;
2842}
2843
2846 MachineBasicBlock *MBB) const {
2847 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2848 // phi node:
2849
2850 // OrigBB:
2851 // [... previous instrs leading to comparison ...]
2852 // b.ne TrueBB
2853 // b EndBB
2854 // TrueBB:
2855 // ; Fallthrough
2856 // EndBB:
2857 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2858
2859 MachineFunction *MF = MBB->getParent();
2860 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2861 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2862 DebugLoc DL = MI.getDebugLoc();
2864
2865 Register DestReg = MI.getOperand(0).getReg();
2866 Register IfTrueReg = MI.getOperand(1).getReg();
2867 Register IfFalseReg = MI.getOperand(2).getReg();
2868 unsigned CondCode = MI.getOperand(3).getImm();
2869 bool NZCVKilled = MI.getOperand(4).isKill();
2870
2871 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2872 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2873 MF->insert(It, TrueBB);
2874 MF->insert(It, EndBB);
2875
2876 // Transfer rest of current basic-block to EndBB
2877 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2878 MBB->end());
2879  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
2880
2881 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2882 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2883 MBB->addSuccessor(TrueBB);
2884 MBB->addSuccessor(EndBB);
2885
2886 // TrueBB falls through to the end.
2887 TrueBB->addSuccessor(EndBB);
2888
2889 if (!NZCVKilled) {
2890 TrueBB->addLiveIn(AArch64::NZCV);
2891 EndBB->addLiveIn(AArch64::NZCV);
2892 }
2893
2894 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2895 .addReg(IfTrueReg)
2896 .addMBB(TrueBB)
2897 .addReg(IfFalseReg)
2898 .addMBB(MBB);
2899
2900 MI.eraseFromParent();
2901 return EndBB;
2902}
2903
2904MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2905    MachineInstr &MI, MachineBasicBlock *BB) const {
2906  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2907             BB->getParent()->getFunction().getPersonalityFn())) &&
2908 "SEH does not use catchret!");
2909 return BB;
2910}
2911
2912MachineBasicBlock *
2913AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
2914                                              MachineBasicBlock *MBB) const {
2915 MachineFunction &MF = *MBB->getParent();
2916 MachineBasicBlock::iterator MBBI = MI.getIterator();
2917  DebugLoc DL = MBB->findDebugLoc(MBBI);
2918  const AArch64InstrInfo &TII =
2919 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2920 Register TargetReg = MI.getOperand(0).getReg();
2921  MachineBasicBlock::iterator NextInst =
2922      TII.probedStackAlloc(MBBI, TargetReg, false);
2923
2924 MI.eraseFromParent();
2925 return NextInst->getParent();
2926}
2927
2928MachineBasicBlock *
2929AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2930                                    MachineInstr &MI,
2931                                    MachineBasicBlock *BB) const {
2932 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2933 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2934
2935 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2936 MIB.add(MI.getOperand(1)); // slice index register
2937 MIB.add(MI.getOperand(2)); // slice index offset
2938 MIB.add(MI.getOperand(3)); // pg
2939 MIB.add(MI.getOperand(4)); // base
2940 MIB.add(MI.getOperand(5)); // offset
2941
2942 MI.eraseFromParent(); // The pseudo is gone now.
2943 return BB;
2944}
2945
2946MachineBasicBlock *
2947AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
2948  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2949  MachineInstrBuilder MIB =
2950      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2951
2952 MIB.addReg(AArch64::ZA, RegState::Define);
2953 MIB.add(MI.getOperand(0)); // Vector select register
2954 MIB.add(MI.getOperand(1)); // Vector select offset
2955 MIB.add(MI.getOperand(2)); // Base
2956 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2957
2958 MI.eraseFromParent(); // The pseudo is gone now.
2959 return BB;
2960}
2961
2962MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI,
2963                                                      MachineBasicBlock *BB,
2964                                                      unsigned Opcode,
2965 bool Op0IsDef) const {
2966 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2967  MachineInstrBuilder MIB;
2968
2969 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
2970 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
2971 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2972 MIB.add(MI.getOperand(I));
2973
2974 MI.eraseFromParent(); // The pseudo is gone now.
2975 return BB;
2976}
2977
2978MachineBasicBlock *
2979AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2980                                   MachineInstr &MI,
2981                                   MachineBasicBlock *BB) const {
2982 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2983 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2984 unsigned StartIdx = 0;
2985
2986 bool HasTile = BaseReg != AArch64::ZA;
2987 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
2988 if (HasZPROut) {
2989 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
2990 ++StartIdx;
2991 }
2992 if (HasTile) {
2993 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
2994 RegState::Define); // Output ZA Tile
2995 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
2996 StartIdx++;
2997 } else {
2998 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
2999 }
3000 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3001 MIB.add(MI.getOperand(I));
3002
3003 MI.eraseFromParent(); // The pseudo is gone now.
3004 return BB;
3005}
3006
3007MachineBasicBlock *
3008AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
3009  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3010  MachineInstrBuilder MIB =
3011      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3012 MIB.add(MI.getOperand(0)); // Mask
3013
3014 unsigned Mask = MI.getOperand(0).getImm();
3015 for (unsigned I = 0; I < 8; I++) {
3016 if (Mask & (1 << I))
3017 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3018 }
3019
3020 MI.eraseFromParent(); // The pseudo is gone now.
3021 return BB;
3022}
3023
3024MachineBasicBlock *
3025AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
3026                                            MachineBasicBlock *BB) const {
3027 MachineFunction *MF = BB->getParent();
3028 MachineFrameInfo &MFI = MF->getFrameInfo();
3029  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3030  TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3031 if (TPIDR2.Uses > 0) {
3032 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3033 // Store the buffer pointer to the TPIDR2 stack object.
3034 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
3035 .addReg(MI.getOperand(0).getReg())
3036 .addFrameIndex(TPIDR2.FrameIndex)
3037 .addImm(0);
3038 // Set the reserved bytes (10-15) to zero
3039 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
3040 .addReg(AArch64::WZR)
3041 .addFrameIndex(TPIDR2.FrameIndex)
3042 .addImm(5);
3043 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
3044 .addReg(AArch64::WZR)
3045 .addFrameIndex(TPIDR2.FrameIndex)
3046 .addImm(3);
3047 } else
3048 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3049
3050 BB->remove_instr(&MI);
3051 return BB;
3052}
3053
3054MachineBasicBlock *
3055AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI,
3056                                            MachineBasicBlock *BB) const {
3057 MachineFunction *MF = BB->getParent();
3058 MachineFrameInfo &MFI = MF->getFrameInfo();
3059  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3060  // TODO This function grows the stack with a subtraction, which doesn't work
3061 // on Windows. Some refactoring to share the functionality in
3062 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3063 // supports SME
3064  assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() &&
3065         "Lazy ZA save is not yet supported on Windows");
3066
3067 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3068
3069 if (TPIDR2.Uses > 0) {
3070 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3071    MachineRegisterInfo &MRI = MF->getRegInfo();
3072
3073 // The SUBXrs below won't always be emitted in a form that accepts SP
3074 // directly
3075 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3076 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3077 .addReg(AArch64::SP);
3078
3079 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3080 auto Size = MI.getOperand(1).getReg();
3081 auto Dest = MI.getOperand(0).getReg();
3082 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3083 .addReg(Size)
3084 .addReg(Size)
3085 .addReg(SP);
3086 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3087 AArch64::SP)
3088 .addReg(Dest);
3089
3090 // We have just allocated a variable sized object, tell this to PEI.
3091 MFI.CreateVariableSizedObject(Align(16), nullptr);
3092 }
3093
3094 BB->remove_instr(&MI);
3095 return BB;
3096}
3097
3098MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
3099    MachineInstr &MI, MachineBasicBlock *BB) const {
3100
3101 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3102 if (SMEOrigInstr != -1) {
3103 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3104 uint64_t SMEMatrixType =
3105 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3106 switch (SMEMatrixType) {
3107    case (AArch64::SMEMatrixArray):
3108      return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3109    case (AArch64::SMEMatrixTileB):
3110      return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3111    case (AArch64::SMEMatrixTileH):
3112      return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3113    case (AArch64::SMEMatrixTileS):
3114      return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3115    case (AArch64::SMEMatrixTileD):
3116      return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3117    case (AArch64::SMEMatrixTileQ):
3118      return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3119 }
3120 }
3121
3122 switch (MI.getOpcode()) {
3123 default:
3124#ifndef NDEBUG
3125 MI.dump();
3126#endif
3127 llvm_unreachable("Unexpected instruction for custom inserter!");
3128 case AArch64::InitTPIDR2Obj:
3129 return EmitInitTPIDR2Object(MI, BB);
3130 case AArch64::AllocateZABuffer:
3131 return EmitAllocateZABuffer(MI, BB);
3132 case AArch64::F128CSEL:
3133 return EmitF128CSEL(MI, BB);
3134 case TargetOpcode::STATEPOINT:
3135 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
3136    // while the bl call instruction (where the statepoint is lowered at the end)
3137    // has an implicit def. This def is early-clobber as it will be set at
3138 // the moment of the call and earlier than any use is read.
3139 // Add this implicit dead def here as a workaround.
3140    MI.addOperand(*MI.getMF(),
3141                  MachineOperand::CreateReg(
3142                      AArch64::LR, /*isDef*/ true,
3143 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3144 /*isUndef*/ false, /*isEarlyClobber*/ true));
3145 [[fallthrough]];
3146 case TargetOpcode::STACKMAP:
3147 case TargetOpcode::PATCHPOINT:
3148 return emitPatchPoint(MI, BB);
3149
3150 case TargetOpcode::PATCHABLE_EVENT_CALL:
3151 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3152 return BB;
3153
3154 case AArch64::CATCHRET:
3155 return EmitLoweredCatchRet(MI, BB);
3156
3157 case AArch64::PROBED_STACKALLOC_DYN:
3158 return EmitDynamicProbedAlloc(MI, BB);
3159
3160 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3161 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3162 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3163 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3164 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3165 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3166 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3167 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3168 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3169 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3170 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3171 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3172 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3173 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3174 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3175 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3176 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3177 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3178 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3179 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3180 case AArch64::LDR_ZA_PSEUDO:
3181 return EmitFill(MI, BB);
3182 case AArch64::LDR_TX_PSEUDO:
3183 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3184 case AArch64::STR_TX_PSEUDO:
3185 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3186 case AArch64::ZERO_M_PSEUDO:
3187 return EmitZero(MI, BB);
3188 case AArch64::ZERO_T_PSEUDO:
3189 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3190 }
3191}
3192
3193//===----------------------------------------------------------------------===//
3194// AArch64 Lowering private implementation.
3195//===----------------------------------------------------------------------===//
3196
3197//===----------------------------------------------------------------------===//
3198// Lowering Code
3199//===----------------------------------------------------------------------===//
3200
3201// Forward declarations of SVE fixed length lowering helpers
3206 SelectionDAG &DAG);
3209 EVT VT);
3210
3211/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3212static bool isZerosVector(const SDNode *N) {
3213 // Look through a bit convert.
3214 while (N->getOpcode() == ISD::BITCAST)
3215 N = N->getOperand(0).getNode();
3216
3217  if (ISD::isConstantSplatVectorAllZeros(N))
3218    return true;
3219
3220 if (N->getOpcode() != AArch64ISD::DUP)
3221 return false;
3222
3223 auto Opnd0 = N->getOperand(0);
3224 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3225}
3226
3227/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3228/// CC
3229static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3230  switch (CC) {
3231 default:
3232 llvm_unreachable("Unknown condition code!");
3233 case ISD::SETNE:
3234 return AArch64CC::NE;
3235 case ISD::SETEQ:
3236 return AArch64CC::EQ;
3237 case ISD::SETGT:
3238 return AArch64CC::GT;
3239 case ISD::SETGE:
3240 return AArch64CC::GE;
3241 case ISD::SETLT:
3242 return AArch64CC::LT;
3243 case ISD::SETLE:
3244 return AArch64CC::LE;
3245 case ISD::SETUGT:
3246 return AArch64CC::HI;
3247 case ISD::SETUGE:
3248 return AArch64CC::HS;
3249 case ISD::SETULT:
3250 return AArch64CC::LO;
3251 case ISD::SETULE:
3252 return AArch64CC::LS;
3253 }
3254}
3255
3256/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3257static void changeFPCCToAArch64CC(ISD::CondCode CC,
3258                                  AArch64CC::CondCode &CondCode,
3259 AArch64CC::CondCode &CondCode2) {
3260 CondCode2 = AArch64CC::AL;
3261 switch (CC) {
3262 default:
3263 llvm_unreachable("Unknown FP condition!");
3264 case ISD::SETEQ:
3265 case ISD::SETOEQ:
3266 CondCode = AArch64CC::EQ;
3267 break;
3268 case ISD::SETGT:
3269 case ISD::SETOGT:
3270 CondCode = AArch64CC::GT;
3271 break;
3272 case ISD::SETGE:
3273 case ISD::SETOGE:
3274 CondCode = AArch64CC::GE;
3275 break;
3276 case ISD::SETOLT:
3277 CondCode = AArch64CC::MI;
3278 break;
3279 case ISD::SETOLE:
3280 CondCode = AArch64CC::LS;
3281 break;
3282 case ISD::SETONE:
3283 CondCode = AArch64CC::MI;
3284 CondCode2 = AArch64CC::GT;
3285 break;
3286 case ISD::SETO:
3287 CondCode = AArch64CC::VC;
3288 break;
3289 case ISD::SETUO:
3290 CondCode = AArch64CC::VS;
3291 break;
3292 case ISD::SETUEQ:
3293 CondCode = AArch64CC::EQ;
3294 CondCode2 = AArch64CC::VS;
3295 break;
3296 case ISD::SETUGT:
3297 CondCode = AArch64CC::HI;
3298 break;
3299 case ISD::SETUGE:
3300 CondCode = AArch64CC::PL;
3301 break;
3302 case ISD::SETLT:
3303 case ISD::SETULT:
3304 CondCode = AArch64CC::LT;
3305 break;
3306 case ISD::SETLE:
3307 case ISD::SETULE:
3308 CondCode = AArch64CC::LE;
3309 break;
3310 case ISD::SETNE:
3311 case ISD::SETUNE:
3312 CondCode = AArch64CC::NE;
3313 break;
3314 }
3315}
3316
3317/// Convert a DAG fp condition code to an AArch64 CC.
3318/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3319/// should be AND'ed instead of OR'ed.
3320static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3321                                     AArch64CC::CondCode &CondCode,
3322 AArch64CC::CondCode &CondCode2) {
3323 CondCode2 = AArch64CC::AL;
3324 switch (CC) {
3325 default:
3326 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3327 assert(CondCode2 == AArch64CC::AL);
3328 break;
3329 case ISD::SETONE:
3330 // (a one b)
3331 // == ((a olt b) || (a ogt b))
3332 // == ((a ord b) && (a une b))
3333 CondCode = AArch64CC::VC;
3334 CondCode2 = AArch64CC::NE;
3335 break;
3336 case ISD::SETUEQ:
3337 // (a ueq b)
3338 // == ((a uno b) || (a oeq b))
3339 // == ((a ule b) && (a uge b))
3340 CondCode = AArch64CC::PL;
3341 CondCode2 = AArch64CC::LE;
3342 break;
3343 }
3344}
3345
3346/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3347/// CC usable with the vector instructions. Fewer operations are available
3348/// without a real NZCV register, so we have to use less efficient combinations
3349/// to get the same effect.
3350static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3351                                        AArch64CC::CondCode &CondCode,
3352 AArch64CC::CondCode &CondCode2,
3353 bool &Invert) {
3354 Invert = false;
3355 switch (CC) {
3356 default:
3357 // Mostly the scalar mappings work fine.
3358 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3359 break;
3360 case ISD::SETUO:
3361 Invert = true;
3362 [[fallthrough]];
3363 case ISD::SETO:
3364 CondCode = AArch64CC::MI;
3365 CondCode2 = AArch64CC::GE;
3366 break;
3367 case ISD::SETUEQ:
3368 case ISD::SETULT:
3369 case ISD::SETULE:
3370 case ISD::SETUGT:
3371 case ISD::SETUGE:
3372 // All of the compare-mask comparisons are ordered, but we can switch
3373 // between the two by a double inversion. E.g. ULE == !OGT.
3374 Invert = true;
3375 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3376 CondCode, CondCode2);
3377 break;
3378 }
3379}
3380
3381static bool isLegalArithImmed(uint64_t C) {
3382  // Matches AArch64DAGToDAGISel::SelectArithImmed().
3383 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3384 LLVM_DEBUG(dbgs() << "Is imm " << C
3385 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3386 return IsLegal;
3387}
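// For illustration, a few sample values under the rule above:
//   C = 0xFFF      -> legal      (fits in the low 12 bits)
//   C = 0x1000     -> legal      (low 12 bits clear, value 1 shifted by 12)
//   C = 0x1001     -> not legal  (needs bits in both the low and high parts)
//   C = 0xFFF000   -> legal      (0xFFF shifted left by 12)
//   C = 0x1000000  -> not legal  (bit 24 set, too wide even when shifted)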
3388
3389// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3390// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3391// can be set differently by this operation. It comes down to whether
3392// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3393// everything is fine. If not then the optimization is wrong. Thus general
3394// comparisons are only valid if op2 != 0.
3395//
3396// So, finally, the only LLVM-native comparisons that don't mention C and V
3397// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
3398// the absence of information about op2.
3399static bool isCMN(SDValue Op, ISD::CondCode CC) {
3400  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3401 (CC == ISD::SETEQ || CC == ISD::SETNE);
3402}
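// For example, (seteq w0, (sub 0, w1)) can become (ADDS wzr, w0, w1), i.e.
// "cmn w0, w1": the result, and therefore the Z flag, is identical for the
// SUBS and ADDS forms. For an ordered comparison such as setlt the C and V
// flags of the two forms can disagree (e.g. when op2 is 0), which is why only
// SETEQ/SETNE are accepted here.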
3403
3404static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3405                                      SelectionDAG &DAG, SDValue Chain,
3406 bool IsSignaling) {
3407 EVT VT = LHS.getValueType();
3408 assert(VT != MVT::f128);
3409
3410 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3411
3412 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3413 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3414 {Chain, LHS});
3415 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3416 {LHS.getValue(1), RHS});
3417 Chain = RHS.getValue(1);
3418 VT = MVT::f32;
3419 }
3420  unsigned Opcode =
3421      IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3422  return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
3423}
3424
3425static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3426                              const SDLoc &dl, SelectionDAG &DAG) {
3427 EVT VT = LHS.getValueType();
3428 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3429
3430 if (VT.isFloatingPoint()) {
3431 assert(VT != MVT::f128);
3432 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3433 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3434 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3435 VT = MVT::f32;
3436 }
3437 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
3438 }
3439
3440 // The CMP instruction is just an alias for SUBS, and representing it as
3441 // SUBS means that it's possible to get CSE with subtract operations.
3442 // A later phase can perform the optimization of setting the destination
3443 // register to WZR/XZR if it ends up being unused.
3444 unsigned Opcode = AArch64ISD::SUBS;
3445
3446 if (isCMN(RHS, CC)) {
3447    // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3448 Opcode = AArch64ISD::ADDS;
3449 RHS = RHS.getOperand(1);
3450 } else if (isCMN(LHS, CC)) {
3451    // As we are looking for EQ/NE compares, the operands can be commuted; can
3452 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
3453 Opcode = AArch64ISD::ADDS;
3454 LHS = LHS.getOperand(1);
3455 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3456 if (LHS.getOpcode() == ISD::AND) {
3457 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3458 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3459 // of the signed comparisons.
3460 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3461 DAG.getVTList(VT, MVT_CC),
3462 LHS.getOperand(0),
3463 LHS.getOperand(1));
3464 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3465 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3466 return ANDSNode.getValue(1);
3467 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3468 // Use result of ANDS
3469 return LHS.getValue(1);
3470 }
3471 }
3472
3473 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3474 .getValue(1);
3475}
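// A rough picture of the flag-producing nodes emitComparison builds, shown
// with the usual instruction aliases (illustrative, not exact final codegen):
//   (setlt w0, w1)            -> SUBS wzr, w0, w1      ; "cmp w0, w1"
//   (seteq w0, (sub 0, w1))   -> ADDS wzr, w0, w1      ; "cmn w0, w1"
//   (seteq (and w0, 0xff), 0) -> ANDS wzr, w0, #0xff   ; "tst w0, #0xff"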
3476
3477/// \defgroup AArch64CCMP CMP;CCMP matching
3478///
3479/// These functions deal with the formation of CMP;CCMP;... sequences.
3480/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3481/// a comparison. They set the NZCV flags to a predefined value if their
3482/// predicate is false. This allows us to express arbitrary conjunctions, for
3483/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3484/// expressed as:
3485/// cmp A
3486/// ccmp B, inv(CB), CA
3487/// check for CB flags
3488///
3489/// This naturally lets us implement chains of AND operations with SETCC
3490/// operands. And we can even implement some other situations by transforming
3491/// them:
3492/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3493/// negating the flags used in a CCMP/FCCMP operations.
3494/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3495/// by negating the flags we test for afterwards. i.e.
3496/// NEG (CMP CCMP CCCMP ...) can be implemented.
3497/// - Note that we can only ever negate all previously processed results.
3498/// What we can not implement by flipping the flags to test is a negation
3499/// of two sub-trees (because the negation affects all sub-trees emitted so
3500/// far, so the 2nd sub-tree we emit would also affect the first).
3501/// With those tools we can implement some OR operations:
3502/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3503/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3504/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3505/// elimination rules from earlier to implement the whole thing as a
3506/// CCMP/FCCMP chain.
3507///
3508/// As complete example:
3509/// or (or (setCA (cmp A)) (setCB (cmp B)))
3510/// (and (setCC (cmp C)) (setCD (cmp D)))"
3511/// can be reassociated to:
3512///      or (and (setCC (cmp C)) (setCD (cmp D)))
3513///         (or (setCA (cmp A)) (setCB (cmp B)))
3514/// can be transformed to:
3515/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3516/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3517/// which can be implemented as:
3518/// cmp C
3519/// ccmp D, inv(CD), CC
3520/// ccmp A, CA, inv(CD)
3521/// ccmp B, CB, inv(CA)
3522/// check for CB flags
3523///
3524/// A counterexample is "or (and A B) (and C D)" which translates to
3525/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3526/// can only implement 1 of the inner (not) operations, but not both!
3527/// @{
3528
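// As a small schematic illustration of the scheme above (register choices are
// arbitrary and the exact NZCV immediate is computed by
// getNZCVToSatisfyCondCode), "(a > b) && (c < d)" can be emitted as:
//   cmp  w0, w1           // compare a and b; "gt" holds if a > b
//   ccmp w2, w3, #0, gt   // if gt held, compare c and d; otherwise force
//                         // NZCV to a value that fails the final "lt" test
//   cset w8, lt           // the i1 result of the conjunction
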
3529/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3530static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3531                                         ISD::CondCode CC, SDValue CCOp,
3532 AArch64CC::CondCode Predicate,
3533 AArch64CC::CondCode OutCC,
3534 const SDLoc &DL, SelectionDAG &DAG) {
3535 unsigned Opcode = 0;
3536 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3537
3538 if (LHS.getValueType().isFloatingPoint()) {
3539 assert(LHS.getValueType() != MVT::f128);
3540 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3541 LHS.getValueType() == MVT::bf16) {
3542 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3543 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3544 }
3545 Opcode = AArch64ISD::FCCMP;
3546 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3547 APInt Imm = Const->getAPIntValue();
3548 if (Imm.isNegative() && Imm.sgt(-32)) {
3549 Opcode = AArch64ISD::CCMN;
3550 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3551 }
3552 } else if (RHS.getOpcode() == ISD::SUB) {
3553 SDValue SubOp0 = RHS.getOperand(0);
3554 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3555 // See emitComparison() on why we can only do this for SETEQ and SETNE.
3556 Opcode = AArch64ISD::CCMN;
3557 RHS = RHS.getOperand(1);
3558 }
3559 }
3560 if (Opcode == 0)
3561 Opcode = AArch64ISD::CCMP;
3562
3563 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3564  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3565  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3566 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3567 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3568}
3569
3570/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3571/// expressed as a conjunction. See \ref AArch64CCMP.
3572/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3573/// changing the conditions on the SETCC tests.
3574/// (this means we can call emitConjunctionRec() with
3575/// Negate==true on this sub-tree)
3576/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3577/// cannot do the negation naturally. We are required to
3578/// emit the subtree first in this case.
3579/// \param WillNegate Is true if we are called when the result of this
3580/// subexpression must be negated. This happens when the
3581/// outer expression is an OR. We can use this fact to know
3582/// that we have a double negation (or (or ...) ...) that
3583/// can be implemented for free.
3584static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3585 bool &MustBeFirst, bool WillNegate,
3586 unsigned Depth = 0) {
3587 if (!Val.hasOneUse())
3588 return false;
3589 unsigned Opcode = Val->getOpcode();
3590 if (Opcode == ISD::SETCC) {
3591 if (Val->getOperand(0).getValueType() == MVT::f128)
3592 return false;
3593 CanNegate = true;
3594 MustBeFirst = false;
3595 return true;
3596 }
3597 // Protect against exponential runtime and stack overflow.
3598 if (Depth > 6)
3599 return false;
3600 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3601 bool IsOR = Opcode == ISD::OR;
3602 SDValue O0 = Val->getOperand(0);
3603 SDValue O1 = Val->getOperand(1);
3604 bool CanNegateL;
3605 bool MustBeFirstL;
3606 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3607 return false;
3608 bool CanNegateR;
3609 bool MustBeFirstR;
3610 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3611 return false;
3612
3613 if (MustBeFirstL && MustBeFirstR)
3614 return false;
3615
3616 if (IsOR) {
3617 // For an OR expression we need to be able to naturally negate at least
3618 // one side or we cannot do the transformation at all.
3619 if (!CanNegateL && !CanNegateR)
3620 return false;
3621      // If the result of the OR will be negated and we can naturally negate
3622      // the leaves, then this sub-tree as a whole negates naturally.
3623 CanNegate = WillNegate && CanNegateL && CanNegateR;
3624 // If we cannot naturally negate the whole sub-tree, then this must be
3625 // emitted first.
3626 MustBeFirst = !CanNegate;
3627 } else {
3628 assert(Opcode == ISD::AND && "Must be OR or AND");
3629 // We cannot naturally negate an AND operation.
3630 CanNegate = false;
3631 MustBeFirst = MustBeFirstL || MustBeFirstR;
3632 }
3633 return true;
3634 }
3635 return false;
3636}
3637
3638/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3639/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
3640/// Tries to transform the given i1 producing node @p Val to a series of compare
3641/// and conditional compare operations. @returns an NZCV flags producing node
3642/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
3643/// transformation was not possible.
3644/// \p Negate is true if we want this sub-tree being negated just by changing
3645/// SETCC conditions.
3646static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3647    AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3648 AArch64CC::CondCode Predicate) {
3649 // We're at a tree leaf, produce a conditional comparison operation.
3650 unsigned Opcode = Val->getOpcode();
3651 if (Opcode == ISD::SETCC) {
3652 SDValue LHS = Val->getOperand(0);
3653 SDValue RHS = Val->getOperand(1);
3654 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3655 bool isInteger = LHS.getValueType().isInteger();
3656 if (Negate)
3657 CC = getSetCCInverse(CC, LHS.getValueType());
3658 SDLoc DL(Val);
3659 // Determine OutCC and handle FP special case.
3660 if (isInteger) {
3661 OutCC = changeIntCCToAArch64CC(CC);
3662 } else {
3663 assert(LHS.getValueType().isFloatingPoint());
3664 AArch64CC::CondCode ExtraCC;
3665 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3666 // Some floating point conditions can't be tested with a single condition
3667 // code. Construct an additional comparison in this case.
3668 if (ExtraCC != AArch64CC::AL) {
3669 SDValue ExtraCmp;
3670 if (!CCOp.getNode())
3671 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3672 else
3673 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3674 ExtraCC, DL, DAG);
3675 CCOp = ExtraCmp;
3676 Predicate = ExtraCC;
3677 }
3678 }
3679
3680 // Produce a normal comparison if we are first in the chain
3681 if (!CCOp)
3682 return emitComparison(LHS, RHS, CC, DL, DAG);
3683 // Otherwise produce a ccmp.
3684 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3685 DAG);
3686 }
3687 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3688
3689 bool IsOR = Opcode == ISD::OR;
3690
3691 SDValue LHS = Val->getOperand(0);
3692 bool CanNegateL;
3693 bool MustBeFirstL;
3694 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3695 assert(ValidL && "Valid conjunction/disjunction tree");
3696 (void)ValidL;
3697
3698 SDValue RHS = Val->getOperand(1);
3699 bool CanNegateR;
3700 bool MustBeFirstR;
3701 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3702 assert(ValidR && "Valid conjunction/disjunction tree");
3703 (void)ValidR;
3704
3705 // Swap sub-tree that must come first to the right side.
3706 if (MustBeFirstL) {
3707 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3708 std::swap(LHS, RHS);
3709 std::swap(CanNegateL, CanNegateR);
3710 std::swap(MustBeFirstL, MustBeFirstR);
3711 }
3712
3713 bool NegateR;
3714 bool NegateAfterR;
3715 bool NegateL;
3716 bool NegateAfterAll;
3717 if (Opcode == ISD::OR) {
3718 // Swap the sub-tree that we can negate naturally to the left.
3719 if (!CanNegateL) {
3720 assert(CanNegateR && "at least one side must be negatable");
3721 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3722 assert(!Negate);
3723 std::swap(LHS, RHS);
3724 NegateR = false;
3725 NegateAfterR = true;
3726 } else {
3727 // Negate the left sub-tree if possible, otherwise negate the result.
3728 NegateR = CanNegateR;
3729 NegateAfterR = !CanNegateR;
3730 }
3731 NegateL = true;
3732 NegateAfterAll = !Negate;
3733 } else {
3734 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3735 assert(!Negate && "Valid conjunction/disjunction tree");
3736
3737 NegateL = false;
3738 NegateR = false;
3739 NegateAfterR = false;
3740 NegateAfterAll = false;
3741 }
3742
3743 // Emit sub-trees.
3744 AArch64CC::CondCode RHSCC;
3745 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3746 if (NegateAfterR)
3747 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3748 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3749 if (NegateAfterAll)
3750 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3751 return CmpL;
3752}
3753
3754/// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
3755/// In some cases this is even possible with OR operations in the expression.
3756/// See \ref AArch64CCMP.
3757/// \see emitConjunctionRec().
3758static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3759                               AArch64CC::CondCode &OutCC) {
3760 bool DummyCanNegate;
3761 bool DummyMustBeFirst;
3762 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3763 return SDValue();
3764
3765 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3766}
3767
3768/// @}
3769
3770/// Returns how profitable it is to fold a comparison's operand's shift and/or
3771/// extension operations.
3772static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3773  auto isSupportedExtend = [&](SDValue V) {
3774 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3775 return true;
3776
3777 if (V.getOpcode() == ISD::AND)
3778 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3779 uint64_t Mask = MaskCst->getZExtValue();
3780 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3781 }
3782
3783 return false;
3784 };
3785
3786 if (!Op.hasOneUse())
3787 return 0;
3788
3789 if (isSupportedExtend(Op))
3790 return 1;
3791
3792 unsigned Opc = Op.getOpcode();
3793 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3794 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3795 uint64_t Shift = ShiftCst->getZExtValue();
3796 if (isSupportedExtend(Op.getOperand(0)))
3797 return (Shift <= 4) ? 2 : 1;
3798 EVT VT = Op.getValueType();
3799 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3800 return 1;
3801 }
3802
3803 return 0;
3804}
3805
3806static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3807                             SDValue &AArch64cc, SelectionDAG &DAG,
3808 const SDLoc &dl) {
3809 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3810 EVT VT = RHS.getValueType();
3811 uint64_t C = RHSC->getZExtValue();
3812 if (!isLegalArithImmed(C)) {
3813 // Constant does not fit, try adjusting it by one?
3814 switch (CC) {
3815 default:
3816 break;
3817 case ISD::SETLT:
3818 case ISD::SETGE:
3819 if ((VT == MVT::i32 && C != 0x80000000 &&
3820 isLegalArithImmed((uint32_t)(C - 1))) ||
3821 (VT == MVT::i64 && C != 0x80000000ULL &&
3822 isLegalArithImmed(C - 1ULL))) {
3823          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3824          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3825 RHS = DAG.getConstant(C, dl, VT);
3826 }
3827 break;
3828 case ISD::SETULT:
3829 case ISD::SETUGE:
3830 if ((VT == MVT::i32 && C != 0 &&
3831 isLegalArithImmed((uint32_t)(C - 1))) ||
3832 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3833          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3834          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3835 RHS = DAG.getConstant(C, dl, VT);
3836 }
3837 break;
3838 case ISD::SETLE:
3839 case ISD::SETGT:
3840 if ((VT == MVT::i32 && C != INT32_MAX &&
3841 isLegalArithImmed((uint32_t)(C + 1))) ||
3842 (VT == MVT::i64 && C != INT64_MAX &&
3843 isLegalArithImmed(C + 1ULL))) {
3844          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3845          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3846 RHS = DAG.getConstant(C, dl, VT);
3847 }
3848 break;
3849 case ISD::SETULE:
3850 case ISD::SETUGT:
3851 if ((VT == MVT::i32 && C != UINT32_MAX &&
3852 isLegalArithImmed((uint32_t)(C + 1))) ||
3853 (VT == MVT::i64 && C != UINT64_MAX &&
3854 isLegalArithImmed(C + 1ULL))) {
3855          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3856          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3857 RHS = DAG.getConstant(C, dl, VT);
3858 }
3859 break;
3860 }
3861 }
3862 }
3863
3864 // Comparisons are canonicalized so that the RHS operand is simpler than the
3865 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3866 // can fold some shift+extend operations on the RHS operand, so swap the
3867 // operands if that can be done.
3868 //
3869 // For example:
3870 // lsl w13, w11, #1
3871 // cmp w13, w12
3872 // can be turned into:
3873 // cmp w12, w11, lsl #1
3874 if (!isa<ConstantSDNode>(RHS) || !isLegalArithImmed(RHS->getAsZExtVal())) {
3875 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3876
3877    if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
3878      std::swap(LHS, RHS);
3879      CC = ISD::getSetCCSwappedOperands(CC);
3880    }
3881 }
3882
3883 SDValue Cmp;
3884 AArch64CC::CondCode AArch64CC;
3885 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3886 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3887
3888 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3889 // For the i8 operand, the largest immediate is 255, so this can be easily
3890 // encoded in the compare instruction. For the i16 operand, however, the
3891 // largest immediate cannot be encoded in the compare.
3892 // Therefore, use a sign extending load and cmn to avoid materializing the
3893 // -1 constant. For example,
3894 // movz w1, #65535
3895 // ldrh w0, [x0, #0]
3896 // cmp w0, w1
3897 // >
3898 // ldrsh w0, [x0, #0]
3899 // cmn w0, #1
3900    // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3901 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3902 // ensure both the LHS and RHS are truly zero extended and to make sure the
3903 // transformation is profitable.
3904 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3905 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3906 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3907 LHS.getNode()->hasNUsesOfValue(1, 0)) {
3908 int16_t ValueofRHS = RHS->getAsZExtVal();
3909 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3910 SDValue SExt =
3911 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3912 DAG.getValueType(MVT::i16));
3913 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3914 RHS.getValueType()),
3915 CC, dl, DAG);
3916 AArch64CC = changeIntCCToAArch64CC(CC);
3917 }
3918 }
3919
3920 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3921 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3922 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3923 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3924 }
3925 }
3926 }
3927
3928 if (!Cmp) {
3929 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3930 AArch64CC = changeIntCCToAArch64CC(CC);
3931 }
3932 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3933 return Cmp;
3934}
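// For illustration, the immediate adjustment in getAArch64Cmp turns
//   (setlt w0, 4097)   ; 4097 is not a legal arithmetic immediate
// into
//   (setle w0, 4096)   ; 4096 encodes as 1 shifted left by 12
// so a single "cmp w0, #4096" plus an "le" test can be used instead of first
// materializing 4097 in a register.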
3935
3936static std::pair<SDValue, SDValue>
3937getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3938  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3939 "Unsupported value type");
3940 SDValue Value, Overflow;
3941 SDLoc DL(Op);
3942 SDValue LHS = Op.getOperand(0);
3943 SDValue RHS = Op.getOperand(1);
3944 unsigned Opc = 0;
3945 switch (Op.getOpcode()) {
3946 default:
3947 llvm_unreachable("Unknown overflow instruction!");
3948 case ISD::SADDO:
3949 Opc = AArch64ISD::ADDS;
3950 CC = AArch64CC::VS;
3951 break;
3952 case ISD::UADDO:
3953 Opc = AArch64ISD::ADDS;
3954 CC = AArch64CC::HS;
3955 break;
3956 case ISD::SSUBO:
3957 Opc = AArch64ISD::SUBS;
3958 CC = AArch64CC::VS;
3959 break;
3960 case ISD::USUBO:
3961 Opc = AArch64ISD::SUBS;
3962 CC = AArch64CC::LO;
3963 break;
3964  // Multiply needs a little bit of extra work.
3965 case ISD::SMULO:
3966 case ISD::UMULO: {
3967 CC = AArch64CC::NE;
3968 bool IsSigned = Op.getOpcode() == ISD::SMULO;
3969 if (Op.getValueType() == MVT::i32) {
3970 // Extend to 64-bits, then perform a 64-bit multiply.
3971 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3972 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3973 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3974 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3975 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3976
3977 // Check that the result fits into a 32-bit integer.
3978 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3979 if (IsSigned) {
3980 // cmp xreg, wreg, sxtw
3981 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3982 Overflow =
3983 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3984 } else {
3985 // tst xreg, #0xffffffff00000000
3986 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3987 Overflow =
3988 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3989 }
3990 break;
3991 }
3992 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3993 // For the 64 bit multiply
3994 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3995 if (IsSigned) {
3996 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3997 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3998 DAG.getConstant(63, DL, MVT::i64));
3999 // It is important that LowerBits is last, otherwise the arithmetic
4000 // shift will not be folded into the compare (SUBS).
4001 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
4002 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4003 .getValue(1);
4004 } else {
4005 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4006 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
4007 Overflow =
4008 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4009 DAG.getConstant(0, DL, MVT::i64),
4010 UpperBits).getValue(1);
4011 }
4012 break;
4013 }
4014 } // switch (...)
4015
4016 if (Opc) {
4017 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
4018
4019 // Emit the AArch64 operation with overflow check.
4020 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4021 Overflow = Value.getValue(1);
4022 }
4023 return std::make_pair(Value, Overflow);
4024}
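// As a rough example, for a 32-bit llvm.smul.with.overflow the nodes built
// above typically select to something like (registers illustrative):
//   smull x8, w0, w1      ; 64-bit product of the sign-extended operands
//   cmp   x8, w8, sxtw    ; SUBS of the product and its sign-extended low half
//   cset  w9, ne          ; overflow bit (CC == AArch64CC::NE)
// with the low 32 bits of x8 used as the multiply result.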
4025
4026SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4027 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4028 !Subtarget->isNeonAvailable()))
4029 return LowerToScalableOp(Op, DAG);
4030
4031 SDValue Sel = Op.getOperand(0);
4032 SDValue Other = Op.getOperand(1);
4033 SDLoc dl(Sel);
4034
4035 // If the operand is an overflow checking operation, invert the condition
4036 // code and kill the Not operation. I.e., transform:
4037 // (xor (overflow_op_bool, 1))
4038 // -->
4039 // (csel 1, 0, invert(cc), overflow_op_bool)
4040 // ... which later gets transformed to just a cset instruction with an
4041 // inverted condition code, rather than a cset + eor sequence.
4042  if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
4043    // Only lower legal XALUO ops.
4044    if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
4045      return SDValue();
4046
4047 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4048 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4049    AArch64CC::CondCode CC;
4050    SDValue Value, Overflow;
4051 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4052 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4053 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
4054 CCVal, Overflow);
4055 }
4056 // If neither operand is a SELECT_CC, give up.
4057 if (Sel.getOpcode() != ISD::SELECT_CC)
4058 std::swap(Sel, Other);
4059 if (Sel.getOpcode() != ISD::SELECT_CC)
4060 return Op;
4061
4062 // The folding we want to perform is:
4063 // (xor x, (select_cc a, b, cc, 0, -1) )
4064 // -->
4065 // (csel x, (xor x, -1), cc ...)
4066 //
4067 // The latter will get matched to a CSINV instruction.
4068
4069 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4070 SDValue LHS = Sel.getOperand(0);
4071 SDValue RHS = Sel.getOperand(1);
4072 SDValue TVal = Sel.getOperand(2);
4073 SDValue FVal = Sel.getOperand(3);
4074
4075 // FIXME: This could be generalized to non-integer comparisons.
4076 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4077 return Op;
4078
4079 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4080 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4081
4082 // The values aren't constants, this isn't the pattern we're looking for.
4083 if (!CFVal || !CTVal)
4084 return Op;
4085
4086 // We can commute the SELECT_CC by inverting the condition. This
4087 // might be needed to make this fit into a CSINV pattern.
4088 if (CTVal->isAllOnes() && CFVal->isZero()) {
4089 std::swap(TVal, FVal);
4090 std::swap(CTVal, CFVal);
4091 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4092 }
4093
4094 // If the constants line up, perform the transform!
4095 if (CTVal->isZero() && CFVal->isAllOnes()) {
4096 SDValue CCVal;
4097 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
4098
4099 FVal = Other;
4100 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
4101 DAG.getConstant(-1ULL, dl, Other.getValueType()));
4102
4103 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
4104 CCVal, Cmp);
4105 }
4106
4107 return Op;
4108}
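// For illustration, the select_cc fold above takes a pattern like
//   %m = select (icmp eq i32 %a, %b), i32 0, i32 -1
//   %r = xor i32 %x, %m
// and rewrites it to (csel %x, (xor %x, -1), eq, (cmp %a, %b)), which the
// instruction selector can then match as a single CSINV.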
4109
4110// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4111// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4112// sets 'C' bit to 0.
4113static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
4114  SDLoc DL(Value);
4115 EVT VT = Value.getValueType();
4116 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4117 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4118 SDValue Cmp =
4119 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
4120 return Cmp.getValue(1);
4121}
4122
4123// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4124// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4125static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
4126                                bool Invert) {
4127 assert(Glue.getResNo() == 1);
4128 SDLoc DL(Glue);
4129 SDValue Zero = DAG.getConstant(0, DL, VT);
4130 SDValue One = DAG.getConstant(1, DL, VT);
4131 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4132 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
4133 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4134}
4135
4136// Value is 1 if 'V' bit of NZCV is 1, else 0
4137static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
4138  assert(Glue.getResNo() == 1);
4139 SDLoc DL(Glue);
4140 SDValue Zero = DAG.getConstant(0, DL, VT);
4141 SDValue One = DAG.getConstant(1, DL, VT);
4142 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
4143 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4144}
4145
4146// This lowering is inefficient, but it will get cleaned up by
4147// `foldOverflowCheck`
4148static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
4149                                  unsigned Opcode, bool IsSigned) {
4150 EVT VT0 = Op.getValue(0).getValueType();
4151 EVT VT1 = Op.getValue(1).getValueType();
4152
4153 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4154 return SDValue();
4155
4156 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4157 SDValue OpLHS = Op.getOperand(0);
4158 SDValue OpRHS = Op.getOperand(1);
4159 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4160
4161 SDLoc DL(Op);
4162 SDVTList VTs = DAG.getVTList(VT0, VT1);
4163
4164 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
4165 OpRHS, OpCarryIn);
4166
4167 SDValue OutFlag =
4168 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4169 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4170
4171 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
4172}
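// Schematically, an i32 ISD::UADDO_CARRY lowered this way might become
// (registers illustrative):
//   cmp  w2, #1        ; valueToCarryFlag: derive C from the incoming carry
//   adcs w8, w0, w1    ; ADCS consumes and regenerates the carry
//   cset w9, hs        ; carryFlagToValue: materialize the outgoing carry
// with foldOverflowCheck later removing redundant materialize/re-test pairs.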
4173
4174static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
4175  // Let legalize expand this if it isn't a legal type yet.
4176 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4177 return SDValue();
4178
4179 SDLoc dl(Op);
4180  AArch64CC::CondCode CC;
4181  // The actual operation that sets the overflow or carry flag.
4182 SDValue Value, Overflow;
4183 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4184
4185 // We use 0 and 1 as false and true values.
4186 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4187 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4188
4189 // We use an inverted condition, because the conditional select is inverted
4190 // too. This will allow it to be selected to a single instruction:
4191 // CSINC Wd, WZR, WZR, invert(cond).
4192 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4193 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
4194 CCVal, Overflow);
4195
4196 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4197 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4198}
4199
4200// Prefetch operands are:
4201// 1: Address to prefetch
4202// 2: bool isWrite
4203// 3: int locality (0 = no locality ... 3 = extreme locality)
4204// 4: bool isDataCache
4205static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4206  SDLoc DL(Op);
4207 unsigned IsWrite = Op.getConstantOperandVal(2);
4208 unsigned Locality = Op.getConstantOperandVal(3);
4209 unsigned IsData = Op.getConstantOperandVal(4);
4210
4211 bool IsStream = !Locality;
4212 // When the locality number is set
4213 if (Locality) {
4214 // The front-end should have filtered out the out-of-range values
4215 assert(Locality <= 3 && "Prefetch locality out-of-range");
4216 // The locality degree is the opposite of the cache speed.
4217 // Put the number the other way around.
4218 // The encoding starts at 0 for level 1
4219 Locality = 3 - Locality;
4220 }
4221
4222  // Build the mask value encoding the expected behavior.
4223 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4224 (!IsData << 3) | // IsDataCache bit
4225 (Locality << 1) | // Cache level bits
4226 (unsigned)IsStream; // Stream bit
4227 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4228 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4229 Op.getOperand(1));
4230}
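// Worked example: a read data prefetch with maximal locality, e.g.
// __builtin_prefetch(p, /*rw=*/0, /*locality=*/3), arrives as IsWrite = 0,
// Locality = 3 and IsData = 1. Locality becomes 3 - 3 = 0 and IsStream is 0,
// so PrfOp = (0 << 4) | (0 << 3) | (0 << 1) | 0 = 0, the PLDL1KEEP operand,
// and the node selects to "prfm pldl1keep, [x0]" (register illustrative).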
4231
4232SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4233 SelectionDAG &DAG) const {
4234 EVT VT = Op.getValueType();
4235 if (VT.isScalableVector())
4236 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4237
4238 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4239 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4240
4241 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4242 return SDValue();
4243}
4244
4245SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4246 SelectionDAG &DAG) const {
4247 EVT VT = Op.getValueType();
4248 if (VT.isScalableVector())
4249 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4250
4251 bool IsStrict = Op->isStrictFPOpcode();
4252 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4253 EVT SrcVT = SrcVal.getValueType();
4254 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4255
4256 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4257 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4258
4259 // Expand cases where the result type is BF16 but we don't have hardware
4260 // instructions to lower it.
4261 if (VT.getScalarType() == MVT::bf16 &&
4262 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4263 Subtarget->hasBF16())) {
4264 SDLoc dl(Op);
4265 SDValue Narrow = SrcVal;
4266 SDValue NaN;
4267 EVT I32 = SrcVT.changeElementType(MVT::i32);
4268 EVT F32 = SrcVT.changeElementType(MVT::f32);
4269 if (SrcVT.getScalarType() == MVT::f32) {
4270 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4271 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4272 if (!NeverSNaN) {
4273 // Set the quiet bit.
4274 NaN = DAG.getNode(ISD::OR, dl, I32, Narrow,
4275 DAG.getConstant(0x400000, dl, I32));
4276 }
4277 } else if (SrcVT.getScalarType() == MVT::f64) {
4278 Narrow = DAG.getNode(AArch64ISD::FCVTXN, dl, F32, Narrow);
4279 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4280 } else {
4281 return SDValue();
4282 }
4283 if (!Trunc) {
4284 SDValue One = DAG.getConstant(1, dl, I32);
4285 SDValue Lsb = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4286 DAG.getShiftAmountConstant(16, I32, dl));
4287 Lsb = DAG.getNode(ISD::AND, dl, I32, Lsb, One);
4288 SDValue RoundingBias =
4289 DAG.getNode(ISD::ADD, dl, I32, DAG.getConstant(0x7fff, dl, I32), Lsb);
4290 Narrow = DAG.getNode(ISD::ADD, dl, I32, Narrow, RoundingBias);
4291 }
4292
4293 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4294 // 0x80000000.
4295 if (NaN) {
4296 SDValue IsNaN = DAG.getSetCC(
4297 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4298 SrcVal, SrcVal, ISD::SETUO);
4299 Narrow = DAG.getSelect(dl, I32, IsNaN, NaN, Narrow);
4300 }
4301
4302 // Now that we have rounded, shift the bits into position.
4303 Narrow = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4304 DAG.getShiftAmountConstant(16, I32, dl));
4305 if (VT.isVector()) {
4306 EVT I16 = I32.changeVectorElementType(MVT::i16);
4307 Narrow = DAG.getNode(ISD::TRUNCATE, dl, I16, Narrow);
4308 return DAG.getNode(ISD::BITCAST, dl, VT, Narrow);
4309 }
4310 Narrow = DAG.getNode(ISD::BITCAST, dl, F32, Narrow);
4311 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
4312 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, dl)
4313 : Result;
4314 }
4315
4316 if (SrcVT != MVT::f128) {
4317 // Expand cases where the input is a vector bigger than NEON.
4318    if (useSVEForFixedLengthVectorVT(SrcVT))
4319      return SDValue();
4320
4321 // It's legal except when f128 is involved
4322 return Op;
4323 }
4324
4325 return SDValue();
4326}
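// Worked example for the bf16 software rounding above: an f32 input with bit
// pattern 0x3F808000 sits exactly halfway between two bf16 values. Bit 16 of
// the input (the bf16 lsb) is 0, so RoundingBias = 0x7FFF and the biased value
// 0x3F80FFFF shifts down to 0x3F80, keeping the even neighbour; had bit 16
// been 1, the bias 0x8000 would have rounded up to the next bf16 value.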
4327
4328SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4329 SelectionDAG &DAG) const {
4330 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4331 // Any additional optimization in this function should be recorded
4332 // in the cost tables.
4333 bool IsStrict = Op->isStrictFPOpcode();
4334 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4335 EVT VT = Op.getValueType();
4336
4337 if (VT.isScalableVector()) {
4338    unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4339                          ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4340                          : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4341    return LowerToPredicatedOp(Op, DAG, Opcode);
4342 }
4343
4344 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4345 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4346 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4347
4348 unsigned NumElts = InVT.getVectorNumElements();
4349
4350 // f16 conversions are promoted to f32 when full fp16 is not supported.
4351 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4352 InVT.getVectorElementType() == MVT::bf16) {
4353 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4354 SDLoc dl(Op);
4355 if (IsStrict) {
4356 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4357 {Op.getOperand(0), Op.getOperand(1)});
4358 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4359 {Ext.getValue(1), Ext.getValue(0)});
4360 }
4361 return DAG.getNode(
4362 Op.getOpcode(), dl, Op.getValueType(),
4363 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
4364 }
4365
4366 uint64_t VTSize = VT.getFixedSizeInBits();
4367 uint64_t InVTSize = InVT.getFixedSizeInBits();
4368 if (VTSize < InVTSize) {
4369 SDLoc dl(Op);
4370 if (IsStrict) {
4371      InVT = InVT.changeVectorElementTypeToInteger();
4372      SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4373 {Op.getOperand(0), Op.getOperand(1)});
4374 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4375 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
4376 }
4377 SDValue Cv =
4378 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
4379 Op.getOperand(0));
4380 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4381 }
4382
4383 if (VTSize > InVTSize) {
4384 SDLoc dl(Op);
4385    MVT ExtVT =
4386        MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
4387                         VT.getVectorNumElements());
4388 if (IsStrict) {
4389 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4390 {Op.getOperand(0), Op.getOperand(1)});
4391 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4392 {Ext.getValue(1), Ext.getValue(0)});
4393 }
4394 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
4395 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
4396 }
4397
4398 // Use a scalar operation for conversions between single-element vectors of
4399 // the same size.
4400 if (NumElts == 1) {
4401 SDLoc dl(Op);
4402    SDValue Extract = DAG.getNode(
4403        ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4404        Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4405 EVT ScalarVT = VT.getScalarType();
4406 if (IsStrict)
4407 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4408 {Op.getOperand(0), Extract});
4409 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4410 }
4411
4412 // Type changing conversions are illegal.
4413 return Op;
4414}
4415
4416SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4417 SelectionDAG &DAG) const {
4418 bool IsStrict = Op->isStrictFPOpcode();
4419 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4420
4421 if (SrcVal.getValueType().isVector())
4422 return LowerVectorFP_TO_INT(Op, DAG);
4423
4424 // f16 conversions are promoted to f32 when full fp16 is not supported.
4425 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4426 SrcVal.getValueType() == MVT::bf16) {
4427 SDLoc dl(Op);
4428 if (IsStrict) {
4429 SDValue Ext =
4430 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4431 {Op.getOperand(0), SrcVal});
4432 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4433 {Ext.getValue(1), Ext.getValue(0)});
4434 }
4435 return DAG.getNode(
4436 Op.getOpcode(), dl, Op.getValueType(),
4437 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4438 }
4439
4440 if (SrcVal.getValueType() != MVT::f128) {
4441 // It's legal except when f128 is involved
4442 return Op;
4443 }
4444
4445 return SDValue();
4446}
4447
4448SDValue
4449AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4450 SelectionDAG &DAG) const {
4451 // AArch64 FP-to-int conversions saturate to the destination element size, so
4452 // we can lower common saturating conversions to simple instructions.
4453 SDValue SrcVal = Op.getOperand(0);
4454 EVT SrcVT = SrcVal.getValueType();
4455 EVT DstVT = Op.getValueType();
4456 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4457
4458 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4459 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4460 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4461 assert(SatWidth <= DstElementWidth &&
4462 "Saturation width cannot exceed result width");
4463
4464 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4465 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4466 // types, so this is hard to reach.
4467 if (DstVT.isScalableVector())
4468 return SDValue();
4469
4470 EVT SrcElementVT = SrcVT.getVectorElementType();
4471
4472 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4473 if ((SrcElementVT == MVT::f16 &&
4474 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4475 SrcElementVT == MVT::bf16) {
4476 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4477 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
4478 SrcVT = F32VT;
4479 SrcElementVT = MVT::f32;
4480 SrcElementWidth = 32;
4481 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4482 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4483 return SDValue();
4484
4485 SDLoc DL(Op);
4486  // Expand to f64 if we are saturating to i64, to help keep the lanes the same
4487  // width and produce a fcvtzu.
4488 if (SatWidth == 64 && SrcElementWidth < 64) {
4489 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
4490 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
4491 SrcVT = F64VT;
4492 SrcElementVT = MVT::f64;
4493 SrcElementWidth = 64;
4494 }
4495 // Cases that we can emit directly.
4496 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
4497 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4498 DAG.getValueType(DstVT.getScalarType()));
4499
4500   // Otherwise emit a cvt that saturates at the wider source element width and
4501   // then saturate that result down to the requested width. This is only valid
4502   // if the legal cvt is at least as wide as the saturate width. For f64, as we
4503   // don't have vector MIN/MAX, it can be simpler to scalarize (until sqxtn is selected).
4504 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4505 return SDValue();
4506
4507 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4508 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4509 DAG.getValueType(IntVT.getScalarType()));
4510 SDValue Sat;
4511 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4512 SDValue MinC = DAG.getConstant(
4513 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4514 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4515 SDValue MaxC = DAG.getConstant(
4516 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4517 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4518 } else {
4519 SDValue MinC = DAG.getConstant(
4520 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4521 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4522 }
4523
4524 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4525}
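// Illustrative sketch (hypothetical helper, not part of the lowering above):
// the per-lane clamp that the SMIN/SMAX pair performs when saturating a wider
// signed conversion result down to SatWidth bits (assumes SatWidth < 64).
static constexpr int64_t clampToSignedWidth(int64_t V, unsigned SatWidth) {
  const int64_t Max = (int64_t(1) << (SatWidth - 1)) - 1; // e.g. 32767 for 16
  const int64_t Min = -Max - 1;                           // e.g. -32768 for 16
  return V > Max ? Max : (V < Min ? Min : V);
}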
4526
4527SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4528 SelectionDAG &DAG) const {
4529 // AArch64 FP-to-int conversions saturate to the destination register size, so
4530 // we can lower common saturating conversions to simple instructions.
4531 SDValue SrcVal = Op.getOperand(0);
4532 EVT SrcVT = SrcVal.getValueType();
4533
4534 if (SrcVT.isVector())
4535 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4536
4537 EVT DstVT = Op.getValueType();
4538 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4539 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4540 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4541 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4542
4543 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4544 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4545 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4546 SrcVT = MVT::f32;
4547 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4548 SrcVT != MVT::bf16)
4549 return SDValue();
4550
4551 SDLoc DL(Op);
4552 // Cases that we can emit directly.
4553 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4554 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4555 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4556 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4557 DAG.getValueType(DstVT));
4558
4559 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4560 // result. This is only valid if the legal cvt is larger than the saturate
4561 // width.
4562 if (DstWidth < SatWidth)
4563 return SDValue();
4564
4565 SDValue NativeCvt =
4566 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4567 SDValue Sat;
4568 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4569 SDValue MinC = DAG.getConstant(
4570 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4571 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4572 SDValue MaxC = DAG.getConstant(
4573 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4574 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4575 } else {
4576 SDValue MinC = DAG.getConstant(
4577 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4578 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4579 }
4580
4581 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4582}
4583
4584SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
4585 SelectionDAG &DAG) const {
4586 EVT VT = Op.getValueType();
4587 SDValue Src = Op.getOperand(0);
4588 SDLoc DL(Op);
4589
4590 assert(VT.isVector() && "Expected vector type");
4591
4592 EVT CastVT =
4593 VT.changeVectorElementType(Src.getValueType().getVectorElementType());
4594
4595 // Round the floating-point value into a floating-point register with the
4596 // current rounding mode.
4597 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
4598
4599 // Truncate the rounded floating point to an integer.
4600   return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
4601                      DAG.getValueType(VT.getVectorElementType()));
4602 }
4603
4604SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4605 SelectionDAG &DAG) const {
4606 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4607 // Any additional optimization in this function should be recorded
4608 // in the cost tables.
4609 bool IsStrict = Op->isStrictFPOpcode();
4610 EVT VT = Op.getValueType();
4611 SDLoc dl(Op);
4612 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
4613 EVT InVT = In.getValueType();
4614 unsigned Opc = Op.getOpcode();
4615 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4616
4617 if (VT.isScalableVector()) {
4618 if (InVT.getVectorElementType() == MVT::i1) {
4619 // We can't directly extend an SVE predicate; extend it first.
4620 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4621 EVT CastVT = getPromotedVTForPredicate(InVT);
4622 In = DAG.getNode(CastOpc, dl, CastVT, In);
4623 return DAG.getNode(Opc, dl, VT, In);
4624 }
4625
4626     unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4627                                : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
4628     return LowerToPredicatedOp(Op, DAG, Opcode);
4629 }
4630
4631 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4632 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4633 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4634
4635 // Promote bf16 conversions to f32.
4636 if (VT.getVectorElementType() == MVT::bf16) {
4637 EVT F32 = VT.changeElementType(MVT::f32);
4638 if (IsStrict) {
4639 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {F32, MVT::Other},
4640 {Op.getOperand(0), In});
4641 return DAG.getNode(
4642 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4643 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4644 }
4645 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4646 DAG.getNode(Op.getOpcode(), dl, F32, In),
4647 DAG.getIntPtrConstant(0, dl));
4648 }
4649
4650 uint64_t VTSize = VT.getFixedSizeInBits();
4651 uint64_t InVTSize = InVT.getFixedSizeInBits();
4652 if (VTSize < InVTSize) {
4653     MVT CastVT =
4654         MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
4655                          InVT.getVectorNumElements());
4656 if (IsStrict) {
4657 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4658 {Op.getOperand(0), In});
4659 return DAG.getNode(
4660 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4661 {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4662 }
4663 In = DAG.getNode(Opc, dl, CastVT, In);
4664 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
4665 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
4666 }
4667
4668 if (VTSize > InVTSize) {
4669     unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4670     EVT CastVT = VT.changeVectorElementTypeToInteger();
4671     In = DAG.getNode(CastOpc, dl, CastVT, In);
4672 if (IsStrict)
4673 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4674 return DAG.getNode(Opc, dl, VT, In);
4675 }
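  // Examples (illustrative): v2i64 -> v2f32 takes the narrowing path above
  // (convert to v2f64, then fp_round to v2f32), while v4i16 -> v4f32 takes the
  // widening path (sign/zero extend to v4i32, then convert).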
4676
4677 // Use a scalar operation for conversions between single-element vectors of
4678 // the same size.
4679 if (VT.getVectorNumElements() == 1) {
4680     SDValue Extract = DAG.getNode(
4681         ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4682         In, DAG.getConstant(0, dl, MVT::i64));
4683 EVT ScalarVT = VT.getScalarType();
4684 if (IsStrict)
4685 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4686 {Op.getOperand(0), Extract});
4687 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4688 }
4689
4690 return Op;
4691}
4692
4693SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4694 SelectionDAG &DAG) const {
4695 if (Op.getValueType().isVector())
4696 return LowerVectorINT_TO_FP(Op, DAG);
4697
4698 bool IsStrict = Op->isStrictFPOpcode();
4699 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4700
4701 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
4702 Op->getOpcode() == ISD::SINT_TO_FP;
4703
4704 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
4705 SDLoc dl(Op);
4706 if (IsStrict) {
4707 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
4708 {Op.getOperand(0), SrcVal});
4709 return DAG.getNode(
4710 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4711 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4712 }
4713 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4714 DAG.getNode(Op.getOpcode(), dl, PromoteVT, SrcVal),
4715 DAG.getIntPtrConstant(0, dl));
4716 };
4717
4718 if (Op.getValueType() == MVT::bf16) {
4719 unsigned MaxWidth = IsSigned
4720 ? DAG.ComputeMaxSignificantBits(SrcVal)
4721 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
4722     // bf16 conversions go via f32 when the source fits in f32's 24-bit significand.
4723 if (MaxWidth <= 24) {
4724 return IntToFpViaPromotion(MVT::f32);
4725 }
4726
4727     // bf16 conversions go via f64 when the source fits in f64's 53-bit significand.
4728 if (MaxWidth <= 53) {
4729 return IntToFpViaPromotion(MVT::f64);
4730 }
4731
4732     // We need to be careful about i64 -> bf16.
4733     // Consider the value 22216703.
4734     // This number cannot be represented exactly as an f32, so itofp will turn
4735     // it into 22216704.0; fptrunc to bf16 then turns this into 22282240.0.
4736     // However, the correct bf16 result is 22151168.0.
4737     // We need to use sticky rounding to get this correct.
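    // Worked example (illustrative): 22216703 lies between the bf16 values
    // 22151168 (169 * 2^17) and 22282240 (170 * 2^17), just below their
    // midpoint 22216704, so a single rounding gives 22151168. Rounding to f32
    // first lands exactly on 22216704 (a tie, resolved to the even
    // significand), and the second rounding then ties to even again, yielding
    // 22282240. Merging the low-order bits into a sticky bit before the first
    // rounding avoids this double-rounding error.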
4738 if (SrcVal.getValueType() == MVT::i64) {
4739 SDLoc DL(Op);
4740 // This algorithm is equivalent to the following:
4741 // uint64_t SrcHi = SrcVal & ~0xfffull;
4742 // uint64_t SrcLo = SrcVal & 0xfffull;
4743 // uint64_t Highest = SrcVal >> 53;
4744 // bool HasHighest = Highest != 0;
4745 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
4746 // double Rounded = static_cast<double>(ToRound);
4747 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
4748 // uint64_t HasLo = SrcLo != 0;
4749 // bool NeedsAdjustment = HasHighest & HasLo;
4750 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
4751 // double Adjusted = std::bit_cast<double>(AdjustedBits);
4752 // return static_cast<__bf16>(Adjusted);
4753 //
4754 // Essentially, what happens is that SrcVal either fits perfectly in a
4755 // double-precision value or it is too big. If it is sufficiently small,
4756 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
4757 // ensure that u64 -> double has no rounding error by only using the 52
4758 // MSB of the input. The low order bits will get merged into a sticky bit
4759 // which will avoid issues incurred by double rounding.
4760
4761 // Signed conversion is more or less like so:
4762 // copysign((__bf16)abs(SrcVal), SrcVal)
4763 SDValue SignBit;
4764 if (IsSigned) {
4765 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4766 DAG.getConstant(1ull << 63, DL, MVT::i64));
4767 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
4768 }
4769 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4770 DAG.getConstant(~0xfffull, DL, MVT::i64));
4771 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4772                                 DAG.getConstant(0xfffull, DL, MVT::i64));
4773       SDValue Highest =
4774           DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
4775 DAG.getShiftAmountConstant(53, MVT::i64, DL));
4776 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
4777 SDValue ToRound =
4778 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
4779 SDValue Rounded =
4780 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
4781 {Op.getOperand(0), ToRound})
4782 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
4783
4784 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
4785 if (SignBit) {
4786 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
4787 }
4788
4789 SDValue HasHighest = DAG.getSetCC(
4790 DL,
4791 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4792 Highest, Zero64, ISD::SETNE);
4793
4794 SDValue HasLo = DAG.getSetCC(
4795 DL,
4796 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4797 SrcLo, Zero64, ISD::SETNE);
4798
4799 SDValue NeedsAdjustment =
4800 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
4801 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
4802
4803 SDValue AdjustedBits =
4804 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
4805 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
4806       return IsStrict
4807                  ? DAG.getNode(ISD::STRICT_FP_ROUND, DL,
4808                                {Op.getValueType(), MVT::Other},
4809 {Rounded.getValue(1), Adjusted,
4810 DAG.getIntPtrConstant(0, DL)})
4811 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
4812 DAG.getIntPtrConstant(0, DL, true));
4813 }
4814 }
4815
4816 // f16 conversions are promoted to f32 when full fp16 is not supported.
4817 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4818 return IntToFpViaPromotion(MVT::f32);
4819 }
4820
4821 // i128 conversions are libcalls.
4822 if (SrcVal.getValueType() == MVT::i128)
4823 return SDValue();
4824
4825 // Other conversions are legal, unless it's to the completely software-based
4826 // fp128.
4827 if (Op.getValueType() != MVT::f128)
4828 return Op;
4829 return SDValue();
4830}
4831
4832SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4833 SelectionDAG &DAG) const {
4834 // For iOS, we want to call an alternative entry point: __sincos_stret,
4835 // which returns the values in two S / D registers.
4836 SDLoc dl(Op);
4837 SDValue Arg = Op.getOperand(0);
4838 EVT ArgVT = Arg.getValueType();
4839 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
4840
4842 ArgListEntry Entry;
4843
4844 Entry.Node = Arg;
4845 Entry.Ty = ArgTy;
4846 Entry.IsSExt = false;
4847 Entry.IsZExt = false;
4848 Args.push_back(Entry);
4849
4850 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4851 : RTLIB::SINCOS_STRET_F32;
4852 const char *LibcallName = getLibcallName(LC);
4853 SDValue Callee =
4854 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
4855
4856   StructType *RetTy = StructType::get(ArgTy, ArgTy);
4857   TargetLowering::CallLoweringInfo CLI(DAG);
4858   CLI.setDebugLoc(dl)
4859 .setChain(DAG.getEntryNode())
4860 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
4861
4862 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4863 return CallResult.first;
4864}
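// For reference (illustrative only, not verified against the Apple headers):
// the callee is declared roughly as
//   struct { float sinval, cosval; } __sincosf_stret(float);
//   struct { double sinval, cosval; } __sincos_stret(double);
// so both results come back in the first two S/D registers and no output
// pointers are needed.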
4865
4866static MVT getSVEContainerType(EVT ContentTy);
4867
4868SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4869 SelectionDAG &DAG) const {
4870 EVT OpVT = Op.getValueType();
4871 EVT ArgVT = Op.getOperand(0).getValueType();
4872
4873   if (useSVEForFixedLengthVectorVT(OpVT))
4874     return LowerFixedLengthBitcastToSVE(Op, DAG);
4875
4876 if (OpVT.isScalableVector()) {
4877 // Bitcasting between unpacked vector types of different element counts is
4878 // not a NOP because the live elements are laid out differently.
4879 // 01234567
4880 // e.g. nxv2i32 = XX??XX??
4881 // nxv4f16 = X?X?X?X?
4882 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4883 return SDValue();
4884
4885 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
4886 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4887 "Expected int->fp bitcast!");
4888       SDValue ExtResult =
4889           DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
4890                       Op.getOperand(0));
4891 return getSVESafeBitCast(OpVT, ExtResult, DAG);
4892 }
4893 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
4894 }
4895
4896 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4897 return SDValue();
4898
4899 // Bitcasts between f16 and bf16 are legal.
4900 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4901 return Op;
4902
4903 assert(ArgVT == MVT::i16);
4904 SDLoc DL(Op);
4905
4906 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4907 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
4908 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
4909}
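// For example (illustrative): "bitcast i16 %x to half" is lowered above by
// any-extending %x to i32, bitcasting that to f32 (a single fmov into an FPR),
// and then taking the hsub subregister of the result as the f16/bf16 value.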
4910
4911static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4912 if (OrigVT.getSizeInBits() >= 64)
4913 return OrigVT;
4914
4915 assert(OrigVT.isSimple() && "Expecting a simple value type");
4916
4917 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4918 switch (OrigSimpleTy) {
4919 default: llvm_unreachable("Unexpected Vector Type");
4920 case MVT::v2i8:
4921 case MVT::v2i16:
4922 return MVT::v2i32;
4923 case MVT::v4i8:
4924 return MVT::v4i16;
4925 }
4926}
4927
4928 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
4929                                                  const EVT &OrigTy,
4930 const EVT &ExtTy,
4931 unsigned ExtOpcode) {
4932   // The vector originally had a size of OrigTy. It was then extended to ExtTy.
4933   // We expect the ExtTy to be 128 bits total. If the OrigTy is less than
4934   // 64 bits, we need to insert a new extension so that it will be 64 bits.
4935 assert(ExtTy.is128BitVector() && "Unexpected extension size");
4936 if (OrigTy.getSizeInBits() >= 64)
4937 return N;
4938
4939 // Must extend size to at least 64 bits to be used as an operand for VMULL.
4940 EVT NewVT = getExtensionTo64Bits(OrigTy);
4941
4942 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
4943}
4944
4945// Returns lane if Op extracts from a two-element vector and lane is constant
4946// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
4947 static std::optional<uint64_t>
4948 getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
4949   SDNode *OpNode = Op.getNode();
4950 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4951 return std::nullopt;
4952
4953 EVT VT = OpNode->getOperand(0).getValueType();
4954 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
4955 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4956 return std::nullopt;
4957
4958 return C->getZExtValue();
4959}
4960
4961 static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
4962                                    bool isSigned) {
4963 EVT VT = N.getValueType();
4964
4965 if (N.getOpcode() != ISD::BUILD_VECTOR)
4966 return false;
4967
4968 for (const SDValue &Elt : N->op_values()) {
4969 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
4970 unsigned EltSize = VT.getScalarSizeInBits();
4971 unsigned HalfSize = EltSize / 2;
4972 if (isSigned) {
4973 if (!isIntN(HalfSize, C->getSExtValue()))
4974 return false;
4975 } else {
4976 if (!isUIntN(HalfSize, C->getZExtValue()))
4977 return false;
4978 }
4979 continue;
4980 }
4981 return false;
4982 }
4983
4984 return true;
4985}
4986
4987 static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
4988   EVT VT = N.getValueType();
4989 assert(VT.is128BitVector() && "Unexpected vector MULL size");
4990
4991 unsigned NumElts = VT.getVectorNumElements();
4992 unsigned OrigEltSize = VT.getScalarSizeInBits();
4993 unsigned EltSize = OrigEltSize / 2;
4994 MVT TruncVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
4995
4996 APInt HiBits = APInt::getHighBitsSet(OrigEltSize, EltSize);
4997 if (DAG.MaskedValueIsZero(N, HiBits))
4998 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, N);
4999
5000 if (ISD::isExtOpcode(N.getOpcode()))
5001 return addRequiredExtensionForVectorMULL(N.getOperand(0), DAG,
5002 N.getOperand(0).getValueType(), VT,
5003 N.getOpcode());
5004
5005 assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
5006 SDLoc dl(N);
5008 for (unsigned i = 0; i != NumElts; ++i) {
5009 const APInt &CInt = N.getConstantOperandAPInt(i);
5010 // Element types smaller than 32 bits are not legal, so use i32 elements.
5011 // The values are implicitly truncated so sext vs. zext doesn't matter.
5012 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
5013 }
5014 return DAG.getBuildVector(TruncVT, dl, Ops);
5015}
5016
5017 static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
5018   return N.getOpcode() == ISD::SIGN_EXTEND ||
5019 N.getOpcode() == ISD::ANY_EXTEND ||
5020 isExtendedBUILD_VECTOR(N, DAG, true);
5021}
5022
5023 static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
5024   return N.getOpcode() == ISD::ZERO_EXTEND ||
5025 N.getOpcode() == ISD::ANY_EXTEND ||
5026 isExtendedBUILD_VECTOR(N, DAG, false);
5027}
5028
5029 static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
5030   unsigned Opcode = N.getOpcode();
5031 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5032 SDValue N0 = N.getOperand(0);
5033 SDValue N1 = N.getOperand(1);
5034 return N0->hasOneUse() && N1->hasOneUse() &&
5035 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5036 }
5037 return false;
5038}
5039
5040 static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
5041   unsigned Opcode = N.getOpcode();
5042 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5043 SDValue N0 = N.getOperand(0);
5044 SDValue N1 = N.getOperand(1);
5045 return N0->hasOneUse() && N1->hasOneUse() &&
5046 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5047 }
5048 return false;
5049}
5050
5051SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5052 SelectionDAG &DAG) const {
5053   // The rounding mode is in bits 23:22 of the FPCR.
5054   // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
5055   // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
5056   // so that the shift and the AND get folded into a bitfield extract.
5057 SDLoc dl(Op);
5058
5059 SDValue Chain = Op.getOperand(0);
5060 SDValue FPCR_64 = DAG.getNode(
5061 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
5062 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
5063 Chain = FPCR_64.getValue(1);
5064 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
5065 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
5066 DAG.getConstant(1U << 22, dl, MVT::i32));
5067 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
5068 DAG.getConstant(22, dl, MVT::i32));
5069 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
5070 DAG.getConstant(3, dl, MVT::i32));
5071 return DAG.getMergeValues({AND, Chain}, dl);
5072}
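// Illustrative sketch (hypothetical helper, not part of the lowering): the
// scalar computation that LowerGET_ROUNDING builds out of DAG nodes above,
// given a raw FPCR value.
static constexpr unsigned fpcrToFltRounds(uint32_t FPCR) {
  // ((FPCR + (1 << 22)) >> 22) & 3 maps RMode 0,1,2,3 to FLT_ROUNDS 1,2,3,0.
  return ((FPCR + (1u << 22)) >> 22) & 3;
}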
5073
5074SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5075 SelectionDAG &DAG) const {
5076 SDLoc DL(Op);
5077 SDValue Chain = Op->getOperand(0);
5078 SDValue RMValue = Op->getOperand(1);
5079
5080 // The rounding mode is in bits 23:22 of the FPCR.
5081   // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
5082   // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5083   // (((arg - 1) & 3) << 22).
5084   //
5085   // The argument of llvm.set.rounding must be within the segment [0, 3], so
5086   // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5087   // code that generates llvm.set.rounding to ensure this condition.
5088
5089 // Calculate new value of FPCR[23:22].
5090 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5091 DAG.getConstant(1, DL, MVT::i32));
5092 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5093 DAG.getConstant(0x3, DL, MVT::i32));
5094 RMValue =
5095 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5096 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5097 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5098
5099 // Get current value of FPCR.
5100 SDValue Ops[] = {
5101 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5102 SDValue FPCR =
5103 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5104 Chain = FPCR.getValue(1);
5105 FPCR = FPCR.getValue(0);
5106
5107   // Put the new rounding mode into FPCR[23:22].
5108 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5109 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5110 DAG.getConstant(RMMask, DL, MVT::i64));
5111 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5112 SDValue Ops2[] = {
5113 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5114 FPCR};
5115 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5116}
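// Illustrative sketch (hypothetical helper, not part of the lowering): the
// scalar form of the FPCR update that LowerSET_ROUNDING emits as DAG nodes,
// with 22 standing in for AArch64::RoundingBitsPos.
static constexpr uint64_t fpcrWithRoundingMode(uint64_t FPCR, unsigned Arg) {
  // llvm.set.rounding argument 0,1,2,3 maps to FPCR[23:22] = 3,0,1,2.
  return (FPCR & ~(3ull << 22)) | (uint64_t((Arg - 1) & 3) << 22);
}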
5117
5118SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5119 SelectionDAG &DAG) const {
5120 SDLoc DL(Op);
5121 SDValue Chain = Op->getOperand(0);
5122
5123 // Get current value of FPCR.
5124 SDValue Ops[] = {
5125 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5126 SDValue FPCR =
5127 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5128 Chain = FPCR.getValue(1);
5129 FPCR = FPCR.getValue(0);
5130
5131 // Truncate FPCR to 32 bits.
5132 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5133
5134 return DAG.getMergeValues({Result, Chain}, DL);
5135}
5136
5137SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5138 SelectionDAG &DAG) const {
5139 SDLoc DL(Op);
5140 SDValue Chain = Op->getOperand(0);
5141 SDValue Mode = Op->getOperand(1);
5142
5143 // Extend the specified value to 64 bits.
5144 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5145
5146 // Set new value of FPCR.
5147 SDValue Ops2[] = {
5148 Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
5149 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5150}
5151
5152SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5153 SelectionDAG &DAG) const {
5154 SDLoc DL(Op);
5155 SDValue Chain = Op->getOperand(0);
5156
5157 // Get current value of FPCR.
5158 SDValue Ops[] = {
5159 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5160 SDValue FPCR =
5161 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5162 Chain = FPCR.getValue(1);
5163 FPCR = FPCR.getValue(0);
5164
5165 // Clear bits that are not reserved.
5166 SDValue FPSCRMasked = DAG.getNode(
5167       ISD::AND, DL, MVT::i64, FPCR,
5168       DAG.getConstant(AArch64::ReservedFPControlBits, DL, MVT::i64));
5169
5170 // Set new value of FPCR.
5171 SDValue Ops2[] = {Chain,
5172 DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5173 FPSCRMasked};
5174 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5175}
5176
5177static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5178 SDLoc DL, bool &IsMLA) {
5179 bool IsN0SExt = isSignExtended(N0, DAG);
5180 bool IsN1SExt = isSignExtended(N1, DAG);
5181 if (IsN0SExt && IsN1SExt)
5182 return AArch64ISD::SMULL;
5183
5184 bool IsN0ZExt = isZeroExtended(N0, DAG);
5185 bool IsN1ZExt = isZeroExtended(N1, DAG);
5186
5187 if (IsN0ZExt && IsN1ZExt)
5188 return AArch64ISD::UMULL;
5189
5190 // Select SMULL if we can replace zext with sext.
5191 if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
5192 !isExtendedBUILD_VECTOR(N0, DAG, false) &&
5193 !isExtendedBUILD_VECTOR(N1, DAG, false)) {
5194 SDValue ZextOperand;
5195 if (IsN0ZExt)
5196 ZextOperand = N0.getOperand(0);
5197 else
5198 ZextOperand = N1.getOperand(0);
5199 if (DAG.SignBitIsZero(ZextOperand)) {
5200 SDValue NewSext =
5201 DAG.getSExtOrTrunc(ZextOperand, DL, N0.getValueType());
5202 if (IsN0ZExt)
5203 N0 = NewSext;
5204 else
5205 N1 = NewSext;
5206 return AArch64ISD::SMULL;
5207 }
5208 }
5209
5210 // Select UMULL if we can replace the other operand with an extend.
5211 if (IsN0ZExt || IsN1ZExt) {
5212     EVT VT = N0.getValueType();
5213     APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(),
5214                                        VT.getScalarSizeInBits() / 2);
5215 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5216 return AArch64ISD::UMULL;
5217 }
5218
5219 if (!IsN1SExt && !IsN1ZExt)
5220 return 0;
5221
5222 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5223 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5224 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5225 IsMLA = true;
5226 return AArch64ISD::SMULL;
5227 }
5228 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5229 IsMLA = true;
5230 return AArch64ISD::UMULL;
5231 }
5232 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5233 std::swap(N0, N1);
5234 IsMLA = true;
5235 return AArch64ISD::UMULL;
5236 }
5237 return 0;
5238}
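// Example (illustrative): for
//   %a = sext <2 x i32> %x to <2 x i64>
//   %b = sext <2 x i32> %y to <2 x i64>
//   %m = mul <2 x i64> %a, %b
// both operands are sign extended, so this helper returns AArch64ISD::SMULL
// and LowerMUL below can emit a single "smull v0.2d, v1.2s, v2.2s" instead of
// expanding the illegal v2i64 multiply.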
5239
5240SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5241 EVT VT = Op.getValueType();
5242
5243 bool OverrideNEON = !Subtarget->isNeonAvailable();
5244 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5245 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5246
5247 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5248 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5249 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5250 "unexpected type for custom-lowering ISD::MUL");
5251 SDValue N0 = Op.getOperand(0);
5252 SDValue N1 = Op.getOperand(1);
5253 bool isMLA = false;
5254 EVT OVT = VT;
5255 if (VT.is64BitVector()) {
5256 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5257         isNullConstant(N0.getOperand(1)) &&
5258         N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5259         isNullConstant(N1.getOperand(1))) {
5260 N0 = N0.getOperand(0);
5261 N1 = N1.getOperand(0);
5262 VT = N0.getValueType();
5263 } else {
5264 if (VT == MVT::v1i64) {
5265 if (Subtarget->hasSVE())
5266 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5267 // Fall through to expand this. It is not legal.
5268 return SDValue();
5269 } else
5270 // Other vector multiplications are legal.
5271 return Op;
5272 }
5273 }
5274
5275 SDLoc DL(Op);
5276 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5277
5278 if (!NewOpc) {
5279 if (VT.getVectorElementType() == MVT::i64) {
5280 // If SVE is available then i64 vector multiplications can also be made
5281 // legal.
5282 if (Subtarget->hasSVE())
5283 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5284 // Fall through to expand this. It is not legal.
5285 return SDValue();
5286 } else
5287 // Other vector multiplications are legal.
5288 return Op;
5289 }
5290
5291 // Legalize to a S/UMULL instruction
5292 SDValue Op0;
5293 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5294 if (!isMLA) {
5295     Op0 = skipExtensionForVectorMULL(N0, DAG);
5296     assert(Op0.getValueType().is64BitVector() &&
5297            Op1.getValueType().is64BitVector() &&
5298 "unexpected types for extended operands to VMULL");
5299 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5300 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5301 DAG.getConstant(0, DL, MVT::i64));
5302 }
5303   // Optimize (zext A + zext B) * C into (S/UMULL A, C) + (S/UMULL B, C) during
5304   // isel lowering to take advantage of the no-stall back-to-back s/umul + s/umla.
5305   // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57.
5306   SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5307   SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5308 EVT Op1VT = Op1.getValueType();
5309   return DAG.getNode(
5310       ISD::EXTRACT_SUBVECTOR, DL, OVT,
5311       DAG.getNode(N0.getOpcode(), DL, VT,
5312 DAG.getNode(NewOpc, DL, VT,
5313 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5314 DAG.getNode(NewOpc, DL, VT,
5315 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5316 DAG.getConstant(0, DL, MVT::i64));
5317}
5318
5319static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5320 int Pattern) {
5321 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5322 return DAG.getConstant(1, DL, MVT::nxv1i1);
5323 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5324 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5325}
5326
5327 static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG,
5328                                          bool IsSigned, bool IsEqual) {
5329 if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
5330 !isa<ConstantSDNode>(Op.getOperand(2)))
5331 return SDValue();
5332
5333 SDLoc dl(Op);
5334 APInt X = Op.getConstantOperandAPInt(1);
5335 APInt Y = Op.getConstantOperandAPInt(2);
5336 bool Overflow;
5337 APInt NumActiveElems =
5338 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5339
5340 if (Overflow)
5341 return SDValue();
5342
5343 if (IsEqual) {
5344 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5345 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5346 : NumActiveElems.uadd_ov(One, Overflow);
5347 if (Overflow)
5348 return SDValue();
5349 }
5350
5351   std::optional<unsigned> PredPattern =
5352       getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5353   unsigned MinSVEVectorSize = std::max(
5354       DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
5355   unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5356 if (PredPattern != std::nullopt &&
5357 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5358 return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
5359
5360 return SDValue();
5361}
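// Example (illustrative): whilelo(i64 0, i64 4) producing an nxv4i1 predicate
// has exactly four active lanes. Four 32-bit elements always fit in the
// minimum 128-bit SVE vector, so the intrinsic can be lowered to
// "ptrue p0.s, vl4" instead of a whilelo instruction.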
5362
5363// Returns a safe bitcast between two scalable vector predicates, where
5364 // any newly created lanes from a widening bitcast are defined as zero.
5365 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5366   SDLoc DL(Op);
5367 EVT InVT = Op.getValueType();
5368
5369 assert(InVT.getVectorElementType() == MVT::i1 &&
5370 VT.getVectorElementType() == MVT::i1 &&
5371 "Expected a predicate-to-predicate bitcast");
5373 InVT.isScalableVector() &&
5374 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5375 "Only expect to cast between legal scalable predicate types!");
5376
5377 // Return the operand if the cast isn't changing type,
5378 // e.g. <n x 16 x i1> -> <n x 16 x i1>
5379 if (InVT == VT)
5380 return Op;
5381
5382 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5383
5384 // We only have to zero the lanes if new lanes are being defined, e.g. when
5385 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5386 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5387 // we can return here.
5388 if (InVT.bitsGT(VT))
5389 return Reinterpret;
5390
5391 // Check if the other lanes are already known to be zeroed by
5392   // construction.
5393   if (isZeroingInactiveLanes(Op))
5394     return Reinterpret;
5395
5396 // Zero the newly introduced lanes.
5397 SDValue Mask = DAG.getConstant(1, DL, InVT);
5398 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5399 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5400}
5401
5402SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5403 SDValue Chain, SDLoc DL,
5404 EVT VT) const {
5405   SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
5406                                          getPointerTy(DAG.getDataLayout()));
5407   Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5408   Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5409   TargetLowering::CallLoweringInfo CLI(DAG);
5410   ArgListTy Args;
5411   CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5412       CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
5413       RetTy, Callee, std::move(Args));
5414 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5415 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5416 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5417 Mask);
5418}
5419
5420// Lower an SME LDR/STR ZA intrinsic
5421// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5422// folded into the instruction
5423// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5424// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5425// and tile slice registers
5426// ldr(%tileslice, %ptr, %vecnum)
5427// ->
5428// %svl = rdsvl
5429// %ptr2 = %ptr + %svl * %vecnum
5430// %tileslice2 = %tileslice + %vecnum
5431// ldr [%tileslice2, 0], [%ptr2, 0]
5432 // Case 3: If the vecnum is an immediate out of range, then the same is done as
5433 // case 2, but the base and slice registers are modified by the greatest
5434 // multiple of 16 not exceeding the vecnum, and the remainder (0-15) is folded
5435 // into the instruction. This means that successive loads and stores that are
5436 // offset from each other can share the same base and slice register updates.
5437 // ldr(%tileslice, %ptr, 22)
5438 // ldr(%tileslice, %ptr, 23)
5439 // ->
5440 // %svl = rdsvl
5441 // %ptr2 = %ptr + %svl * 16
5442 // %tileslice2 = %tileslice + 16
5443 // ldr [%tileslice2, 6], [%ptr2, 6]
5444 // ldr [%tileslice2, 7], [%ptr2, 7]
5445// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5446// operand and the immediate can be folded into the instruction, like case 2.
5447// ldr(%tileslice, %ptr, %vecnum + 7)
5448// ldr(%tileslice, %ptr, %vecnum + 8)
5449// ->
5450// %svl = rdsvl
5451// %ptr2 = %ptr + %svl * %vecnum
5452// %tileslice2 = %tileslice + %vecnum
5453// ldr [%tileslice2, 7], [%ptr2, 7]
5454// ldr [%tileslice2, 8], [%ptr2, 8]
5455// Case 5: The vecnum being an add of an immediate out of range is also handled,
5456
5457 static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5458   SDLoc DL(N);
5459
5460 SDValue TileSlice = N->getOperand(2);
5461 SDValue Base = N->getOperand(3);
5462 SDValue VecNum = N->getOperand(4);
5463 int32_t ConstAddend = 0;
5464 SDValue VarAddend = VecNum;
5465
5466 // If the vnum is an add of an immediate, we can fold it into the instruction
5467 if (VecNum.getOpcode() == ISD::ADD &&
5468 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5469 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5470 VarAddend = VecNum.getOperand(0);
5471 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5472 ConstAddend = ImmNode->getSExtValue();
5473 VarAddend = SDValue();
5474 }
5475
5476 int32_t ImmAddend = ConstAddend % 16;
5477 if (int32_t C = (ConstAddend - ImmAddend)) {
5478 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5479 VarAddend = VarAddend
5480 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5481 : CVal;
5482 }
5483
5484 if (VarAddend) {
5485 // Get the vector length that will be multiplied by vnum
5486 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5487 DAG.getConstant(1, DL, MVT::i32));
5488
5489 // Multiply SVL and vnum then add it to the base
5490 SDValue Mul = DAG.getNode(
5491 ISD::MUL, DL, MVT::i64,
5492 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5493 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5494 // Just add vnum to the tileslice
5495 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5496 }
5497
5499 DL, MVT::Other,
5500 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5501 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5502}
5503
5504SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5505 SelectionDAG &DAG) const {
5506 unsigned IntNo = Op.getConstantOperandVal(1);
5507 SDLoc DL(Op);
5508 switch (IntNo) {
5509 default:
5510 return SDValue(); // Don't custom lower most intrinsics.
5511 case Intrinsic::aarch64_prefetch: {
5512 SDValue Chain = Op.getOperand(0);
5513 SDValue Addr = Op.getOperand(2);
5514
5515 unsigned IsWrite = Op.getConstantOperandVal(3);
5516 unsigned Locality = Op.getConstantOperandVal(4);
5517 unsigned IsStream = Op.getConstantOperandVal(5);
5518 unsigned IsData = Op.getConstantOperandVal(6);
5519 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5520 (!IsData << 3) | // IsDataCache bit
5521 (Locality << 1) | // Cache level bits
5522 (unsigned)IsStream; // Stream bit
5523
5524 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5525 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5526 }
5527 case Intrinsic::aarch64_sme_str:
5528 case Intrinsic::aarch64_sme_ldr: {
5529 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5530 }
5531 case Intrinsic::aarch64_sme_za_enable:
5532 return DAG.getNode(
5533 AArch64ISD::SMSTART, DL, MVT::Other,
5534 Op->getOperand(0), // Chain
5535 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5536 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5537 case Intrinsic::aarch64_sme_za_disable:
5538 return DAG.getNode(
5539 AArch64ISD::SMSTOP, DL, MVT::Other,
5540 Op->getOperand(0), // Chain
5541 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5542 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5543 }
5544}
5545
5546SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5547 SelectionDAG &DAG) const {
5548 unsigned IntNo = Op.getConstantOperandVal(1);
5549 SDLoc DL(Op);
5550 switch (IntNo) {
5551 default:
5552 return SDValue(); // Don't custom lower most intrinsics.
5553 case Intrinsic::aarch64_mops_memset_tag: {
5554 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
5555 SDValue Chain = Node->getChain();
5556 SDValue Dst = Op.getOperand(2);
5557 SDValue Val = Op.getOperand(3);
5558 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5559 SDValue Size = Op.getOperand(4);
5560 auto Alignment = Node->getMemOperand()->getAlign();
5561 bool IsVol = Node->isVolatile();
5562 auto DstPtrInfo = Node->getPointerInfo();
5563
5564 const auto &SDI =
5565 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5566 SDValue MS =
5567 SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
5568 Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
5569
5570 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
5571 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
5572 // LowerOperationWrapper will complain that the number of results has
5573 // changed.
5574 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
5575 }
5576 }
5577}
5578
5579SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5580 SelectionDAG &DAG) const {
5581 unsigned IntNo = Op.getConstantOperandVal(0);
5582 SDLoc dl(Op);
5583 switch (IntNo) {
5584 default: return SDValue(); // Don't custom lower most intrinsics.
5585 case Intrinsic::thread_pointer: {
5586 EVT PtrVT = getPointerTy(DAG.getDataLayout());
5587 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
5588 }
5589 case Intrinsic::aarch64_neon_abs: {
5590 EVT Ty = Op.getValueType();
5591 if (Ty == MVT::i64) {
5592 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
5593 Op.getOperand(1));
5594 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
5595 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
5596 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
5597 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
5598 } else {
5599       report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
5600 }
5601 }
5602 case Intrinsic::aarch64_neon_pmull64: {
5603 SDValue LHS = Op.getOperand(1);
5604 SDValue RHS = Op.getOperand(2);
5605
5606     std::optional<uint64_t> LHSLane =
5607         getConstantLaneNumOfExtractHalfOperand(LHS);
5608     std::optional<uint64_t> RHSLane =
5609         getConstantLaneNumOfExtractHalfOperand(RHS);
5610
5611 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
5612 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
5613
5614     // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2
5615     // instructions execute on SIMD registers, so canonicalize i64 to v1i64,
5616     // which ISel recognizes better. For example, generate an ldr into d*
5617     // registers as opposed to a GPR load followed by an fmov.
5618 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
5619 std::optional<uint64_t> OtherLane,
5620 const SDLoc &dl,
5621 SelectionDAG &DAG) -> SDValue {
5622       // If the operand is a higher half itself, rewrite it to
5623       // extract_high_v2i64; this way aarch64_neon_pmull64 can
5624       // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
5625 if (NLane && *NLane == 1)
5626 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5627 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
5628
5629 // Operand N is not a higher half but the other operand is.
5630 if (OtherLane && *OtherLane == 1) {
5631 // If this operand is a lower half, rewrite it to
5632 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
5633 // align lanes of two operands. A roundtrip sequence (to move from lane
5634 // 1 to lane 0) is like this:
5635 // mov x8, v0.d[1]
5636 // fmov d0, x8
5637 if (NLane && *NLane == 0)
5638 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5639 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
5640 N.getOperand(0),
5641 DAG.getConstant(0, dl, MVT::i64)),
5642 DAG.getConstant(1, dl, MVT::i64));
5643
5644 // Otherwise just dup from main to all lanes.
5645 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
5646 }
5647
5648       // Neither operand is an extract of the higher half, so codegen may just
5649       // use the non-high version of the PMULL instruction. Use v1i64 to represent i64.
5650 assert(N.getValueType() == MVT::i64 &&
5651 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
5652 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
5653 };
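    // Example (illustrative): when both arguments are extractelt(<2 x i64>, 1),
    // the rewrites above let instruction selection emit a single
    // "pmull2 v0.1q, v1.2d, v2.2d" instead of moving each high lane through a
    // GPR and back into a SIMD register.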
5654
5655 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
5656 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
5657
5658 return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
5659 }
5660 case Intrinsic::aarch64_neon_smax:
5661 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
5662 Op.getOperand(1), Op.getOperand(2));
5663 case Intrinsic::aarch64_neon_umax:
5664 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
5665 Op.getOperand(1), Op.getOperand(2));
5666 case Intrinsic::aarch64_neon_smin:
5667 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
5668 Op.getOperand(1), Op.getOperand(2));
5669 case Intrinsic::aarch64_neon_umin:
5670 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
5671 Op.getOperand(1), Op.getOperand(2));
5672 case Intrinsic::aarch64_neon_scalar_sqxtn:
5673 case Intrinsic::aarch64_neon_scalar_sqxtun:
5674 case Intrinsic::aarch64_neon_scalar_uqxtn: {
5675 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
5676 if (Op.getValueType() == MVT::i32)
5677 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
5678 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
5679 Op.getOperand(0),
5680 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
5681 Op.getOperand(1))));
5682 return SDValue();
5683 }
5684 case Intrinsic::aarch64_sve_whilelo:
5685 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5686 /*IsEqual=*/false);
5687 case Intrinsic::aarch64_sve_whilelt:
5688 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5689 /*IsEqual=*/false);
5690 case Intrinsic::aarch64_sve_whilels:
5691 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5692 /*IsEqual=*/true);
5693 case Intrinsic::aarch64_sve_whilele:
5694 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5695 /*IsEqual=*/true);
5696 case Intrinsic::aarch64_sve_sunpkhi:
5697 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
5698 Op.getOperand(1));
5699 case Intrinsic::aarch64_sve_sunpklo:
5700 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
5701 Op.getOperand(1));
5702 case Intrinsic::aarch64_sve_uunpkhi:
5703 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
5704 Op.getOperand(1));
5705 case Intrinsic::aarch64_sve_uunpklo:
5706 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
5707 Op.getOperand(1));
5708 case Intrinsic::aarch64_sve_clasta_n:
5709 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
5710 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5711 case Intrinsic::aarch64_sve_clastb_n:
5712 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
5713 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5714 case Intrinsic::aarch64_sve_lasta:
5715 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
5716 Op.getOperand(1), Op.getOperand(2));
5717 case Intrinsic::aarch64_sve_lastb:
5718 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
5719 Op.getOperand(1), Op.getOperand(2));
5720 case Intrinsic::aarch64_sve_rev:
5721 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
5722 Op.getOperand(1));
5723 case Intrinsic::aarch64_sve_tbl:
5724 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
5725 Op.getOperand(1), Op.getOperand(2));
5726 case Intrinsic::aarch64_sve_trn1:
5727 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
5728 Op.getOperand(1), Op.getOperand(2));
5729 case Intrinsic::aarch64_sve_trn2:
5730 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
5731 Op.getOperand(1), Op.getOperand(2));
5732 case Intrinsic::aarch64_sve_uzp1:
5733 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
5734 Op.getOperand(1), Op.getOperand(2));
5735 case Intrinsic::aarch64_sve_uzp2:
5736 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
5737 Op.getOperand(1), Op.getOperand(2));
5738 case Intrinsic::aarch64_sve_zip1:
5739 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
5740 Op.getOperand(1), Op.getOperand(2));
5741 case Intrinsic::aarch64_sve_zip2:
5742 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
5743 Op.getOperand(1), Op.getOperand(2));
5744 case Intrinsic::aarch64_sve_splice:
5745 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
5746 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5747 case Intrinsic::aarch64_sve_ptrue:
5748 return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
5749 case Intrinsic::aarch64_sve_clz:
5750 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
5751 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5752 case Intrinsic::aarch64_sme_cntsb:
5753 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5754 DAG.getConstant(1, dl, MVT::i32));
5755 case Intrinsic::aarch64_sme_cntsh: {
5756 SDValue One = DAG.getConstant(1, dl, MVT::i32);
5757 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
5758 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
5759 }
5760 case Intrinsic::aarch64_sme_cntsw: {
5761 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5762 DAG.getConstant(1, dl, MVT::i32));
5763 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5764 DAG.getConstant(2, dl, MVT::i32));
5765 }
5766 case Intrinsic::aarch64_sme_cntsd: {
5767 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5768 DAG.getConstant(1, dl, MVT::i32));
5769 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5770 DAG.getConstant(3, dl, MVT::i32));
5771 }
5772 case Intrinsic::aarch64_sve_cnt: {
5773 SDValue Data = Op.getOperand(3);
5774 // CTPOP only supports integer operands.
5775 if (Data.getValueType().isFloatingPoint())
5776 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
5777 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
5778 Op.getOperand(2), Data, Op.getOperand(1));
5779 }
5780 case Intrinsic::aarch64_sve_dupq_lane:
5781 return LowerDUPQLane(Op, DAG);
5782 case Intrinsic::aarch64_sve_convert_from_svbool:
5783 if (Op.getValueType() == MVT::aarch64svcount)
5784 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
5785 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
5786 case Intrinsic::aarch64_sve_convert_to_svbool:
5787 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
5788 return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
5789 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
5790 case Intrinsic::aarch64_sve_fneg:
5791 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5792 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5793 case Intrinsic::aarch64_sve_frintp:
5794 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
5795 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5796 case Intrinsic::aarch64_sve_frintm:
5797 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
5798 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5799 case Intrinsic::aarch64_sve_frinti:
5800 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5801 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5802 case Intrinsic::aarch64_sve_frintx:
5803 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5804 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5805 case Intrinsic::aarch64_sve_frinta:
5806 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
5807 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5808 case Intrinsic::aarch64_sve_frintn:
5809 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
5810 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5811 case Intrinsic::aarch64_sve_frintz:
5812 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
5813 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5814   case Intrinsic::aarch64_sve_ucvtf:
5815     return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
5816                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5817 Op.getOperand(1));
5818   case Intrinsic::aarch64_sve_scvtf:
5819     return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
5820                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5821 Op.getOperand(1));
5822   case Intrinsic::aarch64_sve_fcvtzu:
5823     return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
5824                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5825 Op.getOperand(1));
5826   case Intrinsic::aarch64_sve_fcvtzs:
5827     return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
5828                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5829 Op.getOperand(1));
5830 case Intrinsic::aarch64_sve_fsqrt:
5831 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
5832 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5833 case Intrinsic::aarch64_sve_frecpx:
5834 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
5835 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5836 case Intrinsic::aarch64_sve_frecpe_x:
5837 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
5838 Op.getOperand(1));
5839 case Intrinsic::aarch64_sve_frecps_x:
5840 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
5841 Op.getOperand(1), Op.getOperand(2));
5842 case Intrinsic::aarch64_sve_frsqrte_x:
5843 return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
5844 Op.getOperand(1));
5845 case Intrinsic::aarch64_sve_frsqrts_x:
5846 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
5847 Op.getOperand(1), Op.getOperand(2));
5848 case Intrinsic::aarch64_sve_fabs:
5849 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5850 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5851 case Intrinsic::aarch64_sve_abs:
5852 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5853 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5854 case Intrinsic::aarch64_sve_neg:
5855 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5856 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5857 case Intrinsic::aarch64_sve_insr: {
5858 SDValue Scalar = Op.getOperand(2);
5859 EVT ScalarTy = Scalar.getValueType();
5860 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
5861 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
5862
5863 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
5864 Op.getOperand(1), Scalar);
5865 }
5866   case Intrinsic::aarch64_sve_rbit:
5867     return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
5868                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5869 Op.getOperand(1));
5870 case Intrinsic::aarch64_sve_revb:
5871 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
5872 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5873 case Intrinsic::aarch64_sve_revh:
5874 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
5875 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5876 case Intrinsic::aarch64_sve_revw:
5877 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
5878 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5879 case Intrinsic::aarch64_sve_revd:
5880 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
5881 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5882 case Intrinsic::aarch64_sve_sxtb:
5883     return DAG.getNode(
5884         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5885         Op.getOperand(2), Op.getOperand(3),
5886 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5887 Op.getOperand(1));
5888 case Intrinsic::aarch64_sve_sxth:
5889     return DAG.getNode(
5890         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5891         Op.getOperand(2), Op.getOperand(3),
5892 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5893 Op.getOperand(1));
5894 case Intrinsic::aarch64_sve_sxtw:
5895     return DAG.getNode(
5896         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5897         Op.getOperand(2), Op.getOperand(3),
5898 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5899 Op.getOperand(1));
5900 case Intrinsic::aarch64_sve_uxtb:
5901     return DAG.getNode(
5902         AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5903         Op.getOperand(2), Op.getOperand(3),
5904 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5905 Op.getOperand(1));
5906 case Intrinsic::aarch64_sve_uxth:
5907     return DAG.getNode(
5908         AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5909         Op.getOperand(2), Op.getOperand(3),
5910 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5911 Op.getOperand(1));
5912 case Intrinsic::aarch64_sve_uxtw:
5913     return DAG.getNode(
5914         AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5915         Op.getOperand(2), Op.getOperand(3),
5916 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5917 Op.getOperand(1));
5918 case Intrinsic::localaddress: {
5919 const auto &MF = DAG.getMachineFunction();
5920 const auto *RegInfo = Subtarget->getRegisterInfo();
5921 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
5922 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
5923 Op.getSimpleValueType());
5924 }
5925
5926 case Intrinsic::eh_recoverfp: {
5927 // FIXME: This needs to be implemented to correctly handle highly aligned
5928 // stack objects. For now we simply return the incoming FP. Refer D53541
5929 // for more details.
5930 SDValue FnOp = Op.getOperand(1);
5931 SDValue IncomingFPOp = Op.getOperand(2);
5932 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
5933 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
5934     if (!Fn)
5935       report_fatal_error(
5936           "llvm.eh.recoverfp must take a function as the first argument");
5937 return IncomingFPOp;
5938 }
5939
5940 case Intrinsic::aarch64_neon_vsri:
5941 case Intrinsic::aarch64_neon_vsli:
5942 case Intrinsic::aarch64_sve_sri:
5943 case Intrinsic::aarch64_sve_sli: {
5944 EVT Ty = Op.getValueType();
5945
5946 if (!Ty.isVector())
5947 report_fatal_error("Unexpected type for aarch64_neon_vsli");
5948
5949 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
5950
5951 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
5952 IntNo == Intrinsic::aarch64_sve_sri;
5953 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
5954 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
5955 Op.getOperand(3));
5956 }
5957
5958 case Intrinsic::aarch64_neon_srhadd:
5959 case Intrinsic::aarch64_neon_urhadd:
5960 case Intrinsic::aarch64_neon_shadd:
5961 case Intrinsic::aarch64_neon_uhadd: {
5962 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5963 IntNo == Intrinsic::aarch64_neon_shadd);
5964 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5965 IntNo == Intrinsic::aarch64_neon_urhadd);
5966 unsigned Opcode = IsSignedAdd
5967 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
5968 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
5969 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5970 Op.getOperand(2));
5971 }
5972 case Intrinsic::aarch64_neon_saddlp:
5973 case Intrinsic::aarch64_neon_uaddlp: {
5974 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
5975 ? AArch64ISD::UADDLP
5976 : AArch64ISD::SADDLP;
5977 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
5978 }
5979 case Intrinsic::aarch64_neon_sdot:
5980 case Intrinsic::aarch64_neon_udot:
5981 case Intrinsic::aarch64_sve_sdot:
5982 case Intrinsic::aarch64_sve_udot: {
5983 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
5984 IntNo == Intrinsic::aarch64_sve_udot)
5985 ? AArch64ISD::UDOT
5986 : AArch64ISD::SDOT;
5987 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5988 Op.getOperand(2), Op.getOperand(3));
5989 }
5990 case Intrinsic::get_active_lane_mask: {
5991 SDValue ID =
5992 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
5993
5994 EVT VT = Op.getValueType();
5995 if (VT.isScalableVector())
5996 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
5997 Op.getOperand(2));
5998
5999 // We can use the SVE whilelo instruction to lower this intrinsic by
6000 // creating the appropriate sequence of scalable vector operations and
6001 // then extracting a fixed-width subvector from the scalable vector.
6002
6003 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
6004 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
6005
6006 SDValue Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WhileVT, ID,
6007 Op.getOperand(1), Op.getOperand(2));
6008 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, dl, ContainerVT, Mask);
6009 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
6010 DAG.getVectorIdxConstant(0, dl));
6011 }
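// Illustrative sketch (not from the original source): for an IR-level
//   %m = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %base, i64 %n)
// whose result has already been promoted to a fixed v4i32, the code above
// builds roughly
//   whilelo p0.s, x0, x1      ; scalable i1 mask
//   mov     z0.s, p0/z, #-1   ; sign-extend the predicate into nxv4i32
// and returns the low v4i32 subvector of that result.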
6012 case Intrinsic::aarch64_neon_uaddlv: {
6013 EVT OpVT = Op.getOperand(1).getValueType();
6014 EVT ResVT = Op.getValueType();
6015 if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6016 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
6017 // To avoid an insert_subvector, use v4i32 rather than v2i32.
6018 SDValue UADDLV =
6019 DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
6020 SDValue EXTRACT_VEC_ELT =
6021 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
6022 DAG.getConstant(0, dl, MVT::i64));
6023 return EXTRACT_VEC_ELT;
6024 }
6025 return SDValue();
6026 }
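// For example (illustrative), with the widening above an i32
// @llvm.aarch64.neon.uaddlv on a v8i16 input can be selected as a single
//   uaddlv s0, v0.8h
// whose sum already lives in lane 0 of the 128-bit register, making the
// extract_vector_elt free.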
6027 case Intrinsic::experimental_cttz_elts: {
6028 SDValue CttzOp = Op.getOperand(1);
6029 EVT VT = CttzOp.getValueType();
6030 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6031
6032 if (VT.isFixedLengthVector()) {
6033 // We can use SVE instructions to lower this intrinsic by first creating
6034 // an SVE predicate register mask from the fixed-width vector.
6035 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
6036 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, NewVT, CttzOp);
6037 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6038 }
6039
6040 SDValue NewCttzElts =
6041 DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, CttzOp);
6042 return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
6043 }
6044 }
6045}
6046
6047bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6048 if (VT.getVectorElementType() == MVT::i8 ||
6049 VT.getVectorElementType() == MVT::i16) {
6050 EltTy = MVT::i32;
6051 return true;
6052 }
6053 return false;
6054}
6055
6056bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6057 EVT DataVT) const {
6058 const EVT IndexVT = Extend.getOperand(0).getValueType();
6059 // SVE only supports implicit extension of 32-bit indices.
6060 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6061 return false;
6062
6063 // Indices cannot be smaller than the main data type.
6064 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6065 return false;
6066
6067 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6068 // element container type, which would violate the previous clause.
6069 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6070}
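// For example (illustrative), shouldRemoveExtendFromGSIndex lets a gather
// whose indices were widened by (zext nxv2i32 to nxv2i64) drop the explicit
// extend, since the gather can apply it in its addressing mode, e.g.
//   ld1d { z0.d }, p0/z, [x0, z1.d, uxtw]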
6071
6072bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6073 EVT ExtVT = ExtVal.getValueType();
6074 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6075 return false;
6076
6077 // It may be worth creating extending masked loads if there are multiple
6078 // masked loads using the same predicate. That way we'll end up creating
6079 // extending masked loads that may then get split by the legaliser. This
6080 // results in just one set of predicate unpacks at the start, instead of
6081 // multiple sets of vector unpacks after each load.
6082 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6083 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6084 // Disable extending masked loads for fixed-width for now, since the code
6085 // quality doesn't look great.
6086 if (!ExtVT.isScalableVector())
6087 return false;
6088
6089 unsigned NumExtMaskedLoads = 0;
6090 for (auto *U : Ld->getMask()->uses())
6091 if (isa<MaskedLoadSDNode>(U))
6092 NumExtMaskedLoads++;
6093
6094 if (NumExtMaskedLoads <= 1)
6095 return false;
6096 }
6097 }
6098
6099 return true;
6100}
6101
6102unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6103 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6104 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
6106 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
6108 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
6110 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
6112 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
6114 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
6116 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
6118 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
6120 };
6121 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
6122 return AddrModes.find(Key)->second;
6123}
6124
6125unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6126 switch (Opcode) {
6127 default:
6128 llvm_unreachable("unimplemented opcode");
6129 return Opcode;
6144 }
6145}
6146
6147SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6148 SelectionDAG &DAG) const {
6149 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
6150
6151 SDLoc DL(Op);
6152 SDValue Chain = MGT->getChain();
6153 SDValue PassThru = MGT->getPassThru();
6154 SDValue Mask = MGT->getMask();
6155 SDValue BasePtr = MGT->getBasePtr();
6156 SDValue Index = MGT->getIndex();
6157 SDValue Scale = MGT->getScale();
6158 EVT VT = Op.getValueType();
6159 EVT MemVT = MGT->getMemoryVT();
6160 ISD::LoadExtType ExtType = MGT->getExtensionType();
6161 ISD::MemIndexType IndexType = MGT->getIndexType();
6162
6163 // SVE supports zero (and so undef) passthrough values only; everything else
6164 // must be handled manually by an explicit select on the load's output.
6165 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
6166 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6167 SDValue Load =
6168 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6169 MGT->getMemOperand(), IndexType, ExtType);
6170 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6171 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
6172 }
6173
6174 bool IsScaled = MGT->isIndexScaled();
6175 bool IsSigned = MGT->isIndexSigned();
6176
6177 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6178 // must be calculated beforehand.
6179 uint64_t ScaleVal = Scale->getAsZExtVal();
6180 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6181 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6182 EVT IndexVT = Index.getValueType();
6183 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6184 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6185 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6186
6187 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6188 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6189 MGT->getMemOperand(), IndexType, ExtType);
6190 }
6191
6192 // Lower fixed length gather to a scalable equivalent.
6193 if (VT.isFixedLengthVector()) {
6194 assert(Subtarget->useSVEForFixedLengthVectors() &&
6195 "Cannot lower when not using SVE for fixed vectors!");
6196
6197 // NOTE: Handle floating-point as if integer then bitcast the result.
6198 EVT DataVT = VT.changeVectorElementTypeToInteger();
6199 MemVT = MemVT.changeVectorElementTypeToInteger();
6200
6201 // Find the smallest integer fixed length vector we can use for the gather.
6202 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6203 if (DataVT.getVectorElementType() == MVT::i64 ||
6204 Index.getValueType().getVectorElementType() == MVT::i64 ||
6205 Mask.getValueType().getVectorElementType() == MVT::i64)
6206 PromotedVT = VT.changeVectorElementType(MVT::i64);
6207
6208 // Promote vector operands except for passthrough, which we know is either
6209 // undef or zero, and thus best constructed directly.
6210 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6211 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6212 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6213
6214 // A promoted result type forces the need for an extending load.
6215 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6216 ExtType = ISD::EXTLOAD;
6217
6218 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6219
6220 // Convert fixed length vector operands to scalable.
6221 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6222 Index = convertToScalableVector(DAG, ContainerVT, Index);
6224 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
6225 : DAG.getConstant(0, DL, ContainerVT);
6226
6227 // Emit equivalent scalable vector gather.
6228 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6229 SDValue Load =
6230 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
6231 Ops, MGT->getMemOperand(), IndexType, ExtType);
6232
6233 // Extract fixed length data then convert to the required result type.
6234 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
6235 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
6236 if (VT.isFloatingPoint())
6237 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
6238
6239 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6240 }
6241
6242 // Everything else is legal.
6243 return Op;
6244}
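// Worked example of the fixed-length path above (illustrative): a v4i16
// gather with v4i32 indices is promoted to v4i32, converted to an nxv4i32
// masked gather performing an extending i16 load, and the result is pulled
// back out as v4i32 and truncated to v4i16.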
6245
6246SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6247 SelectionDAG &DAG) const {
6248 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
6249
6250 SDLoc DL(Op);
6251 SDValue Chain = MSC->getChain();
6252 SDValue StoreVal = MSC->getValue();
6253 SDValue Mask = MSC->getMask();
6254 SDValue BasePtr = MSC->getBasePtr();
6255 SDValue Index = MSC->getIndex();
6256 SDValue Scale = MSC->getScale();
6257 EVT VT = StoreVal.getValueType();
6258 EVT MemVT = MSC->getMemoryVT();
6259 ISD::MemIndexType IndexType = MSC->getIndexType();
6260 bool Truncating = MSC->isTruncatingStore();
6261
6262 bool IsScaled = MSC->isIndexScaled();
6263 bool IsSigned = MSC->isIndexSigned();
6264
6265 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6266 // must be calculated beforehand.
6267 uint64_t ScaleVal = Scale->getAsZExtVal();
6268 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6269 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6270 EVT IndexVT = Index.getValueType();
6271 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6272 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6273 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6274
6275 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6276 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6277 MSC->getMemOperand(), IndexType, Truncating);
6278 }
6279
6280 // Lower fixed length scatter to a scalable equivalent.
6281 if (VT.isFixedLengthVector()) {
6282 assert(Subtarget->useSVEForFixedLengthVectors() &&
6283 "Cannot lower when not using SVE for fixed vectors!");
6284
6285 // Once bitcast we treat floating-point scatters as if integer.
6286 if (VT.isFloatingPoint()) {
6288 MemVT = MemVT.changeVectorElementTypeToInteger();
6289 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6290 }
6291
6292 // Find the smallest integer fixed length vector we can use for the scatter.
6293 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6294 if (VT.getVectorElementType() == MVT::i64 ||
6295 Index.getValueType().getVectorElementType() == MVT::i64 ||
6296 Mask.getValueType().getVectorElementType() == MVT::i64)
6297 PromotedVT = VT.changeVectorElementType(MVT::i64);
6298
6299 // Promote vector operands.
6300 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6301 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6302 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6303 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6304
6305 // A promoted value type forces the need for a truncating store.
6306 if (PromotedVT != VT)
6307 Truncating = true;
6308
6309 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6310
6311 // Convert fixed length vector operands to scalable.
6312 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6313 Index = convertToScalableVector(DAG, ContainerVT, Index);
6315 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6316
6317 // Emit equivalent scalable vector scatter.
6318 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6319 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6320 MSC->getMemOperand(), IndexType, Truncating);
6321 }
6322
6323 // Everything else is legal.
6324 return Op;
6325}
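// Worked example of the fixed-length path above (illustrative): a v4i16
// scatter is any-extended to v4i32, converted to an nxv4i32 masked scatter,
// and marked truncating so only the low 16 bits of each element reach memory.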
6326
6327SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6328 SDLoc DL(Op);
6329 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6330 assert(LoadNode && "Expected custom lowering of a masked load node");
6331 EVT VT = Op->getValueType(0);
6332
6333 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6334 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6335
6336 SDValue PassThru = LoadNode->getPassThru();
6337 SDValue Mask = LoadNode->getMask();
6338
6339 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
6340 return Op;
6341
6342 SDValue Load = DAG.getMaskedLoad(
6343 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
6344 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
6345 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
6346 LoadNode->getExtensionType());
6347
6348 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6349
6350 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6351}
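// In effect (illustrative): a masked load whose passthru %p is neither undef
// nor zero is rewritten as
//   %l = masked.load(..., %mask, undef)
//   %r = select %mask, %l, %p
// because the SVE LD1 forms zero (rather than merge) the inactive lanes.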
6352
6353// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6354 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
6355 EVT VT, EVT MemVT,
6356 SelectionDAG &DAG) {
6357 assert(VT.isVector() && "VT should be a vector type");
6358 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6359
6360 SDValue Value = ST->getValue();
6361
6362 // First extend the promoted v4i16 to v8i16, truncate it to v8i8, and extract
6363 // the word lane that represents the v4i8 subvector. This optimizes the store
6364 // to:
6365 //
6366 // xtn v0.8b, v0.8h
6367 // str s0, [x0]
6368
6369 SDValue Undef = DAG.getUNDEF(MVT::i16);
6370 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6371 {Undef, Undef, Undef, Undef});
6372
6373 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6374 Value, UndefVec);
6375 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
6376
6377 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6378 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6379 Trunc, DAG.getConstant(0, DL, MVT::i64));
6380
6381 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
6382 ST->getBasePtr(), ST->getMemOperand());
6383}
6384
6385 // Custom lowering for stores, vector or scalar, normal or truncating.
6386 // Currently we only custom lower truncating stores from vector v4i16 to v4i8
6387 // and volatile stores of i128, plus the special cases handled below.
6388SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6389 SelectionDAG &DAG) const {
6390 SDLoc Dl(Op);
6391 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
6392 assert(StoreNode && "Can only custom lower store nodes");
6393
6394 SDValue Value = StoreNode->getValue();
6395
6396 EVT VT = Value.getValueType();
6397 EVT MemVT = StoreNode->getMemoryVT();
6398
6399 if (VT.isVector()) {
6400 if (useSVEForFixedLengthVectorVT(
6401 VT,
6402 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6403 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6404
6405 unsigned AS = StoreNode->getAddressSpace();
6406 Align Alignment = StoreNode->getAlign();
6407 if (Alignment < MemVT.getStoreSize() &&
6408 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
6409 StoreNode->getMemOperand()->getFlags(),
6410 nullptr)) {
6411 return scalarizeVectorStore(StoreNode, DAG);
6412 }
6413
6414 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6415 MemVT == MVT::v4i8) {
6416 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
6417 }
6418 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
6419 // the custom lowering, as there are no un-paired non-temporal stores and
6420 // legalization will break up 256 bit inputs.
6421 ElementCount EC = MemVT.getVectorElementCount();
6422 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6423 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6424 (MemVT.getScalarSizeInBits() == 8u ||
6425 MemVT.getScalarSizeInBits() == 16u ||
6426 MemVT.getScalarSizeInBits() == 32u ||
6427 MemVT.getScalarSizeInBits() == 64u)) {
6428 SDValue Lo =
6429 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6430 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6431 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
6432 SDValue Hi =
6433 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6434 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6435 StoreNode->getValue(),
6436 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
6437 SDValue Result = DAG.getMemIntrinsicNode(
6438 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
6439 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6440 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6441 return Result;
6442 }
6443 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6444 return LowerStore128(Op, DAG);
6445 } else if (MemVT == MVT::i64x8) {
6446 SDValue Value = StoreNode->getValue();
6447 assert(Value->getValueType(0) == MVT::i64x8);
6448 SDValue Chain = StoreNode->getChain();
6449 SDValue Base = StoreNode->getBasePtr();
6450 EVT PtrVT = Base.getValueType();
6451 for (unsigned i = 0; i < 8; i++) {
6452 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6453 Value, DAG.getConstant(i, Dl, MVT::i32));
6454 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
6455 DAG.getConstant(i * 8, Dl, PtrVT));
6456 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
6457 StoreNode->getOriginalAlign());
6458 }
6459 return Chain;
6460 }
6461
6462 return SDValue();
6463}
6464
6465/// Lower atomic or volatile 128-bit stores to a single STP instruction.
6466SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6467 SelectionDAG &DAG) const {
6468 MemSDNode *StoreNode = cast<MemSDNode>(Op);
6469 assert(StoreNode->getMemoryVT() == MVT::i128);
6470 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6471
6472 bool IsStoreRelease =
6473 StoreNode->getMergedOrdering() == AtomicOrdering::Release;
6474 if (StoreNode->isAtomic())
6475 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6476 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6477 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
6478 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
6479
6480 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6481 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6482 ? StoreNode->getOperand(1)
6483 : StoreNode->getOperand(2);
6484 SDLoc DL(Op);
6485 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
6486 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
6487 if (DAG.getDataLayout().isBigEndian())
6488 std::swap(StoreValue.first, StoreValue.second);
6489 SDValue Result = DAG.getMemIntrinsicNode(
6490 Opcode, DL, DAG.getVTList(MVT::Other),
6491 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6492 StoreNode->getBasePtr()},
6493 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6494 return Result;
6495}
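// For example (illustrative), a volatile or LSE2-atomic i128 store of the
// halves (lo, hi) becomes a single
//   stp xLo, xHi, [xBase]
// and a release-ordered atomic store with FEAT_LSE2 + FEAT_RCPC3 becomes
//   stilp xLo, xHi, [xBase]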
6496
6497SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6498 SelectionDAG &DAG) const {
6499 SDLoc DL(Op);
6500 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
6501 assert(LoadNode && "Expected custom lowering of a load node");
6502
6503 if (LoadNode->getMemoryVT() == MVT::i64x8) {
6504 SmallVector<SDValue, 8> Ops;
6505 SDValue Base = LoadNode->getBasePtr();
6506 SDValue Chain = LoadNode->getChain();
6507 EVT PtrVT = Base.getValueType();
6508 for (unsigned i = 0; i < 8; i++) {
6509 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
6510 DAG.getConstant(i * 8, DL, PtrVT));
6511 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
6512 LoadNode->getPointerInfo(),
6513 LoadNode->getOriginalAlign());
6514 Ops.push_back(Part);
6515 Chain = SDValue(Part.getNode(), 1);
6516 }
6517 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
6518 return DAG.getMergeValues({Loaded, Chain}, DL);
6519 }
6520
6521 // Custom lowering for extending v4i8 vector loads.
6522 EVT VT = Op->getValueType(0);
6523 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
6524
6525 if (LoadNode->getMemoryVT() != MVT::v4i8)
6526 return SDValue();
6527
6528 // Avoid generating unaligned loads.
6529 if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
6530 return SDValue();
6531
6532 unsigned ExtType;
6533 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
6534 ExtType = ISD::SIGN_EXTEND;
6535 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
6536 LoadNode->getExtensionType() == ISD::EXTLOAD)
6537 ExtType = ISD::ZERO_EXTEND;
6538 else
6539 return SDValue();
6540
6541 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
6542 LoadNode->getBasePtr(), MachinePointerInfo());
6543 SDValue Chain = Load.getValue(1);
6544 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
6545 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
6546 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
6547 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
6548 DAG.getConstant(0, DL, MVT::i64));
6549 if (VT == MVT::v4i32)
6550 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
6551 return DAG.getMergeValues({Ext, Chain}, DL);
6552}
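// For example (illustrative), a zero-extending v4i8 -> v4i16 load handled
// above can be selected as roughly
//   ldr   s0, [x0]
//   ushll v0.8h, v0.8b, #0
// with only the low four v4i16 lanes of the result actually used.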
6553
6554// Generate SUBS and CSEL for integer abs.
6555SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
6556 MVT VT = Op.getSimpleValueType();
6557
6558 if (VT.isVector())
6559 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
6560
6561 SDLoc DL(Op);
6562 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
6563 Op.getOperand(0));
6564 // Generate SUBS & CSEL.
6565 SDValue Cmp =
6566 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
6567 Op.getOperand(0), DAG.getConstant(0, DL, VT));
6568 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
6569 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
6570 Cmp.getValue(1));
6571}
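// For a scalar i64 the SUBS + CSEL pair above typically ends up as something
// like (illustrative)
//   cmp  x0, #0
//   cneg x0, x0, mi
// i.e. the flags pick between x and 0 - x.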
6572
6573 static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
6574 SDValue Chain = Op.getOperand(0);
6575 SDValue Cond = Op.getOperand(1);
6576 SDValue Dest = Op.getOperand(2);
6577
6578 AArch64CC::CondCode CC;
6579 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
6580 SDLoc dl(Op);
6581 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
6582 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6583 Cmp);
6584 }
6585
6586 return SDValue();
6587}
6588
6589 // Treat FSHR with constant shifts as a legal operation, otherwise it is
6590 // expanded. FSHL is converted to FSHR before deciding what to do with it.
6591 static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
6592 SDValue Shifts = Op.getOperand(2);
6593 // Check if the shift amount is a constant
6594 // If opcode is FSHL, convert it to FSHR
6595 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
6596 SDLoc DL(Op);
6597 MVT VT = Op.getSimpleValueType();
6598
6599 if (Op.getOpcode() == ISD::FSHL) {
6600 unsigned int NewShiftNo =
6601 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
6602 return DAG.getNode(
6603 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
6604 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
6605 } else if (Op.getOpcode() == ISD::FSHR) {
6606 return Op;
6607 }
6608 }
6609
6610 return SDValue();
6611}
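// For example (illustrative), on i64 values fshl(a, b, 3) is rewritten here
// as fshr(a, b, 61), which can then be matched to a single
//   extr x0, xA, xB, #61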
6612
6613 static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
6614 SDValue X = Op.getOperand(0);
6615 EVT XScalarTy = X.getValueType();
6616 SDValue Exp = Op.getOperand(1);
6617
6618 SDLoc DL(Op);
6619 EVT XVT, ExpVT;
6620 switch (Op.getSimpleValueType().SimpleTy) {
6621 default:
6622 return SDValue();
6623 case MVT::bf16:
6624 case MVT::f16:
6625 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
6626 [[fallthrough]];
6627 case MVT::f32:
6628 XVT = MVT::nxv4f32;
6629 ExpVT = MVT::nxv4i32;
6630 break;
6631 case MVT::f64:
6632 XVT = MVT::nxv2f64;
6633 ExpVT = MVT::nxv2i64;
6634 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
6635 break;
6636 }
6637
6638 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
6639 SDValue VX =
6640 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
6641 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
6642 DAG.getUNDEF(ExpVT), Exp, Zero);
6643 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
6644 AArch64SVEPredPattern::all);
6645 SDValue FScale =
6647 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
6648 VPg, VX, VExp);
6649 SDValue Final =
6650 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
6651 if (X.getValueType() != XScalarTy)
6652 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
6653 DAG.getIntPtrConstant(1, SDLoc(Op)));
6654 return Final;
6655}
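// Rough shape of the result (illustrative): an f64 ldexp(x, n) is performed
// on lane 0 of scalable vectors, e.g.
//   ptrue  p0.d
//   fscale z0.d, p0/m, z0.d, z1.d
// with the scalar result read back out of lane 0.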
6656
6657 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
6658 SelectionDAG &DAG) const {
6659 LLVM_DEBUG(dbgs() << "Custom lowering: ");
6660 LLVM_DEBUG(Op.dump());
6661
6662 switch (Op.getOpcode()) {
6663 default:
6664 llvm_unreachable("unimplemented operand");
6665 return SDValue();
6666 case ISD::BITCAST:
6667 return LowerBITCAST(Op, DAG);
6668 case ISD::GlobalAddress:
6669 return LowerGlobalAddress(Op, DAG);
6671 return LowerGlobalTLSAddress(Op, DAG);
6673 return LowerPtrAuthGlobalAddress(Op, DAG);
6674 case ISD::SETCC:
6675 case ISD::STRICT_FSETCC:
6677 return LowerSETCC(Op, DAG);
6678 case ISD::SETCCCARRY:
6679 return LowerSETCCCARRY(Op, DAG);
6680 case ISD::BRCOND:
6681 return LowerBRCOND(Op, DAG);
6682 case ISD::BR_CC:
6683 return LowerBR_CC(Op, DAG);
6684 case ISD::SELECT:
6685 return LowerSELECT(Op, DAG);
6686 case ISD::SELECT_CC:
6687 return LowerSELECT_CC(Op, DAG);
6688 case ISD::JumpTable:
6689 return LowerJumpTable(Op, DAG);
6690 case ISD::BR_JT:
6691 return LowerBR_JT(Op, DAG);
6692 case ISD::ConstantPool:
6693 return LowerConstantPool(Op, DAG);
6694 case ISD::BlockAddress:
6695 return LowerBlockAddress(Op, DAG);
6696 case ISD::VASTART:
6697 return LowerVASTART(Op, DAG);
6698 case ISD::VACOPY:
6699 return LowerVACOPY(Op, DAG);
6700 case ISD::VAARG:
6701 return LowerVAARG(Op, DAG);
6702 case ISD::UADDO_CARRY:
6703 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
6704 case ISD::USUBO_CARRY:
6705 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
6706 case ISD::SADDO_CARRY:
6707 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
6708 case ISD::SSUBO_CARRY:
6709 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
6710 case ISD::SADDO:
6711 case ISD::UADDO:
6712 case ISD::SSUBO:
6713 case ISD::USUBO:
6714 case ISD::SMULO:
6715 case ISD::UMULO:
6716 return LowerXALUO(Op, DAG);
6717 case ISD::FADD:
6718 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
6719 case ISD::FSUB:
6720 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
6721 case ISD::FMUL:
6722 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
6723 case ISD::FMA:
6724 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
6725 case ISD::FDIV:
6726 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
6727 case ISD::FNEG:
6728 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
6729 case ISD::FCEIL:
6730 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
6731 case ISD::FFLOOR:
6732 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
6733 case ISD::FNEARBYINT:
6734 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
6735 case ISD::FRINT:
6736 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
6737 case ISD::FROUND:
6738 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
6739 case ISD::FROUNDEVEN:
6740 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
6741 case ISD::FTRUNC:
6742 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
6743 case ISD::FSQRT:
6744 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
6745 case ISD::FABS:
6746 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
6747 case ISD::FP_ROUND:
6749 return LowerFP_ROUND(Op, DAG);
6750 case ISD::FP_EXTEND:
6751 return LowerFP_EXTEND(Op, DAG);
6752 case ISD::FRAMEADDR:
6753 return LowerFRAMEADDR(Op, DAG);
6754 case ISD::SPONENTRY:
6755 return LowerSPONENTRY(Op, DAG);
6756 case ISD::RETURNADDR:
6757 return LowerRETURNADDR(Op, DAG);
6759 return LowerADDROFRETURNADDR(Op, DAG);
6761 return LowerCONCAT_VECTORS(Op, DAG);
6763 return LowerINSERT_VECTOR_ELT(Op, DAG);
6765 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6766 case ISD::BUILD_VECTOR:
6767 return LowerBUILD_VECTOR(Op, DAG);
6769 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
6771 return LowerVECTOR_SHUFFLE(Op, DAG);
6772 case ISD::SPLAT_VECTOR:
6773 return LowerSPLAT_VECTOR(Op, DAG);
6775 return LowerEXTRACT_SUBVECTOR(Op, DAG);
6777 return LowerINSERT_SUBVECTOR(Op, DAG);
6778 case ISD::SDIV:
6779 case ISD::UDIV:
6780 return LowerDIV(Op, DAG);
6781 case ISD::SMIN:
6782 case ISD::UMIN:
6783 case ISD::SMAX:
6784 case ISD::UMAX:
6785 return LowerMinMax(Op, DAG);
6786 case ISD::SRA:
6787 case ISD::SRL:
6788 case ISD::SHL:
6789 return LowerVectorSRA_SRL_SHL(Op, DAG);
6790 case ISD::SHL_PARTS:
6791 case ISD::SRL_PARTS:
6792 case ISD::SRA_PARTS:
6793 return LowerShiftParts(Op, DAG);
6794 case ISD::CTPOP:
6795 case ISD::PARITY:
6796 return LowerCTPOP_PARITY(Op, DAG);
6797 case ISD::FCOPYSIGN:
6798 return LowerFCOPYSIGN(Op, DAG);
6799 case ISD::OR:
6800 return LowerVectorOR(Op, DAG);
6801 case ISD::XOR:
6802 return LowerXOR(Op, DAG);
6803 case ISD::PREFETCH:
6804 return LowerPREFETCH(Op, DAG);
6805 case ISD::SINT_TO_FP:
6806 case ISD::UINT_TO_FP:
6809 return LowerINT_TO_FP(Op, DAG);
6810 case ISD::FP_TO_SINT:
6811 case ISD::FP_TO_UINT:
6814 return LowerFP_TO_INT(Op, DAG);
6817 return LowerFP_TO_INT_SAT(Op, DAG);
6818 case ISD::FSINCOS:
6819 return LowerFSINCOS(Op, DAG);
6820 case ISD::GET_ROUNDING:
6821 return LowerGET_ROUNDING(Op, DAG);
6822 case ISD::SET_ROUNDING:
6823 return LowerSET_ROUNDING(Op, DAG);
6824 case ISD::GET_FPMODE:
6825 return LowerGET_FPMODE(Op, DAG);
6826 case ISD::SET_FPMODE:
6827 return LowerSET_FPMODE(Op, DAG);
6828 case ISD::RESET_FPMODE:
6829 return LowerRESET_FPMODE(Op, DAG);
6830 case ISD::MUL:
6831 return LowerMUL(Op, DAG);
6832 case ISD::MULHS:
6833 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
6834 case ISD::MULHU:
6835 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
6837 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6839 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6841 return LowerINTRINSIC_VOID(Op, DAG);
6842 case ISD::ATOMIC_STORE:
6843 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
6844 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
6845 return LowerStore128(Op, DAG);
6846 }
6847 return SDValue();
6848 case ISD::STORE:
6849 return LowerSTORE(Op, DAG);
6850 case ISD::MSTORE:
6851 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
6852 case ISD::MGATHER:
6853 return LowerMGATHER(Op, DAG);
6854 case ISD::MSCATTER:
6855 return LowerMSCATTER(Op, DAG);
6857 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
6858 case ISD::VECREDUCE_ADD:
6859 case ISD::VECREDUCE_AND:
6860 case ISD::VECREDUCE_OR:
6861 case ISD::VECREDUCE_XOR:
6871 return LowerVECREDUCE(Op, DAG);
6873 return LowerATOMIC_LOAD_AND(Op, DAG);
6875 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6876 case ISD::VSCALE:
6877 return LowerVSCALE(Op, DAG);
6878 case ISD::ANY_EXTEND:
6879 case ISD::SIGN_EXTEND:
6880 case ISD::ZERO_EXTEND:
6881 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
6883 // Only custom lower when ExtraVT has a legal byte based element type.
6884 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
6885 EVT ExtraEltVT = ExtraVT.getVectorElementType();
6886 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
6887 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
6888 return SDValue();
6889
6890 return LowerToPredicatedOp(Op, DAG,
6892 }
6893 case ISD::TRUNCATE:
6894 return LowerTRUNCATE(Op, DAG);
6895 case ISD::MLOAD:
6896 return LowerMLOAD(Op, DAG);
6897 case ISD::LOAD:
6898 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
6899 !Subtarget->isNeonAvailable()))
6900 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
6901 return LowerLOAD(Op, DAG);
6902 case ISD::ADD:
6903 case ISD::AND:
6904 case ISD::SUB:
6905 return LowerToScalableOp(Op, DAG);
6906 case ISD::FMAXIMUM:
6907 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
6908 case ISD::FMAXNUM:
6909 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
6910 case ISD::FMINIMUM:
6911 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
6912 case ISD::FMINNUM:
6913 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
6914 case ISD::VSELECT:
6915 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
6916 case ISD::ABS:
6917 return LowerABS(Op, DAG);
6918 case ISD::ABDS:
6919 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
6920 case ISD::ABDU:
6921 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
6922 case ISD::AVGFLOORS:
6923 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
6924 case ISD::AVGFLOORU:
6925 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
6926 case ISD::AVGCEILS:
6927 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
6928 case ISD::AVGCEILU:
6929 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
6930 case ISD::BITREVERSE:
6931 return LowerBitreverse(Op, DAG);
6932 case ISD::BSWAP:
6933 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
6934 case ISD::CTLZ:
6935 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
6936 case ISD::CTTZ:
6937 return LowerCTTZ(Op, DAG);
6938 case ISD::VECTOR_SPLICE:
6939 return LowerVECTOR_SPLICE(Op, DAG);
6941 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
6943 return LowerVECTOR_INTERLEAVE(Op, DAG);
6944 case ISD::LRINT:
6945 case ISD::LLRINT:
6946 if (Op.getValueType().isVector())
6947 return LowerVectorXRINT(Op, DAG);
6948 [[fallthrough]];
6949 case ISD::LROUND:
6950 case ISD::LLROUND: {
6951 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
6952 Op.getOperand(0).getValueType() == MVT::bf16) &&
6953 "Expected custom lowering of rounding operations only for f16");
6954 SDLoc DL(Op);
6955 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
6956 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
6957 }
6958 case ISD::STRICT_LROUND:
6960 case ISD::STRICT_LRINT:
6961 case ISD::STRICT_LLRINT: {
6962 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
6963 Op.getOperand(1).getValueType() == MVT::bf16) &&
6964 "Expected custom lowering of rounding operations only for f16");
6965 SDLoc DL(Op);
6966 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
6967 {Op.getOperand(0), Op.getOperand(1)});
6968 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
6969 {Ext.getValue(1), Ext.getValue(0)});
6970 }
6971 case ISD::WRITE_REGISTER: {
6972 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
6973 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
6974 SDLoc DL(Op);
6975
6976 SDValue Chain = Op.getOperand(0);
6977 SDValue SysRegName = Op.getOperand(1);
6978 std::pair<SDValue, SDValue> Pair =
6979 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
6980
6981 // chain = MSRR(chain, sysregname, lo, hi)
6982 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
6983 SysRegName, Pair.first, Pair.second);
6984
6985 return Result;
6986 }
6987 case ISD::FSHL:
6988 case ISD::FSHR:
6989 return LowerFunnelShift(Op, DAG);
6990 case ISD::FLDEXP:
6991 return LowerFLDEXP(Op, DAG);
6993 return LowerVECTOR_HISTOGRAM(Op, DAG);
6994 }
6995}
6996
6998 return !Subtarget->useSVEForFixedLengthVectors();
6999}
7000
7002 EVT VT, bool OverrideNEON) const {
7003 if (!VT.isFixedLengthVector() || !VT.isSimple())
7004 return false;
7005
7006 // Don't use SVE for vectors we cannot scalarize if required.
7007 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7008 // Fixed length predicates should be promoted to i8.
7009 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
7010 case MVT::i1:
7011 default:
7012 return false;
7013 case MVT::i8:
7014 case MVT::i16:
7015 case MVT::i32:
7016 case MVT::i64:
7017 case MVT::f16:
7018 case MVT::f32:
7019 case MVT::f64:
7020 break;
7021 }
7022
7023 // NEON-sized vectors can be emulated using SVE instructions.
7024 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7025 return Subtarget->isSVEorStreamingSVEAvailable();
7026
7027 // Ensure NEON MVTs only belong to a single register class.
7028 if (VT.getFixedSizeInBits() <= 128)
7029 return false;
7030
7031 // Ensure wider than NEON code generation is enabled.
7032 if (!Subtarget->useSVEForFixedLengthVectors())
7033 return false;
7034
7035 // Don't use SVE for types that don't fit.
7036 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7037 return false;
7038
7039 // TODO: Perhaps an artificial restriction, but worth having whilst getting
7040 // the base fixed length SVE support in place.
7041 if (!VT.isPow2VectorType())
7042 return false;
7043
7044 return true;
7045}
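// For example (illustrative), with -aarch64-sve-vector-bits-min=256 a v8f32
// (256 bits) is lowered via SVE, while v4f32 stays on NEON unless
// OverrideNEON is set (e.g. when only streaming SVE is available).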
7046
7047//===----------------------------------------------------------------------===//
7048// Calling Convention Implementation
7049//===----------------------------------------------------------------------===//
7050
7051static unsigned getIntrinsicID(const SDNode *N) {
7052 unsigned Opcode = N->getOpcode();
7053 switch (Opcode) {
7054 default:
7055 return Intrinsic::not_intrinsic;
7056 case ISD::INTRINSIC_WO_CHAIN: {
7057 unsigned IID = N->getConstantOperandVal(0);
7058 if (IID < Intrinsic::num_intrinsics)
7059 return IID;
7060 return Intrinsic::not_intrinsic;
7061 }
7062 }
7063}
7064
7065 bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
7066 SDValue N1) const {
7067 if (!N0.hasOneUse())
7068 return false;
7069
7070 unsigned IID = getIntrinsicID(N1.getNode());
7071 // Avoid reassociating expressions that can be lowered to smlal/umlal.
7072 if (IID == Intrinsic::aarch64_neon_umull ||
7073 N1.getOpcode() == AArch64ISD::UMULL ||
7074 IID == Intrinsic::aarch64_neon_smull ||
7075 N1.getOpcode() == AArch64ISD::SMULL)
7076 return N0.getOpcode() != ISD::ADD;
7077
7078 return true;
7079}
7080
7081/// Selects the correct CCAssignFn for a given CallingConvention value.
7082 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
7083 bool IsVarArg) const {
7084 switch (CC) {
7085 default:
7086 report_fatal_error("Unsupported calling convention.");
7087 case CallingConv::GHC:
7088 return CC_AArch64_GHC;
7091 case CallingConv::C:
7092 case CallingConv::Fast:
7096 case CallingConv::Swift:
7098 case CallingConv::Tail:
7099 case CallingConv::GRAAL:
7100 if (Subtarget->isTargetWindows()) {
7101 if (IsVarArg) {
7102 if (Subtarget->isWindowsArm64EC())
7105 }
7106 return CC_AArch64_Win64PCS;
7107 }
7108 if (!Subtarget->isTargetDarwin())
7109 return CC_AArch64_AAPCS;
7110 if (!IsVarArg)
7111 return CC_AArch64_DarwinPCS;
7114 case CallingConv::Win64:
7115 if (IsVarArg) {
7116 if (Subtarget->isWindowsArm64EC())
7119 }
7120 return CC_AArch64_Win64PCS;
7122 if (Subtarget->isWindowsArm64EC())
7129 return CC_AArch64_AAPCS;
7134 }
7135}
7136
7137CCAssignFn *
7138 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
7139 switch (CC) {
7140 default:
7141 return RetCC_AArch64_AAPCS;
7145 if (Subtarget->isWindowsArm64EC())
7147 return RetCC_AArch64_AAPCS;
7148 }
7149}
7150
7151static bool isPassedInFPR(EVT VT) {
7152 return VT.isFixedLengthVector() ||
7153 (VT.isFloatingPoint() && !VT.isScalableVector());
7154}
7155
7156SDValue AArch64TargetLowering::LowerFormalArguments(
7157 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
7158 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
7159 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
7161 const Function &F = MF.getFunction();
7162 MachineFrameInfo &MFI = MF.getFrameInfo();
7163 bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
7164 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
7165 (isVarArg && Subtarget->isWindowsArm64EC());
7167
7169 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
7171 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
7172 FuncInfo->setIsSVECC(true);
7173
7174 // Assign locations to all of the incoming arguments.
7176 DenseMap<unsigned, SDValue> CopiedRegs;
7177 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
7178
7179 // At this point, Ins[].VT may already be promoted to i32. To correctly
7180 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
7181 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
7182 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
7183 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
7184 // LocVT.
7185 unsigned NumArgs = Ins.size();
7186 Function::const_arg_iterator CurOrigArg = F.arg_begin();
7187 unsigned CurArgIdx = 0;
7188 for (unsigned i = 0; i != NumArgs; ++i) {
7189 MVT ValVT = Ins[i].VT;
7190 if (Ins[i].isOrigArg()) {
7191 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
7192 CurArgIdx = Ins[i].getOrigArgIndex();
7193
7194 // Get type of the original argument.
7195 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
7196 /*AllowUnknown*/ true);
7197 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
7198 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7199 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7200 ValVT = MVT::i8;
7201 else if (ActualMVT == MVT::i16)
7202 ValVT = MVT::i16;
7203 }
7204 bool UseVarArgCC = false;
7205 if (IsWin64)
7206 UseVarArgCC = isVarArg;
7207 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
7208 bool Res =
7209 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
7210 assert(!Res && "Call operand has unhandled type");
7211 (void)Res;
7212 }
7213
7215 bool IsLocallyStreaming =
7216 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
7217 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
7218 SDValue Glue = Chain.getValue(1);
7219
7220 SmallVector<SDValue, 16> ArgValues;
7221 unsigned ExtraArgLocs = 0;
7222 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
7223 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7224
7225 if (Ins[i].Flags.isByVal()) {
7226 // Byval is used for HFAs in the PCS, but the system should work in a
7227 // non-compliant manner for larger structs.
7228 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7229 int Size = Ins[i].Flags.getByValSize();
7230 unsigned NumRegs = (Size + 7) / 8;
7231
7232 // FIXME: This works on big-endian for composite byvals, which are the common
7233 // case. It should work for fundamental types too.
7234 unsigned FrameIdx =
7235 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
7236 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
7237 InVals.push_back(FrameIdxN);
7238
7239 continue;
7240 }
7241
7242 if (Ins[i].Flags.isSwiftAsync())
7244
7245 SDValue ArgValue;
7246 if (VA.isRegLoc()) {
7247 // Arguments stored in registers.
7248 EVT RegVT = VA.getLocVT();
7249 const TargetRegisterClass *RC;
7250
7251 if (RegVT == MVT::i32)
7252 RC = &AArch64::GPR32RegClass;
7253 else if (RegVT == MVT::i64)
7254 RC = &AArch64::GPR64RegClass;
7255 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
7256 RC = &AArch64::FPR16RegClass;
7257 else if (RegVT == MVT::f32)
7258 RC = &AArch64::FPR32RegClass;
7259 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
7260 RC = &AArch64::FPR64RegClass;
7261 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
7262 RC = &AArch64::FPR128RegClass;
7263 else if (RegVT.isScalableVector() &&
7264 RegVT.getVectorElementType() == MVT::i1) {
7265 FuncInfo->setIsSVECC(true);
7266 RC = &AArch64::PPRRegClass;
7267 } else if (RegVT == MVT::aarch64svcount) {
7268 FuncInfo->setIsSVECC(true);
7269 RC = &AArch64::PPRRegClass;
7270 } else if (RegVT.isScalableVector()) {
7271 FuncInfo->setIsSVECC(true);
7272 RC = &AArch64::ZPRRegClass;
7273 } else
7274 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
7275
7276 // Transform the arguments in physical registers into virtual ones.
7277 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
7278
7279 if (IsLocallyStreaming) {
7280 // LocallyStreamingFunctions must insert the SMSTART in the correct
7281 // position, so we use Glue to ensure no instructions can be scheduled
7282 // between the chain of:
7283 // t0: ch,glue = EntryNode
7284 // t1: res,ch,glue = CopyFromReg
7285 // ...
7286 // tn: res,ch,glue = CopyFromReg t(n-1), ..
7287 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
7288 // ^^^^^^
7289 // This will be the new Chain/Root node.
7290 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
7291 Glue = ArgValue.getValue(2);
7292 if (isPassedInFPR(ArgValue.getValueType())) {
7293 ArgValue =
7295 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
7296 {ArgValue, Glue});
7297 Glue = ArgValue.getValue(1);
7298 }
7299 } else
7300 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
7301
7302 // If this is an 8, 16 or 32-bit value, it is really passed promoted
7303 // to 64 bits. Insert an assert[sz]ext to capture this, then
7304 // truncate to the right size.
7305 switch (VA.getLocInfo()) {
7306 default:
7307 llvm_unreachable("Unknown loc info!");
7308 case CCValAssign::Full:
7309 break;
7311 assert(
7312 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7313 "Indirect arguments should be scalable on most subtargets");
7314 break;
7315 case CCValAssign::BCvt:
7316 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
7317 break;
7318 case CCValAssign::AExt:
7319 case CCValAssign::SExt:
7320 case CCValAssign::ZExt:
7321 break;
7323 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
7324 DAG.getConstant(32, DL, RegVT));
7325 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
7326 break;
7327 }
7328 } else { // VA.isRegLoc()
7329 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7330 unsigned ArgOffset = VA.getLocMemOffset();
7331 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
7332 ? VA.getLocVT().getSizeInBits()
7333 : VA.getValVT().getSizeInBits()) / 8;
7334
7335 uint32_t BEAlign = 0;
7336 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
7337 !Ins[i].Flags.isInConsecutiveRegs())
7338 BEAlign = 8 - ArgSize;
7339
7340 SDValue FIN;
7341 MachinePointerInfo PtrInfo;
7342 if (StackViaX4) {
7343 // In both the ARM64EC varargs convention and the thunk convention,
7344 // arguments on the stack are accessed relative to x4, not sp. In
7345 // the thunk convention, there's an additional offset of 32 bytes
7346 // to account for the shadow store.
7347 unsigned ObjOffset = ArgOffset + BEAlign;
7348 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
7349 ObjOffset += 32;
7350 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7351 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7352 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
7353 DAG.getConstant(ObjOffset, DL, MVT::i64));
7355 } else {
7356 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
7357
7358 // Create load nodes to retrieve arguments from the stack.
7359 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
7360 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
7361 }
7362
7363 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
7364 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
7365 MVT MemVT = VA.getValVT();
7366
7367 switch (VA.getLocInfo()) {
7368 default:
7369 break;
7370 case CCValAssign::Trunc:
7371 case CCValAssign::BCvt:
7372 MemVT = VA.getLocVT();
7373 break;
7376 Subtarget->isWindowsArm64EC()) &&
7377 "Indirect arguments should be scalable on most subtargets");
7378 MemVT = VA.getLocVT();
7379 break;
7380 case CCValAssign::SExt:
7381 ExtType = ISD::SEXTLOAD;
7382 break;
7383 case CCValAssign::ZExt:
7384 ExtType = ISD::ZEXTLOAD;
7385 break;
7386 case CCValAssign::AExt:
7387 ExtType = ISD::EXTLOAD;
7388 break;
7389 }
7390
7391 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
7392 MemVT);
7393 }
7394
7395 if (VA.getLocInfo() == CCValAssign::Indirect) {
7396 assert((VA.getValVT().isScalableVT() ||
7397 Subtarget->isWindowsArm64EC()) &&
7398 "Indirect arguments should be scalable on most subtargets");
7399
7400 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
7401 unsigned NumParts = 1;
7402 if (Ins[i].Flags.isInConsecutiveRegs()) {
7403 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7404 ++NumParts;
7405 }
7406
7407 MVT PartLoad = VA.getValVT();
7408 SDValue Ptr = ArgValue;
7409
7410 // Ensure we generate all loads for each tuple part, whilst updating the
7411 // pointer after each load correctly using vscale.
7412 while (NumParts > 0) {
7413 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
7414 InVals.push_back(ArgValue);
7415 NumParts--;
7416 if (NumParts > 0) {
7417 SDValue BytesIncrement;
7418 if (PartLoad.isScalableVector()) {
7419 BytesIncrement = DAG.getVScale(
7420 DL, Ptr.getValueType(),
7421 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
7422 } else {
7423 BytesIncrement = DAG.getConstant(
7424 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
7425 Ptr.getValueType());
7426 }
7428 Flags.setNoUnsignedWrap(true);
7429 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7430 BytesIncrement, Flags);
7431 ExtraArgLocs++;
7432 i++;
7433 }
7434 }
7435 } else {
7436 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
7437 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
7438 ArgValue, DAG.getValueType(MVT::i32));
7439
7440 // i1 arguments are zero-extended to i8 by the caller. Emit a
7441 // hint to reflect this.
7442 if (Ins[i].isOrigArg()) {
7443 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
7444 if (OrigArg->getType()->isIntegerTy(1)) {
7445 if (!Ins[i].Flags.isZExt()) {
7446 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
7447 ArgValue.getValueType(), ArgValue);
7448 }
7449 }
7450 }
7451
7452 InVals.push_back(ArgValue);
7453 }
7454 }
7455 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
7456
7457 // Insert the SMSTART if this is a locally streaming function and
7458 // make sure it is Glued to the last CopyFromReg value.
7459 if (IsLocallyStreaming) {
7460 SDValue PStateSM;
7461 if (Attrs.hasStreamingCompatibleInterface()) {
7462 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
7465 FuncInfo->setPStateSMReg(Reg);
7466 Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
7467 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7469 } else
7470 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7472
7473 // Ensure that the SMSTART happens after the CopyWithChain such that its
7474 // chain result is used.
7475 for (unsigned I=0; I<InVals.size(); ++I) {
7477 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
7478 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
7479 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
7480 InVals[I].getValueType());
7481 }
7482 }
7483
7484 // varargs
7485 if (isVarArg) {
7486 if (!Subtarget->isTargetDarwin() || IsWin64) {
7487 // The AAPCS variadic function ABI is identical to the non-variadic
7488 // one. As a result there may be more arguments in registers and we should
7489 // save them for future reference.
7490 // Win64 variadic functions also pass arguments in registers, but all float
7491 // arguments are passed in integer registers.
7492 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
7493 }
7494
7495 // This will point to the next argument passed via stack.
7496 unsigned VarArgsOffset = CCInfo.getStackSize();
7497 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
7498 VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
7499 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
7500 FuncInfo->setVarArgsStackIndex(
7501 MFI.CreateFixedObject(4, VarArgsOffset, true));
7502
7503 if (MFI.hasMustTailInVarArgFunc()) {
7504 SmallVector<MVT, 2> RegParmTypes;
7505 RegParmTypes.push_back(MVT::i64);
7506 RegParmTypes.push_back(MVT::f128);
7507 // Compute the set of forwarded registers. The rest are scratch.
7509 FuncInfo->getForwardedMustTailRegParms();
7510 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
7512
7513 // Conservatively forward X8, since it might be used for aggregate return.
7514 if (!CCInfo.isAllocated(AArch64::X8)) {
7515 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
7516 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
7517 }
7518 }
7519 }
7520
7521 // On Windows, InReg pointers must be returned, so record the pointer in a
7522 // virtual register at the start of the function so it can be returned in the
7523 // epilogue.
7524 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
7525 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
7526 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
7527 Ins[I].Flags.isInReg()) &&
7528 Ins[I].Flags.isSRet()) {
7529 assert(!FuncInfo->getSRetReturnReg());
7530
7531 MVT PtrTy = getPointerTy(DAG.getDataLayout());
7532 Register Reg =
7534 FuncInfo->setSRetReturnReg(Reg);
7535
7536 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
7537 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
7538 break;
7539 }
7540 }
7541 }
7542
7543 unsigned StackArgSize = CCInfo.getStackSize();
7544 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7545 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
7546 // This is a non-standard ABI so by fiat I say we're allowed to make full
7547 // use of the stack area to be popped, which must be aligned to 16 bytes in
7548 // any case:
7549 StackArgSize = alignTo(StackArgSize, 16);
7550
7551 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
7552 // a multiple of 16.
7553 FuncInfo->setArgumentStackToRestore(StackArgSize);
7554
7555 // This realignment carries over to the available bytes below. Our own
7556 // callers will guarantee the space is free by giving an aligned value to
7557 // CALLSEQ_START.
7558 }
7559 // Even if we're not expected to free up the space, it's useful to know how
7560 // much is there while considering tail calls (because we can reuse it).
7561 FuncInfo->setBytesInStackArgArea(StackArgSize);
7562
7563 if (Subtarget->hasCustomCallingConv())
7565
7566 // Create a 16-byte TPIDR2 object. The dynamic buffer
7567 // will be expanded and stored in the static object later using a pseudonode.
7568 if (SMEAttrs(MF.getFunction()).hasZAState()) {
7569 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
7570 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
7571 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
7572 DAG.getConstant(1, DL, MVT::i32));
7573
7574 SDValue Buffer;
7575 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
7577 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
7578 } else {
7579 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
7580 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
7581 DAG.getVTList(MVT::i64, MVT::Other),
7582 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
7583 MFI.CreateVariableSizedObject(Align(16), nullptr);
7584 }
7585 Chain = DAG.getNode(
7586 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
7587 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)});
7588 }
7589
7590 if (CallConv == CallingConv::PreserveNone) {
7591 for (const ISD::InputArg &I : Ins) {
7592 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
7593 I.Flags.isSwiftAsync()) {
7596 MF.getFunction(),
7597 "Swift attributes can't be used with preserve_none",
7598 DL.getDebugLoc()));
7599 break;
7600 }
7601 }
7602 }
7603
7604 return Chain;
7605}
7606
7607void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
7608 SelectionDAG &DAG,
7609 const SDLoc &DL,
7610 SDValue &Chain) const {
7611 MachineFunction &MF = DAG.getMachineFunction();
7612 MachineFrameInfo &MFI = MF.getFrameInfo();
7613 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7614 auto PtrVT = getPointerTy(DAG.getDataLayout());
7615 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
7616
7617 SmallVector<SDValue, 8> MemOps;
7618
7619 auto GPRArgRegs = AArch64::getGPRArgRegs();
7620 unsigned NumGPRArgRegs = GPRArgRegs.size();
7621 if (Subtarget->isWindowsArm64EC()) {
7622 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
7623 // functions.
7624 NumGPRArgRegs = 4;
7625 }
7626 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
7627
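// For example, if the fixed arguments of the prototype occupy x0-x2, then
// FirstVariadicGPR is 3 and the save area below holds x3-x7 (5 * 8 = 40 bytes).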
7628 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
7629 int GPRIdx = 0;
7630 if (GPRSaveSize != 0) {
7631 if (IsWin64) {
7632 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
7633 if (GPRSaveSize & 15)
7634 // The extra size here, if triggered, will always be 8.
7635 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
7636 } else
7637 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
7638
7639 SDValue FIN;
7640 if (Subtarget->isWindowsArm64EC()) {
7641 // With the Arm64EC ABI, we reserve the save area as usual, but we
7642 // compute its address relative to x4. For a normal AArch64->AArch64
7643 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
7644 // different address.
7645 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7646 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7647 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
7648 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
7649 } else {
7650 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
7651 }
7652
7653 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
7654 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
7655 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7656 SDValue Store =
7657 DAG.getStore(Val.getValue(1), DL, Val, FIN,
7658 IsWin64 ? MachinePointerInfo::getFixedStack(
7659 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
7660 : MachinePointerInfo::getStack(MF, i * 8));
7661 MemOps.push_back(Store);
7662 FIN =
7663 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
7664 }
7665 }
7666 FuncInfo->setVarArgsGPRIndex(GPRIdx);
7667 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
7668
7669 if (Subtarget->hasFPARMv8() && !IsWin64) {
7670 auto FPRArgRegs = AArch64::getFPRArgRegs();
7671 const unsigned NumFPRArgRegs = FPRArgRegs.size();
7672 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
7673
7674 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
7675 int FPRIdx = 0;
7676 if (FPRSaveSize != 0) {
7677 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
7678
7679 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
7680
7681 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
7682 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
7683 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
7684
7685 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
7686 MachinePointerInfo::getStack(MF, i * 16));
7687 MemOps.push_back(Store);
7688 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
7689 DAG.getConstant(16, DL, PtrVT));
7690 }
7691 }
7692 FuncInfo->setVarArgsFPRIndex(FPRIdx);
7693 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
7694 }
7695
7696 if (!MemOps.empty()) {
7697 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
7698 }
7699}
7700
7701/// LowerCallResult - Lower the result values of a call into the
7702/// appropriate copies out of appropriate physical registers.
7703SDValue AArch64TargetLowering::LowerCallResult(
7704 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
7705 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
7706 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
7707 SDValue ThisVal, bool RequiresSMChange) const {
7708 DenseMap<unsigned, SDValue> CopiedRegs;
7709 // Copy all of the result registers out of their specified physreg.
7710 for (unsigned i = 0; i != RVLocs.size(); ++i) {
7711 CCValAssign VA = RVLocs[i];
7712
7713 // Pass 'this' value directly from the argument to return value, to avoid
7714 // reg unit interference
7715 if (i == 0 && isThisReturn) {
7716 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
7717 "unexpected return calling convention register assignment");
7718 InVals.push_back(ThisVal);
7719 continue;
7720 }
7721
7722 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
7723 // allows one use of a physreg per block.
7724 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
7725 if (!Val) {
7726 Val =
7727 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
7728 Chain = Val.getValue(1);
7729 InGlue = Val.getValue(2);
7730 CopiedRegs[VA.getLocReg()] = Val;
7731 }
7732
7733 switch (VA.getLocInfo()) {
7734 default:
7735 llvm_unreachable("Unknown loc info!");
7736 case CCValAssign::Full:
7737 break;
7738 case CCValAssign::BCvt:
7739 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
7740 break;
7741 case CCValAssign::AExtUpper:
7742 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
7743 DAG.getConstant(32, DL, VA.getLocVT()));
7744 [[fallthrough]];
7745 case CCValAssign::AExt:
7746 [[fallthrough]];
7747 case CCValAssign::ZExt:
7748 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
7749 break;
7750 }
7751
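// As on the argument side, results returned in FPRs across a streaming-mode
// change are wrapped in a COALESCER_BARRIER so the copy cannot be coalesced or
// rematerialised across the smstart/smstop emitted around the call.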
7752 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
7753 Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, Val.getValueType(),
7754 Val);
7755
7756 InVals.push_back(Val);
7757 }
7758
7759 return Chain;
7760}
7761
7762/// Return true if the calling convention is one that we can guarantee TCO for.
7763static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
7764 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
7765 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
7766}
7767
7768/// Return true if we might ever do TCO for calls with this calling convention.
7769 static bool mayTailCallThisCC(CallingConv::ID CC) {
7770 switch (CC) {
7771 case CallingConv::C:
7772 case CallingConv::AArch64_SVE_VectorCall:
7773 case CallingConv::PreserveMost:
7774 case CallingConv::PreserveAll:
7775 case CallingConv::PreserveNone:
7776 case CallingConv::Swift:
7777 case CallingConv::SwiftTail:
7778 case CallingConv::Tail:
7779 case CallingConv::Fast:
7780 return true;
7781 default:
7782 return false;
7783 }
7784}
7785
7786 static void analyzeCallOperands(const AArch64TargetLowering &TLI,
7787 const AArch64Subtarget *Subtarget,
7788 const TargetLowering::CallLoweringInfo &CLI,
7789 CCState &CCInfo) {
7790 const SelectionDAG &DAG = CLI.DAG;
7791 CallingConv::ID CalleeCC = CLI.CallConv;
7792 bool IsVarArg = CLI.IsVarArg;
7793 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7794 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
7795
7796 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
7797 // for the shadow store.
7798 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
7799 CCInfo.AllocateStack(32, Align(16));
7800
7801 unsigned NumArgs = Outs.size();
7802 for (unsigned i = 0; i != NumArgs; ++i) {
7803 MVT ArgVT = Outs[i].VT;
7804 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
7805
7806 bool UseVarArgCC = false;
7807 if (IsVarArg) {
7808 // On Windows, the fixed arguments in a vararg call are passed in GPRs
7809 // too, so use the vararg CC to force them to integer registers.
7810 if (IsCalleeWin64) {
7811 UseVarArgCC = true;
7812 } else {
7813 UseVarArgCC = !Outs[i].IsFixed;
7814 }
7815 }
7816
7817 if (!UseVarArgCC) {
7818 // Get type of the original argument.
7819 EVT ActualVT =
7820 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
7821 /*AllowUnknown*/ true);
7822 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
7823 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7824 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7825 ArgVT = MVT::i8;
7826 else if (ActualMVT == MVT::i16)
7827 ArgVT = MVT::i16;
7828 }
7829
7830 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
7831 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
7832 assert(!Res && "Call operand has unhandled type");
7833 (void)Res;
7834 }
7835}
7836
7837bool AArch64TargetLowering::isEligibleForTailCallOptimization(
7838 const CallLoweringInfo &CLI) const {
7839 CallingConv::ID CalleeCC = CLI.CallConv;
7840 if (!mayTailCallThisCC(CalleeCC))
7841 return false;
7842
7843 SDValue Callee = CLI.Callee;
7844 bool IsVarArg = CLI.IsVarArg;
7845 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7846 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7847 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7848 const SelectionDAG &DAG = CLI.DAG;
7849 MachineFunction &MF = DAG.getMachineFunction();
7850 const Function &CallerF = MF.getFunction();
7851 CallingConv::ID CallerCC = CallerF.getCallingConv();
7852
7853 // SME Streaming functions are not eligible for TCO as they may require
7854 // the streaming mode or ZA to be restored after returning from the call.
7855 SMEAttrs CallerAttrs(MF.getFunction());
7856 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
7857 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
7858 CallerAttrs.requiresLazySave(CalleeAttrs) ||
7859 CallerAttrs.hasStreamingBody())
7860 return false;
7861
7862 // Functions using the C or Fast calling convention that have an SVE signature
7863 // preserve more registers and should assume the SVE_VectorCall CC.
7864 // The check for matching callee-saved regs will determine whether it is
7865 // eligible for TCO.
7866 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
7867 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
7868 CallerCC = CallingConv::AArch64_SVE_VectorCall;
7869
7870 bool CCMatch = CallerCC == CalleeCC;
7871
7872 // When using the Windows calling convention on a non-windows OS, we want
7873 // to back up and restore X18 in such functions; we can't do a tail call
7874 // from those functions.
7875 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
7876 CalleeCC != CallingConv::Win64)
7877 return false;
7878
7879 // Byval parameters hand the function a pointer directly into the stack area
7880 // we want to reuse during a tail call. Working around this *is* possible (see
7881 // X86) but less efficient and uglier in LowerCall.
7882 for (Function::const_arg_iterator i = CallerF.arg_begin(),
7883 e = CallerF.arg_end();
7884 i != e; ++i) {
7885 if (i->hasByValAttr())
7886 return false;
7887
7888 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
7889 // In this case, it is necessary to save/restore X0 in the callee. Tail
7890 // call opt interferes with this. So we disable tail call opt when the
7891 // caller has an argument with "inreg" attribute.
7892
7893 // FIXME: Check whether the callee also has an "inreg" argument.
7894 if (i->hasInRegAttr())
7895 return false;
7896 }
7897
7898 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
7899 return CCMatch;
7900
7901 // Externally-defined functions with weak linkage should not be
7902 // tail-called on AArch64 when the OS does not support dynamic
7903 // pre-emption of symbols, as the AAELF spec requires normal calls
7904 // to undefined weak functions to be replaced with a NOP or jump to the
7905 // next instruction. The behaviour of branch instructions in this
7906 // situation (as used for tail calls) is implementation-defined, so we
7907 // cannot rely on the linker replacing the tail call with a return.
7908 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
7909 const GlobalValue *GV = G->getGlobal();
7910 const Triple &TT = getTargetMachine().getTargetTriple();
7911 if (GV->hasExternalWeakLinkage() &&
7912 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
7913 return false;
7914 }
7915
7916 // Now we search for cases where we can use a tail call without changing the
7917 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
7918 // concept.
7919
7920 // I want anyone implementing a new calling convention to think long and hard
7921 // about this assert.
7922 assert((!IsVarArg || CalleeCC == CallingConv::C) &&
7923 "Unexpected variadic calling convention");
7924
7925 LLVMContext &C = *DAG.getContext();
7926 // Check that the call results are passed in the same way.
7927 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
7928 CCAssignFnForCall(CalleeCC, IsVarArg),
7929 CCAssignFnForCall(CallerCC, IsVarArg)))
7930 return false;
7931 // The callee has to preserve all registers the caller needs to preserve.
7932 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7933 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
7934 if (!CCMatch) {
7935 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
7936 if (Subtarget->hasCustomCallingConv()) {
7937 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
7938 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
7939 }
7940 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
7941 return false;
7942 }
7943
7944 // Nothing more to check if the callee is taking no arguments
7945 if (Outs.empty())
7946 return true;
7947
7948 SmallVector<CCValAssign, 16> ArgLocs;
7949 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
7950
7951 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7952
7953 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
7954 // When the call is musttail, additional checks have already been done, so this check can be safely skipped.
7955 // At least two cases here: if caller is fastcc then we can't have any
7956 // memory arguments (we'd be expected to clean up the stack afterwards). If
7957 // caller is C then we could potentially use its argument area.
7958
7959 // FIXME: for now we take the most conservative of these in both cases:
7960 // disallow all variadic memory operands.
7961 for (const CCValAssign &ArgLoc : ArgLocs)
7962 if (!ArgLoc.isRegLoc())
7963 return false;
7964 }
7965
7966 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7967
7968 // If any of the arguments is passed indirectly, it must be SVE, so the
7969 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
7970 // allocate space on the stack. That is why we determine explicitly here that
7971 // the call cannot be a tail call.
7972 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
7973 assert((A.getLocInfo() != CCValAssign::Indirect ||
7974 A.getValVT().isScalableVector() ||
7975 Subtarget->isWindowsArm64EC()) &&
7976 "Expected value to be scalable");
7977 return A.getLocInfo() == CCValAssign::Indirect;
7978 }))
7979 return false;
7980
7981 // If the stack arguments for this call do not fit into our own save area then
7982 // the call cannot be made tail.
7983 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
7984 return false;
7985
7986 const MachineRegisterInfo &MRI = MF.getRegInfo();
7987 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
7988 return false;
7989
7990 return true;
7991}
7992
7993SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
7994 SelectionDAG &DAG,
7995 MachineFrameInfo &MFI,
7996 int ClobberedFI) const {
7997 SmallVector<SDValue, 8> ArgChains;
7998 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
7999 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
8000
8001 // Include the original chain at the beginning of the list. When this is
8002 // used by target LowerCall hooks, this helps legalize find the
8003 // CALLSEQ_BEGIN node.
8004 ArgChains.push_back(Chain);
8005
8006 // Add a chain value for each incoming stack-argument load that overlaps the clobbered area.
8007 for (SDNode *U : DAG.getEntryNode().getNode()->uses())
8008 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
8009 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
8010 if (FI->getIndex() < 0) {
8011 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
8012 int64_t InLastByte = InFirstByte;
8013 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
8014
8015 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
8016 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
8017 ArgChains.push_back(SDValue(L, 1));
8018 }
8019
8020 // Build a tokenfactor for all the chains.
8021 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
8022}
8023
8024bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
8025 bool TailCallOpt) const {
8026 return (CallCC == CallingConv::Fast && TailCallOpt) ||
8027 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
8028}
8029
8030// Check if the value is zero-extended from i1 to i8
8031static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
8032 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
8033 if (SizeInBits < 8)
8034 return false;
8035
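// Bit 0 may be either value, but bits [1, 7] (mask 0xFE) must be known zero for
// the argument to already be a correctly zero-extended i1; the KnownBits query
// below uses a limited depth (4) to keep this check cheap.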
8036 APInt RequiredZero(SizeInBits, 0xFE);
8037 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
8038 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
8039 return ZExtBool;
8040}
8041
8042void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
8043 SDNode *Node) const {
8044 // Live-in physreg copies that are glued to SMSTART are applied as
8045 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
8046 // register allocator to pass call args in callee saved regs, without extra
8047 // copies to avoid these fake clobbers of actually-preserved GPRs.
8048 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
8049 MI.getOpcode() == AArch64::MSRpstatePseudo) {
8050 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
8051 if (MachineOperand &MO = MI.getOperand(I);
8052 MO.isReg() && MO.isImplicit() && MO.isDef() &&
8053 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
8054 AArch64::GPR64RegClass.contains(MO.getReg())))
8055 MI.removeOperand(I);
8056
8057 // The SVE vector length can change when entering/leaving streaming mode.
8058 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
8059 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
8060 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
8061 /*IsImplicit=*/true));
8062 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
8063 /*IsImplicit=*/true));
8064 }
8065 }
8066
8067 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
8068 // have nothing to do with VG, were it not that they are used to materialise a
8069 // frame-address. If they contain a frame-index to a scalable vector, this
8070 // will likely require an ADDVL instruction to materialise the address, thus
8071 // reading VG.
8072 const MachineFunction &MF = *MI.getMF();
8073 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
8074 (MI.getOpcode() == AArch64::ADDXri ||
8075 MI.getOpcode() == AArch64::SUBXri)) {
8076 const MachineOperand &MO = MI.getOperand(1);
8077 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
8078 TargetStackID::ScalableVector)
8079 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
8080 /*IsImplicit=*/true));
8081 }
8082}
8083
8084 SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
8085 bool Enable, SDValue Chain,
8086 SDValue InGlue,
8087 unsigned Condition,
8088 SDValue PStateSM) const {
8089 MachineFunction &MF = DAG.getMachineFunction();
8090 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8091 FuncInfo->setHasStreamingModeChanges(true);
8092
8093 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8094 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
8095 SDValue MSROp =
8096 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
8097 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
8098 SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
8099 if (Condition != AArch64SME::Always) {
8100 assert(PStateSM && "PStateSM should be defined");
8101 Ops.push_back(PStateSM);
8102 }
8103 Ops.push_back(RegMask);
8104
8105 if (InGlue)
8106 Ops.push_back(InGlue);
8107
8108 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
8109 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
8110}
8111
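// Returns the condition under which the SMSTART/SMSTOP built by
// changeStreamingMode must actually fire: unconditionally, or only when the
// (streaming-compatible) caller happens to be in the opposite mode at runtime.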
8112static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
8113 const SMEAttrs &CalleeAttrs) {
8114 if (!CallerAttrs.hasStreamingCompatibleInterface() ||
8115 CallerAttrs.hasStreamingBody())
8116 return AArch64SME::Always;
8117 if (CalleeAttrs.hasNonStreamingInterface())
8118 return AArch64SME::IfCallerIsStreaming;
8119 if (CalleeAttrs.hasStreamingInterface())
8120 return AArch64SME::IfCallerIsNonStreaming;
8121
8122 llvm_unreachable("Unsupported attributes");
8123}
8124
8125/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
8126/// and add input and output parameter nodes.
8127SDValue
8128AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
8129 SmallVectorImpl<SDValue> &InVals) const {
8130 SelectionDAG &DAG = CLI.DAG;
8131 SDLoc &DL = CLI.DL;
8132 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8133 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8134 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8135 SDValue Chain = CLI.Chain;
8136 SDValue Callee = CLI.Callee;
8137 bool &IsTailCall = CLI.IsTailCall;
8138 CallingConv::ID &CallConv = CLI.CallConv;
8139 bool IsVarArg = CLI.IsVarArg;
8140
8141 MachineFunction &MF = DAG.getMachineFunction();
8142 MachineFunction::CallSiteInfo CSInfo;
8143 bool IsThisReturn = false;
8144
8145 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8146 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8147 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
8148 bool IsSibCall = false;
8149 bool GuardWithBTI = false;
8150
8151 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
8152 !Subtarget->noBTIAtReturnTwice()) {
8153 GuardWithBTI = FuncInfo->branchTargetEnforcement();
8154 }
8155
8156 // Analyze operands of the call, assigning locations to each operand.
8157 SmallVector<CCValAssign, 16> ArgLocs;
8158 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
8159
8160 if (IsVarArg) {
8161 unsigned NumArgs = Outs.size();
8162
8163 for (unsigned i = 0; i != NumArgs; ++i) {
8164 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
8165 report_fatal_error("Passing SVE types to variadic functions is "
8166 "currently not supported");
8167 }
8168 }
8169
8170 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
8171
8172 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8173 // Assign locations to each value returned by this call.
8174 SmallVector<CCValAssign, 16> RVLocs;
8175 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
8176 *DAG.getContext());
8177 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
8178
8179 // Check callee args/returns for SVE registers and set calling convention
8180 // accordingly.
8181 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
8182 auto HasSVERegLoc = [](CCValAssign &Loc) {
8183 if (!Loc.isRegLoc())
8184 return false;
8185 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
8186 AArch64::PPRRegClass.contains(Loc.getLocReg());
8187 };
8188 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
8189 CallConv = CallingConv::AArch64_SVE_VectorCall;
8190 }
8191
8192 if (IsTailCall) {
8193 // Check if it's really possible to do a tail call.
8194 IsTailCall = isEligibleForTailCallOptimization(CLI);
8195
8196 // A sibling call is one where we're under the usual C ABI and not planning
8197 // to change that but can still do a tail call:
8198 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
8199 CallConv != CallingConv::SwiftTail)
8200 IsSibCall = true;
8201
8202 if (IsTailCall)
8203 ++NumTailCalls;
8204 }
8205
8206 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
8207 report_fatal_error("failed to perform tail call elimination on a call "
8208 "site marked musttail");
8209
8210 // Get a count of how many bytes are to be pushed on the stack.
8211 unsigned NumBytes = CCInfo.getStackSize();
8212
8213 if (IsSibCall) {
8214 // Since we're not changing the ABI to make this a tail call, the memory
8215 // operands are already available in the caller's incoming argument space.
8216 NumBytes = 0;
8217 }
8218
8219 // FPDiff is the byte offset of the call's argument area from the callee's.
8220 // Stores to callee stack arguments will be placed in FixedStackSlots offset
8221 // by this amount for a tail call. In a sibling call it must be 0 because the
8222 // caller will deallocate the entire stack and the callee still expects its
8223 // arguments to begin at SP+0. Completely unused for non-tail calls.
8224 int FPDiff = 0;
8225
8226 if (IsTailCall && !IsSibCall) {
8227 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
8228
8229 // Since callee will pop argument stack as a tail call, we must keep the
8230 // popped size 16-byte aligned.
8231 NumBytes = alignTo(NumBytes, 16);
8232
8233 // FPDiff will be negative if this tail call requires more space than we
8234 // would automatically have in our incoming argument space. Positive if we
8235 // can actually shrink the stack.
8236 FPDiff = NumReusableBytes - NumBytes;
8237
8238 // Update the required reserved area if this is the tail call requiring the
8239 // most argument stack space.
8240 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
8241 FuncInfo->setTailCallReservedStack(-FPDiff);
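// For example, if the caller's own incoming argument area is 32 bytes but this
// tail call needs 48 bytes of stack arguments, FPDiff is -16 and an extra
// 16 bytes must be reserved in the prologue for the deepest such tail call.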
8242
8243 // The stack pointer must be 16-byte aligned at all times it's used for a
8244 // memory operation, which in practice means at *all* times and in
8245 // particular across call boundaries. Therefore our own arguments started at
8246 // a 16-byte aligned SP and the delta applied for the tail call should
8247 // satisfy the same constraint.
8248 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
8249 }
8250
8251 // Determine whether we need any streaming mode changes.
8252 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
8253 if (CLI.CB)
8254 CalleeAttrs = SMEAttrs(*CLI.CB);
8255 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
8256 CalleeAttrs = SMEAttrs(ES->getSymbol());
8257
8258 auto DescribeCallsite =
8259 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
8260 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
8261 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
8262 R << ore::NV("Callee", ES->getSymbol());
8263 else if (CLI.CB && CLI.CB->getCalledFunction())
8264 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
8265 else
8266 R << "unknown callee";
8267 R << "'";
8268 return R;
8269 };
8270
8271 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
8272 if (RequiresLazySave) {
8273 const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8274 MachinePointerInfo MPI =
8276 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
8277 TPIDR2.FrameIndex,
8278 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8279 SDValue NumZaSaveSlicesAddr =
8280 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
8281 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
8282 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8283 DAG.getConstant(1, DL, MVT::i32));
8284 Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
8285 MPI, MVT::i16);
8286 Chain = DAG.getNode(
8287 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8288 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8289 TPIDR2ObjAddr);
8290 OptimizationRemarkEmitter ORE(&MF.getFunction());
8291 ORE.emit([&]() {
8292 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8293 CLI.CB)
8294 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8295 &MF.getFunction());
8296 return DescribeCallsite(R) << " sets up a lazy save for ZA";
8297 });
8298 }
8299
8300 SDValue PStateSM;
8301 bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
8302 if (RequiresSMChange) {
8303 if (CallerAttrs.hasStreamingInterfaceOrBody())
8304 PStateSM = DAG.getConstant(1, DL, MVT::i64);
8305 else if (CallerAttrs.hasNonStreamingInterface())
8306 PStateSM = DAG.getConstant(0, DL, MVT::i64);
8307 else
8308 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
8309 OptimizationRemarkEmitter ORE(&MF.getFunction());
8310 ORE.emit([&]() {
8311 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
8312 CLI.CB)
8313 : OptimizationRemarkAnalysis("sme", "SMETransition",
8314 &MF.getFunction());
8315 DescribeCallsite(R) << " requires a streaming mode transition";
8316 return R;
8317 });
8318 }
8319
8320 SDValue ZTFrameIdx;
8321 MachineFrameInfo &MFI = MF.getFrameInfo();
8322 bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs);
8323
8324 // If the caller has ZT0 state which will not be preserved by the callee,
8325 // spill ZT0 before the call.
8326 if (ShouldPreserveZT0) {
8327 unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
8328 ZTFrameIdx = DAG.getFrameIndex(
8329 ZTObj,
8330 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8331
8332 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
8333 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8334 }
8335
8336 // If the caller shares ZT0 but the callee does not share ZA, we need to stop
8337 // PSTATE.ZA before the call if there is no lazy-save active.
8338 bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs);
8339 assert((!DisableZA || !RequiresLazySave) &&
8340 "Lazy-save should have PSTATE.SM=1 on entry to the function");
8341
8342 if (DisableZA)
8343 Chain = DAG.getNode(
8344 AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
8345 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8346 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8347
8348 // Adjust the stack pointer for the new arguments...
8349 // These operations are automatically eliminated by the prolog/epilog pass
8350 if (!IsSibCall)
8351 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
8352
8353 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
8355
8354 getPointerTy(DAG.getDataLayout()));
8355
8356 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
8357 SmallSet<unsigned, 8> RegsUsed;
8358 SmallVector<SDValue, 8> MemOpChains;
8359 auto PtrVT = getPointerTy(DAG.getDataLayout());
8360
8361 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
8362 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
8363 for (const auto &F : Forwards) {
8364 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
8365 RegsToPass.emplace_back(F.PReg, Val);
8366 }
8367 }
8368
8369 // Walk the register/memloc assignments, inserting copies/loads.
8370 unsigned ExtraArgLocs = 0;
8371 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
8372 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8373 SDValue Arg = OutVals[i];
8374 ISD::ArgFlagsTy Flags = Outs[i].Flags;
8375
8376 // Promote the value if needed.
8377 switch (VA.getLocInfo()) {
8378 default:
8379 llvm_unreachable("Unknown loc info!");
8380 case CCValAssign::Full:
8381 break;
8382 case CCValAssign::SExt:
8383 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
8384 break;
8385 case CCValAssign::ZExt:
8386 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8387 break;
8388 case CCValAssign::AExt:
8389 if (Outs[i].ArgVT == MVT::i1) {
8390 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
8391 //
8392 // Check if we actually have to do this, because the value may
8393 // already be zero-extended.
8394 //
8395 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
8396 // and rely on DAGCombiner to fold this, because the following
8397 // (anyext i32) is combined with (zext i8) in DAG.getNode:
8398 //
8399 // (ext (zext x)) -> (zext x)
8400 //
8401 // This will give us (zext i32), which we cannot remove, so
8402 // try to check this beforehand.
8403 if (!checkZExtBool(Arg, DAG)) {
8404 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8405 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
8406 }
8407 }
8408 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8409 break;
8410 case CCValAssign::AExtUpper:
8411 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8412 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8413 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8414 DAG.getConstant(32, DL, VA.getLocVT()));
8415 break;
8416 case CCValAssign::BCvt:
8417 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
8418 break;
8419 case CCValAssign::Trunc:
8420 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8421 break;
8422 case CCValAssign::FPExt:
8423 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
8424 break;
8425 case CCValAssign::Indirect: {
8426 bool isScalable = VA.getValVT().isScalableVT();
8427 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
8428 "Indirect arguments should be scalable on most subtargets");
8429
8430 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
8431 uint64_t PartSize = StoreSize;
8432 unsigned NumParts = 1;
8433 if (Outs[i].Flags.isInConsecutiveRegs()) {
8434 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8435 ++NumParts;
8436 StoreSize *= NumParts;
8437 }
8438
8439 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
8440 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
8441 MachineFrameInfo &MFI = MF.getFrameInfo();
8442 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
8443 if (isScalable)
8444 MFI.setStackID(FI, TargetStackID::ScalableVector);
8445
8446 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
8447 SDValue Ptr = DAG.getFrameIndex(
8448 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8449 SDValue SpillSlot = Ptr;
8450
8451 // Ensure we generate all stores for each tuple part, whilst updating the
8452 // pointer after each store correctly using vscale.
8453 while (NumParts) {
8454 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
8455 MemOpChains.push_back(Store);
8456
8457 NumParts--;
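// Advance the spill pointer by one part. For scalable values the step is
// PartSize * vscale (e.g. an nxv16i8 part advances 16 * vscale bytes);
// otherwise it is a plain constant byte offset.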
8458 if (NumParts > 0) {
8459 SDValue BytesIncrement;
8460 if (isScalable) {
8461 BytesIncrement = DAG.getVScale(
8462 DL, Ptr.getValueType(),
8463 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8464 } else {
8465 BytesIncrement = DAG.getConstant(
8466 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8467 Ptr.getValueType());
8468 }
8469 SDNodeFlags Flags;
8470 Flags.setNoUnsignedWrap(true);
8471
8472 MPI = MachinePointerInfo(MPI.getAddrSpace());
8473 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8474 BytesIncrement, Flags);
8475 ExtraArgLocs++;
8476 i++;
8477 }
8478 }
8479
8480 Arg = SpillSlot;
8481 break;
8482 }
8483
8484 if (VA.isRegLoc()) {
8485 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
8486 Outs[0].VT == MVT::i64) {
8487 assert(VA.getLocVT() == MVT::i64 &&
8488 "unexpected calling convention register assignment");
8489 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
8490 "unexpected use of 'returned'");
8491 IsThisReturn = true;
8492 }
8493 if (RegsUsed.count(VA.getLocReg())) {
8494 // If this register has already been used then we're trying to pack
8495 // parts of an [N x i32] into an X-register. The extension type will
8496 // take care of putting the two halves in the right place but we have to
8497 // combine them.
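// For example, both i32 halves of a [2 x i32] argument can be assigned to X0:
// one lands in bits [31:0] and the other (via AExtUpper) in bits [63:32], and
// the OR below merges them into the single 64-bit register value.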
8498 SDValue &Bits =
8499 llvm::find_if(RegsToPass,
8500 [=](const std::pair<unsigned, SDValue> &Elt) {
8501 return Elt.first == VA.getLocReg();
8502 })
8503 ->second;
8504 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8505 // Call site info is used for function's parameter entry value
8506 // tracking. For now we track only simple cases when parameter
8507 // is transferred through whole register.
8508 llvm::erase_if(CSInfo.ArgRegPairs,
8509 [&VA](MachineFunction::ArgRegPair ArgReg) {
8510 return ArgReg.Reg == VA.getLocReg();
8511 });
8512 } else {
8513 // Add an extra level of indirection for streaming mode changes by
8514 // using a pseudo copy node that cannot be rematerialised between a
8515 // smstart/smstop and the call by the simple register coalescer.
8516 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
8517 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8518 Arg.getValueType(), Arg);
8519 RegsToPass.emplace_back(VA.getLocReg(), Arg);
8520 RegsUsed.insert(VA.getLocReg());
8521 const TargetOptions &Options = DAG.getTarget().Options;
8522 if (Options.EmitCallSiteInfo)
8523 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
8524 }
8525 } else {
8526 assert(VA.isMemLoc());
8527
8528 SDValue DstAddr;
8529 MachinePointerInfo DstInfo;
8530
8531 // FIXME: This works on big-endian for composite byvals, which are the
8532 // common case. It should also work for fundamental types.
8533 uint32_t BEAlign = 0;
8534 unsigned OpSize;
8535 if (VA.getLocInfo() == CCValAssign::Indirect ||
8536 VA.getLocInfo() == CCValAssign::Trunc)
8537 OpSize = VA.getLocVT().getFixedSizeInBits();
8538 else
8539 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
8540 : VA.getValVT().getSizeInBits();
8541 OpSize = (OpSize + 7) / 8;
8542 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
8543 !Flags.isInConsecutiveRegs()) {
8544 if (OpSize < 8)
8545 BEAlign = 8 - OpSize;
8546 }
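// e.g. on big-endian an i32 stack argument lives in the high half of its
// 8-byte slot, so BEAlign is 4 and the store below lands at offset + 4.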
8547 unsigned LocMemOffset = VA.getLocMemOffset();
8548 int32_t Offset = LocMemOffset + BEAlign;
8549 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8550 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8551
8552 if (IsTailCall) {
8553 Offset = Offset + FPDiff;
8554 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
8555
8556 DstAddr = DAG.getFrameIndex(FI, PtrVT);
8557 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
8558
8559 // Make sure any stack arguments overlapping with where we're storing
8560 // are loaded before this eventual operation. Otherwise they'll be
8561 // clobbered.
8562 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
8563 } else {
8564 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8565
8566 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8567 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
8568 }
8569
8570 if (Outs[i].Flags.isByVal()) {
8571 SDValue SizeNode =
8572 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
8573 SDValue Cpy = DAG.getMemcpy(
8574 Chain, DL, DstAddr, Arg, SizeNode,
8575 Outs[i].Flags.getNonZeroByValAlign(),
8576 /*isVol = */ false, /*AlwaysInline = */ false,
8577 /*isTailCall = */ false, DstInfo, MachinePointerInfo());
8578
8579 MemOpChains.push_back(Cpy);
8580 } else {
8581 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
8582 // promoted to a legal register type i32, we should truncate Arg back to
8583 // i1/i8/i16.
8584 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
8585 VA.getValVT() == MVT::i16)
8586 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
8587
8588 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
8589 MemOpChains.push_back(Store);
8590 }
8591 }
8592 }
8593
8594 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
8595 SDValue ParamPtr = StackPtr;
8596 if (IsTailCall) {
8597 // Create a dummy object at the top of the stack that can be used to get
8598 // the SP after the epilogue
8599 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
8600 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
8601 }
8602
8603 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
8604 // describing the argument list. x4 contains the address of the
8605 // first stack parameter. x5 contains the size in bytes of all parameters
8606 // passed on the stack.
8607 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
8608 RegsToPass.emplace_back(AArch64::X5,
8609 DAG.getConstant(NumBytes, DL, MVT::i64));
8610 }
8611
8612 if (!MemOpChains.empty())
8613 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
8614
8615 SDValue InGlue;
8616 if (RequiresSMChange) {
8617
8618 Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL,
8619 DAG.getVTList(MVT::Other, MVT::Glue), Chain);
8620 InGlue = Chain.getValue(1);
8621
8622 SDValue NewChain = changeStreamingMode(
8623 DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
8624 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8625 Chain = NewChain.getValue(0);
8626 InGlue = NewChain.getValue(1);
8627 }
8628
8629 // Build a sequence of copy-to-reg nodes chained together with token chain
8630 // and flag operands which copy the outgoing args into the appropriate regs.
8631 for (auto &RegToPass : RegsToPass) {
8632 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
8633 RegToPass.second, InGlue);
8634 InGlue = Chain.getValue(1);
8635 }
8636
8637 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
8638 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
8639 // node so that legalize doesn't hack it.
8640 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8641 auto GV = G->getGlobal();
8642 unsigned OpFlags =
8643 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
8644 if (OpFlags & AArch64II::MO_GOT) {
8645 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8646 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8647 } else {
8648 const GlobalValue *GV = G->getGlobal();
8649 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8650 }
8651 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
8652 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
8653 Subtarget->isTargetMachO()) ||
8654 MF.getFunction().getParent()->getRtLibUseGOT();
8655 const char *Sym = S->getSymbol();
8656 if (UseGot) {
8657 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
8658 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8659 } else {
8660 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
8661 }
8662 }
8663
8664 // We don't usually want to end the call-sequence here because we would tidy
8665 // the frame up *after* the call, however in the ABI-changing tail-call case
8666 // we've carefully laid out the parameters so that when sp is reset they'll be
8667 // in the correct location.
8668 if (IsTailCall && !IsSibCall) {
8669 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
8670 InGlue = Chain.getValue(1);
8671 }
8672
8673 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
8674
8675 std::vector<SDValue> Ops;
8676 Ops.push_back(Chain);
8677 Ops.push_back(Callee);
8678
8679 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
8680 // be expanded to the call, directly followed by a special marker sequence and
8681 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
8682 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
8683 assert(!IsTailCall &&
8684 "tail calls cannot be marked with clang.arc.attachedcall");
8685 Opc = AArch64ISD::CALL_RVMARKER;
8686
8687 // Add a target global address for the retainRV/claimRV runtime function
8688 // just before the call target.
8689 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
8690 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
8691 Ops.insert(Ops.begin() + 1, GA);
8692 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8693 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
8694 } else if (GuardWithBTI) {
8695 Opc = AArch64ISD::CALL_BTI;
8696 }
8697
8698 if (IsTailCall) {
8699 // Each tail call may have to adjust the stack by a different amount, so
8700 // this information must travel along with the operation for eventual
8701 // consumption by emitEpilogue.
8702 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
8703 }
8704
8705 if (CLI.PAI) {
8706 const uint64_t Key = CLI.PAI->Key;
8707 assert((Key == AArch64PACKey::IA || Key == AArch64PACKey::IB) &&
8708 "Invalid auth call key");
8709
8710 // Split the discriminator into address/integer components.
8711 SDValue AddrDisc, IntDisc;
8712 std::tie(IntDisc, AddrDisc) =
8713 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
8714
8715 if (Opc == AArch64ISD::CALL_RVMARKER)
8716 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
8717 else
8718 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
8719 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
8720 Ops.push_back(IntDisc);
8721 Ops.push_back(AddrDisc);
8722 }
8723
8724 // Add argument registers to the end of the list so that they are known live
8725 // into the call.
8726 for (auto &RegToPass : RegsToPass)
8727 Ops.push_back(DAG.getRegister(RegToPass.first,
8728 RegToPass.second.getValueType()));
8729
8730 // Add a register mask operand representing the call-preserved registers.
8731 const uint32_t *Mask;
8732 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8733 if (IsThisReturn) {
8734 // For 'this' returns, use the X0-preserving mask if applicable
8735 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
8736 if (!Mask) {
8737 IsThisReturn = false;
8738 Mask = TRI->getCallPreservedMask(MF, CallConv);
8739 }
8740 } else
8741 Mask = TRI->getCallPreservedMask(MF, CallConv);
8742
8743 if (Subtarget->hasCustomCallingConv())
8744 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
8745
8746 if (TRI->isAnyArgRegReserved(MF))
8747 TRI->emitReservedArgRegCallError(MF);
8748
8749 assert(Mask && "Missing call preserved mask for calling convention");
8750 Ops.push_back(DAG.getRegisterMask(Mask));
8751
8752 if (InGlue.getNode())
8753 Ops.push_back(InGlue);
8754
8755 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8756
8757 // If we're doing a tail call, use a TC_RETURN here rather than an
8758 // actual call instruction.
8759 if (IsTailCall) {
8760 MF.getFrameInfo().setHasTailCall();
8761 SDValue Ret = DAG.getNode(Opc, DL, NodeTys, Ops);
8762 if (IsCFICall)
8763 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8764
8765 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
8766 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
8767 return Ret;
8768 }
8769
8770 // Returns a chain and a flag for retval copy to use.
8771 Chain = DAG.getNode(Opc, DL, NodeTys, Ops);
8772 if (IsCFICall)
8773 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8774
8775 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
8776 InGlue = Chain.getValue(1);
8777 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
8778
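// For callee-pop conventions (fastcc under GuaranteedTailCallOpt, tail and
// swifttail) CALLSEQ_END records the 16-byte aligned amount the callee itself
// pops; for the standard AAPCS it is 0 and the caller reclaims the space.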
8779 uint64_t CalleePopBytes =
8780 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
8781
8782 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
8783 InGlue = Chain.getValue(1);
8784
8785 // Handle result values, copying them out of physregs into vregs that we
8786 // return.
8787 SDValue Result = LowerCallResult(
8788 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
8789 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
8790
8791 if (!Ins.empty())
8792 InGlue = Result.getValue(Result->getNumValues() - 1);
8793
8794 if (RequiresSMChange) {
8795 assert(PStateSM && "Expected a PStateSM to be set");
8796 Result = changeStreamingMode(
8797 DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
8798 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8799 InGlue = Result.getValue(1);
8800
8801 Result =
8802 DAG.getNode(AArch64ISD::VG_RESTORE, DL,
8803 DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue});
8804 }
8805
8806 if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
8807 // Unconditionally resume ZA.
8808 Result = DAG.getNode(
8809 AArch64ISD::SMSTART, DL, MVT::Other, Result,
8810 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8811 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8812
8813 if (ShouldPreserveZT0)
8814 Result =
8815 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8816 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8817
8818 if (RequiresLazySave) {
8819 // Conditionally restore the lazy save using a pseudo node.
8820 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8821 SDValue RegMask = DAG.getRegisterMask(
8822 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
8823 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8824 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
8825 SDValue TPIDR2_EL0 = DAG.getNode(
8826 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
8827 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8828
8829 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8830 // RESTORE_ZA pseudo.
8831 SDValue Glue;
8832 SDValue TPIDR2Block = DAG.getFrameIndex(
8833 TPIDR2.FrameIndex,
8834 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8835 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
8836 Result =
8837 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8838 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8839 RestoreRoutine, RegMask, Result.getValue(1)});
8840
8841 // Finally reset the TPIDR2_EL0 register to 0.
8842 Result = DAG.getNode(
8843 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
8844 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8845 DAG.getConstant(0, DL, MVT::i64));
8846 TPIDR2.Uses++;
8847 }
8848
8849 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
8850 for (unsigned I = 0; I < InVals.size(); ++I) {
8851 // The smstart/smstop is chained as part of the call, but when the
8852 // resulting chain is discarded (which happens when the call is not part
8853 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
8854 // smstart/smstop is chained to the result value. We can do that by doing
8855 // a vreg -> vreg copy.
8856 Register Reg = MF.getRegInfo().createVirtualRegister(
8857 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8858 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
8859 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
8860 InVals[I].getValueType());
8861 }
8862 }
8863
8864 if (CallConv == CallingConv::PreserveNone) {
8865 for (const ISD::OutputArg &O : Outs) {
8866 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
8867 O.Flags.isSwiftAsync()) {
8870 MF.getFunction(),
8871 "Swift attributes can't be used with preserve_none",
8872 DL.getDebugLoc()));
8873 break;
8874 }
8875 }
8876 }
8877
8878 return Result;
8879}
8880
8881bool AArch64TargetLowering::CanLowerReturn(
8882 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
8883 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
8884 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8886 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
8887 return CCInfo.CheckReturn(Outs, RetCC);
8888}
8889
8890SDValue
8891AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
8892 bool isVarArg,
8893 const SmallVectorImpl<ISD::OutputArg> &Outs,
8894 const SmallVectorImpl<SDValue> &OutVals,
8895 const SDLoc &DL, SelectionDAG &DAG) const {
8896 auto &MF = DAG.getMachineFunction();
8897 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8898
8899 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8900 SmallVector<CCValAssign, 16> RVLocs;
8901 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
8902 CCInfo.AnalyzeReturn(Outs, RetCC);
8903
8904 // Copy the result values into the output registers.
8905 SDValue Glue;
8906 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
8907 SmallSet<unsigned, 4> RegsUsed;
8908 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
8909 ++i, ++realRVLocIdx) {
8910 CCValAssign &VA = RVLocs[i];
8911 assert(VA.isRegLoc() && "Can only return in registers!");
8912 SDValue Arg = OutVals[realRVLocIdx];
8913
8914 switch (VA.getLocInfo()) {
8915 default:
8916 llvm_unreachable("Unknown loc info!");
8917 case CCValAssign::Full:
8918 if (Outs[i].ArgVT == MVT::i1) {
8919 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
8920 // value. This is strictly redundant on Darwin (which uses "zeroext
8921 // i1"), but will be optimised out before ISel.
8922 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8923 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8924 }
8925 break;
8926 case CCValAssign::BCvt:
8927 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
8928 break;
8929 case CCValAssign::AExt:
8930 case CCValAssign::ZExt:
8931 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8932 break;
8933 case CCValAssign::AExtUpper:
8934 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8935 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8936 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8937 DAG.getConstant(32, DL, VA.getLocVT()));
8938 break;
8939 }
8940
8941 if (RegsUsed.count(VA.getLocReg())) {
8942 SDValue &Bits =
8943 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
8944 return Elt.first == VA.getLocReg();
8945 })->second;
8946 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8947 } else {
8948 RetVals.emplace_back(VA.getLocReg(), Arg);
8949 RegsUsed.insert(VA.getLocReg());
8950 }
8951 }
8952
8953 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8954
8955 // Emit SMSTOP before returning from a locally streaming function
8956 SMEAttrs FuncAttrs(MF.getFunction());
8957 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
8958 if (FuncAttrs.hasStreamingCompatibleInterface()) {
8959 Register Reg = FuncInfo->getPStateSMReg();
8960 assert(Reg.isValid() && "PStateSM Register is invalid");
8961 SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
8962 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8963 /*Glue*/ SDValue(),
8964 AArch64SME::IfCallerIsStreaming, PStateSM);
8965 } else
8966 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8967 /*Glue*/ SDValue(), AArch64SME::Always);
8968 Glue = Chain.getValue(1);
8969 }
8970
8971 SmallVector<SDValue, 4> RetOps(1, Chain);
8972 for (auto &RetVal : RetVals) {
8973 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
8974 isPassedInFPR(RetVal.second.getValueType()))
8975 RetVal.second = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8976 RetVal.second.getValueType(), RetVal.second);
8977 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
8978 Glue = Chain.getValue(1);
8979 RetOps.push_back(
8980 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
8981 }
8982
8983 // Windows AArch64 ABIs require that for returning structs by value we copy
8984 // the sret argument into X0 for the return.
8985 // We saved the argument into a virtual register in the entry block,
8986 // so now we copy the value out and into X0.
8987 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
8988 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
8989 getPointerTy(MF.getDataLayout()));
8990
8991 unsigned RetValReg = AArch64::X0;
8992 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8993 RetValReg = AArch64::X8;
8994 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
8995 Glue = Chain.getValue(1);
8996
8997 RetOps.push_back(
8998 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
8999 }
9000
9001 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
9002 if (I) {
9003 for (; *I; ++I) {
9004 if (AArch64::GPR64RegClass.contains(*I))
9005 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
9006 else if (AArch64::FPR64RegClass.contains(*I))
9007 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
9008 else
9009 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
9010 }
9011 }
9012
9013 RetOps[0] = Chain; // Update chain.
9014
9015 // Add the glue if we have it.
9016 if (Glue.getNode())
9017 RetOps.push_back(Glue);
9018
9019 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9020 // ARM64EC entry thunks use a special return sequence: instead of a regular
9021 // "ret" instruction, they need to explicitly call the emulator.
9022 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9023 SDValue Arm64ECRetDest =
9024 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
9025 Arm64ECRetDest =
9026 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
9027 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
9028 MachinePointerInfo());
9029 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
9030 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
9031 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
9032 }
9033
9034 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
9035}
9036
9037//===----------------------------------------------------------------------===//
9038// Other Lowering Code
9039//===----------------------------------------------------------------------===//
9040
9041SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
9042 SelectionDAG &DAG,
9043 unsigned Flag) const {
9044 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
9045 N->getOffset(), Flag);
9046}
9047
9048SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
9049 SelectionDAG &DAG,
9050 unsigned Flag) const {
9051 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
9052}
9053
9054SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
9055 SelectionDAG &DAG,
9056 unsigned Flag) const {
9057 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
9058 N->getOffset(), Flag);
9059}
9060
9061SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
9062 SelectionDAG &DAG,
9063 unsigned Flag) const {
9064 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
9065}
9066
9067SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
9068 SelectionDAG &DAG,
9069 unsigned Flag) const {
9070 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
9071}
9072
9073// (loadGOT sym)
9074template <class NodeTy>
9075SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
9076 unsigned Flags) const {
9077 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
9078 SDLoc DL(N);
9079 EVT Ty = getPointerTy(DAG.getDataLayout());
9080 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
9081 // FIXME: Once remat is capable of dealing with instructions with register
9082 // operands, expand this into two nodes instead of using a wrapper node.
9083 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
9084}
9085
9086// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
9087template <class NodeTy>
9088SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
9089 unsigned Flags) const {
9090 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
9091 SDLoc DL(N);
9092 EVT Ty = getPointerTy(DAG.getDataLayout());
9093 const unsigned char MO_NC = AArch64II::MO_NC;
9094 return DAG.getNode(
9095 AArch64ISD::WrapperLarge, DL, Ty,
9096 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
9097 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
9098 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
9099 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
9100}
9101
9102// (addlow (adrp %hi(sym)) %lo(sym))
9103template <class NodeTy>
9104SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
9105 unsigned Flags) const {
9106 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
9107 SDLoc DL(N);
9108 EVT Ty = getPointerTy(DAG.getDataLayout());
9109 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
9110 SDValue Lo = getTargetNode(N, Ty, DAG,
9111 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
9112 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
9113 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
9114}
9115
9116// (adr sym)
9117template <class NodeTy>
9118SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
9119 unsigned Flags) const {
9120 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
9121 SDLoc DL(N);
9122 EVT Ty = getPointerTy(DAG.getDataLayout());
9123 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
9124 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
9125}
9126
9127SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
9128 SelectionDAG &DAG) const {
9129 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
9130 const GlobalValue *GV = GN->getGlobal();
9131 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
9132
9133 if (OpFlags != AArch64II::MO_NO_FLAG)
9134 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
9135 "unexpected offset in global node");
9136
9137 // This also catches the large code model case for Darwin, and tiny code
9138 // model with got relocations.
9139 if ((OpFlags & AArch64II::MO_GOT) != 0) {
9140 return getGOT(GN, DAG, OpFlags);
9141 }
9142
9143 SDValue Result;
9144 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9145 !getTargetMachine().isPositionIndependent()) {
9146 Result = getAddrLarge(GN, DAG, OpFlags);
9147 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
9148 Result = getAddrTiny(GN, DAG, OpFlags);
9149 } else {
9150 Result = getAddr(GN, DAG, OpFlags);
9151 }
9152 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9153 SDLoc DL(GN);
9154 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
9155 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
9156 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
9157 return Result;
9158}
9159
9160/// Convert a TLS address reference into the correct sequence of loads
9161/// and calls to compute the variable's address (for Darwin, currently) and
9162/// return an SDValue containing the final node.
9163
9164/// Darwin only has one TLS scheme which must be capable of dealing with the
9165/// fully general situation, in the worst case. This means:
9166/// + "extern __thread" declaration.
9167/// + Defined in a possibly unknown dynamic library.
9168///
9169/// The general system is that each __thread variable has a [3 x i64] descriptor
9170/// which contains information used by the runtime to calculate the address. The
9171/// only part of this the compiler needs to know about is the first xword, which
9172/// contains a function pointer that must be called with the address of the
9173/// entire descriptor in "x0".
9174///
9175/// Since this descriptor may be in a different unit, in general even the
9176/// descriptor must be accessed via an indirect load. The "ideal" code sequence
9177/// is:
9178/// adrp x0, _var@TLVPPAGE
9179/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
9180/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
9181/// ; the function pointer
9182/// blr x1 ; Uses descriptor address in x0
9183/// ; Address of _var is now in x0.
9184///
9185/// If the address of _var's descriptor *is* known to the linker, then it can
9186/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
9187/// a slight efficiency gain.
9188SDValue
9189AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
9190 SelectionDAG &DAG) const {
9191 assert(Subtarget->isTargetDarwin() &&
9192 "This function expects a Darwin target");
9193
9194 SDLoc DL(Op);
9195 MVT PtrVT = getPointerTy(DAG.getDataLayout());
9196 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
9197 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
9198
9199 SDValue TLVPAddr =
9200 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9201 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
9202
9203 // The first entry in the descriptor is a function pointer that we must call
9204 // to obtain the address of the variable.
9205 SDValue Chain = DAG.getEntryNode();
9206 SDValue FuncTLVGet = DAG.getLoad(
9207 PtrMemVT, DL, Chain, DescAddr,
9208 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
9209 Align(PtrMemVT.getSizeInBits() / 8),
9210 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
9211 Chain = FuncTLVGet.getValue(1);
9212
9213 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
9214 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
9215
9216 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9217 MFI.setAdjustsStack(true);
9218
9219 // TLS calls preserve all registers except those that absolutely must be
9220 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
9221 // silly).
9222 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9223 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
9224 if (Subtarget->hasCustomCallingConv())
9225 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
9226
9227 // Finally, we can make the call. This is just a degenerate version of a
9228 // normal AArch64 call node: x0 takes the address of the descriptor, and
9229 // returns the address of the variable in this thread.
9230 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
9231 Chain =
9232 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
9233 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
9234 DAG.getRegisterMask(Mask), Chain.getValue(1));
9235 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
9236}
9237
9238/// Convert a thread-local variable reference into a sequence of instructions to
9239/// compute the variable's address for the local exec TLS model of ELF targets.
9240/// The sequence depends on the maximum TLS area size.
9241SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
9242 SDValue ThreadBase,
9243 const SDLoc &DL,
9244 SelectionDAG &DAG) const {
9245 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9246 SDValue TPOff, Addr;
9247
9248 switch (DAG.getTarget().Options.TLSSize) {
9249 default:
9250 llvm_unreachable("Unexpected TLS size");
9251
9252 case 12: {
9253 // mrs x0, TPIDR_EL0
9254 // add x0, x0, :tprel_lo12:a
9255 SDValue Var = DAG.getTargetGlobalAddress(
9256 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
9257 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
9258 Var,
9259 DAG.getTargetConstant(0, DL, MVT::i32)),
9260 0);
9261 }
9262
9263 case 24: {
9264 // mrs x0, TPIDR_EL0
9265 // add x0, x0, :tprel_hi12:a
9266 // add x0, x0, :tprel_lo12_nc:a
9267 SDValue HiVar = DAG.getTargetGlobalAddress(
9268 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9269 SDValue LoVar = DAG.getTargetGlobalAddress(
9270 GV, DL, PtrVT, 0,
9271 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9272 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
9273 HiVar,
9274 DAG.getTargetConstant(0, DL, MVT::i32)),
9275 0);
9276 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
9277 LoVar,
9278 DAG.getTargetConstant(0, DL, MVT::i32)),
9279 0);
9280 }
9281
9282 case 32: {
9283 // mrs x1, TPIDR_EL0
9284 // movz x0, #:tprel_g1:a
9285 // movk x0, #:tprel_g0_nc:a
9286 // add x0, x1, x0
9287 SDValue HiVar = DAG.getTargetGlobalAddress(
9288 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
9289 SDValue LoVar = DAG.getTargetGlobalAddress(
9290 GV, DL, PtrVT, 0,
9291 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
9292 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
9293 DAG.getTargetConstant(16, DL, MVT::i32)),
9294 0);
9295 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
9296 DAG.getTargetConstant(0, DL, MVT::i32)),
9297 0);
9298 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9299 }
9300
9301 case 48: {
9302 // mrs x1, TPIDR_EL0
9303 // movz x0, #:tprel_g2:a
9304 // movk x0, #:tprel_g1_nc:a
9305 // movk x0, #:tprel_g0_nc:a
9306 // add x0, x1, x0
9307 SDValue HiVar = DAG.getTargetGlobalAddress(
9308 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
9309 SDValue MiVar = DAG.getTargetGlobalAddress(
9310 GV, DL, PtrVT, 0,
9311 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
9312 SDValue LoVar = DAG.getTargetGlobalAddress(
9313 GV, DL, PtrVT, 0,
9314 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
9315 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
9316 DAG.getTargetConstant(32, DL, MVT::i32)),
9317 0);
9318 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
9319 DAG.getTargetConstant(16, DL, MVT::i32)),
9320 0);
9321 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
9322 DAG.getTargetConstant(0, DL, MVT::i32)),
9323 0);
9324 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9325 }
9326 }
9327}
9328
9329/// When accessing thread-local variables under either the general-dynamic or
9330/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
9331/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
9332/// is a function pointer to carry out the resolution.
9333///
9334/// The sequence is:
9335/// adrp x0, :tlsdesc:var
9336/// ldr x1, [x0, #:tlsdesc_lo12:var]
9337/// add x0, x0, #:tlsdesc_lo12:var
9338/// .tlsdesccall var
9339/// blr x1
9340/// (TPIDR_EL0 offset now in x0)
9341///
9342/// The above sequence must be produced unscheduled, to enable the linker to
9343/// optimize/relax this sequence.
9344/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
9345/// above sequence, and expanded really late in the compilation flow, to ensure
9346/// the sequence is produced as per above.
9347SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
9348 const SDLoc &DL,
9349 SelectionDAG &DAG) const {
9350 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9351
9352 SDValue Chain = DAG.getEntryNode();
9353 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
9354
9355 Chain =
9356 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
9357 SDValue Glue = Chain.getValue(1);
9358
9359 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
9360}
9361
9362SDValue
9363AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
9364 SelectionDAG &DAG) const {
9365 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
9366
9367 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9368
9369 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
9370
9371 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
9372 if (Model == TLSModel::LocalDynamic)
9373 Model = TLSModel::GeneralDynamic;
9374 }
9375
9376 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9377 Model != TLSModel::LocalExec)
9378 report_fatal_error("ELF TLS only supported in small memory model or "
9379 "in local exec TLS model");
9380 // Different choices can be made for the maximum size of the TLS area for a
9381 // module. For the small address model, the default TLS size is 16MiB and the
9382 // maximum TLS size is 4GiB.
9383 // FIXME: add tiny and large code model support for TLS access models other
9384 // than local exec. We currently generate the same code as small for tiny,
9385 // which may be larger than needed.
9386
9387 SDValue TPOff;
9388 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9389 SDLoc DL(Op);
9390 const GlobalValue *GV = GA->getGlobal();
9391
9392 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
9393
9394 if (Model == TLSModel::LocalExec) {
9395 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
9396 } else if (Model == TLSModel::InitialExec) {
9397 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9398 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
9399 } else if (Model == TLSModel::LocalDynamic) {
9400 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
9401 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
9402 // the beginning of the module's TLS region, followed by a DTPREL offset
9403 // calculation.
9404
9405 // These accesses will need deduplicating if there's more than one.
9406 AArch64FunctionInfo *MFI =
9407 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9408 MFI->incNumLocalDynamicTLSAccesses();
9409
9410 // The call needs a relocation too for linker relaxation. It doesn't make
9411 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9412 // the address.
9413 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
9414 AArch64II::MO_TLS);
9415
9416 // Now we can calculate the offset from TPIDR_EL0 to this module's
9417 // thread-local area.
9418 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9419
9420 // Now use :dtprel_whatever: operations to calculate this variable's offset
9421 // in its thread-storage area.
9422 SDValue HiVar = DAG.getTargetGlobalAddress(
9423 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9424 SDValue LoVar = DAG.getTargetGlobalAddress(
9425 GV, DL, MVT::i64, 0,
9426 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9427
9428 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
9429 DAG.getTargetConstant(0, DL, MVT::i32)),
9430 0);
9431 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
9432 DAG.getTargetConstant(0, DL, MVT::i32)),
9433 0);
9434 } else if (Model == TLSModel::GeneralDynamic) {
9435 // The call needs a relocation too for linker relaxation. It doesn't make
9436 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9437 // the address.
9438 SDValue SymAddr =
9439 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9440
9441 // Finally we can make a call to calculate the offset from tpidr_el0.
9442 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9443 } else
9444 llvm_unreachable("Unsupported ELF TLS access model");
9445
9446 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9447}
9448
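// Rough shape of the Windows TLS access emitted below (a sketch; exact
// registers and relocations are decided later): load the thread's TLS array
// pointer from the TEB at [x18, #0x58], load _tls_index via adrp+ldr, index
// the TLS array with _tls_index * 8 to find this module's TLS block, then add
// the variable's hi12/lo12 offset from the start of the .tls section.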
9449SDValue
9450AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
9451 SelectionDAG &DAG) const {
9452 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
9453
9454 SDValue Chain = DAG.getEntryNode();
9455 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9456 SDLoc DL(Op);
9457
9458 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
9459
9460 // Load the ThreadLocalStoragePointer from the TEB
9461 // A pointer to the TLS array is located at offset 0x58 from the TEB.
9462 SDValue TLSArray =
9463 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
9464 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
9465 Chain = TLSArray.getValue(1);
9466
9467 // Load the TLS index from the C runtime;
9468 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
9469 // This also does the same as LOADgot, but using a generic i32 load,
9470 // while LOADgot only loads i64.
9471 SDValue TLSIndexHi =
9472 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
9473 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
9474 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9475 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
9476 SDValue TLSIndex =
9477 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
9478 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
9479 Chain = TLSIndex.getValue(1);
9480
9481 // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
9482 // offset into the TLSArray.
9483 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
9484 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
9485 DAG.getConstant(3, DL, PtrVT));
9486 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
9487 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
9488 MachinePointerInfo());
9489 Chain = TLS.getValue(1);
9490
9491 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9492 const GlobalValue *GV = GA->getGlobal();
9493 SDValue TGAHi = DAG.getTargetGlobalAddress(
9494 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9495 SDValue TGALo = DAG.getTargetGlobalAddress(
9496 GV, DL, PtrVT, 0,
9497 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9498
9499 // Add the offset from the start of the .tls section (section base).
9500 SDValue Addr =
9501 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
9502 DAG.getTargetConstant(0, DL, MVT::i32)),
9503 0);
9504 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
9505 return Addr;
9506}
9507
9508SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
9509 SelectionDAG &DAG) const {
9510 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9511 if (DAG.getTarget().useEmulatedTLS())
9512 return LowerToTLSEmulatedModel(GA, DAG);
9513
9514 if (Subtarget->isTargetDarwin())
9515 return LowerDarwinGlobalTLSAddress(Op, DAG);
9516 if (Subtarget->isTargetELF())
9517 return LowerELFGlobalTLSAddress(Op, DAG);
9518 if (Subtarget->isTargetWindows())
9519 return LowerWindowsGlobalTLSAddress(Op, DAG);
9520
9521 llvm_unreachable("Unexpected platform trying to use TLS");
9522}
9523
9524//===----------------------------------------------------------------------===//
9525// PtrAuthGlobalAddress lowering
9526//
9527// We have 3 lowering alternatives to choose from:
9528// - MOVaddrPAC: similar to MOVaddr, with added PAC.
9529// If the GV doesn't need a GOT load (i.e., is locally defined)
9530// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
9531//
9532// - LOADgotPAC: similar to LOADgot, with added PAC.
9533// If the GV needs a GOT load, materialize the pointer using the usual
9534// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
9535// section is assumed to be read-only (for example, via relro mechanism). See
9536// LowerMOVaddrPAC.
9537//
9538// - LOADauthptrstatic: similar to LOADgot, but use a
9539// special stub slot instead of a GOT slot.
9540// Load a signed pointer for symbol 'sym' from a stub slot named
9541// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
9542// resolving. This usually lowers to adrp+ldr, but also emits an entry into
9543// .data with an
9544// @AUTH relocation. See LowerLOADauthptrstatic.
9545//
9547 // All 3 are pseudos that are expanded late into longer sequences: this lets us
9547// provide integrity guarantees on the to-be-signed intermediate values.
9548//
9549// LOADauthptrstatic is undesirable because it requires a large section filled
9550// with often similarly-signed pointers, making it a good harvesting target.
9551// Thus, it's only used for ptrauth references to extern_weak to avoid null
9552// checks.
9553
9554SDValue AArch64TargetLowering::LowerPtrAuthGlobalAddressStatically(
9555 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
9556 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) const {
9557 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
9558 assert(TGN->getGlobal()->hasExternalWeakLinkage());
9559
9560 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
9561 // offset alone as a pointer if the symbol wasn't available, which would
9562 // probably break null checks in users. Ptrauth complicates things further:
9563 // error out.
9564 if (TGN->getOffset() != 0)
9566 "unsupported non-zero offset in weak ptrauth global reference");
9567
9568 if (!isNullConstant(AddrDiscriminator))
9569 report_fatal_error("unsupported weak addr-div ptrauth global");
9570
9571 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
9572 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
9573 {TGA, Key, Discriminator}),
9574 0);
9575}
9576
9577SDValue
9578AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
9579 SelectionDAG &DAG) const {
9580 SDValue Ptr = Op.getOperand(0);
9581 uint64_t KeyC = Op.getConstantOperandVal(1);
9582 SDValue AddrDiscriminator = Op.getOperand(2);
9583 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
9584 EVT VT = Op.getValueType();
9585 SDLoc DL(Op);
9586
9587 if (KeyC > AArch64PACKey::LAST)
9588 report_fatal_error("key in ptrauth global out of range [0, " +
9589 Twine((int)AArch64PACKey::LAST) + "]");
9590
9591 // Blend only works if the integer discriminator is 16-bit wide.
9592 if (!isUInt<16>(DiscriminatorC))
9594 "constant discriminator in ptrauth global out of range [0, 0xffff]");
9595
9596 // Choosing between 3 lowering alternatives is target-specific.
9597 if (!Subtarget->isTargetELF())
9598 report_fatal_error("ptrauth global lowering is only implemented for ELF");
9599
9600 int64_t PtrOffsetC = 0;
9601 if (Ptr.getOpcode() == ISD::ADD) {
9602 PtrOffsetC = Ptr.getConstantOperandVal(1);
9603 Ptr = Ptr.getOperand(0);
9604 }
9605 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
9606 const GlobalValue *PtrGV = PtrN->getGlobal();
9607
9608 // Classify the reference to determine whether it needs a GOT load.
9609 const unsigned OpFlags =
9610 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
9611 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
9612 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
9613 "unsupported non-GOT op flags on ptrauth global reference");
9614
9615 // Fold any offset into the GV; our pseudos expect it there.
9616 PtrOffsetC += PtrN->getOffset();
9617 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
9618 /*TargetFlags=*/0);
9619 assert(PtrN->getTargetFlags() == 0 &&
9620 "unsupported target flags on ptrauth global");
9621
9622 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
9623 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
9624 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
9625 ? AddrDiscriminator
9626 : DAG.getRegister(AArch64::XZR, MVT::i64);
9627
9628 // No GOT load needed -> MOVaddrPAC
9629 if (!NeedsGOTLoad) {
9630 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
9631 return SDValue(
9632 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
9633 {TPtr, Key, TAddrDiscriminator, Discriminator}),
9634 0);
9635 }
9636
9637 // GOT load -> LOADgotPAC
9638 // Note that we disallow extern_weak refs to avoid null checks later.
9639 if (!PtrGV->hasExternalWeakLinkage())
9640 return SDValue(
9641 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
9642 {TPtr, Key, TAddrDiscriminator, Discriminator}),
9643 0);
9644
9645 // extern_weak ref -> LOADauthptrstatic
9646 return LowerPtrAuthGlobalAddressStatically(
9647 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
9648 DAG);
9649}
9650
9651// Looks through \param Val to determine the bit that can be used to
9652// check the sign of the value. It returns the unextended value and
9653// the sign bit position.
9654std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
9655 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
9656 return {Val.getOperand(0),
9657 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
9658 1};
9659
9660 if (Val.getOpcode() == ISD::SIGN_EXTEND)
9661 return {Val.getOperand(0),
9662 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
9663
9664 return {Val, Val.getValueSizeInBits() - 1};
9665}
9666
9667SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
9668 SDValue Chain = Op.getOperand(0);
9669 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
9670 SDValue LHS = Op.getOperand(2);
9671 SDValue RHS = Op.getOperand(3);
9672 SDValue Dest = Op.getOperand(4);
9673 SDLoc dl(Op);
9674
9675 MachineFunction &MF = DAG.getMachineFunction();
9676 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
9677 // will not be produced, as they are conditional branch instructions that do
9678 // not set flags.
9679 bool ProduceNonFlagSettingCondBr =
9680 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
9681
9682 // Handle f128 first, since lowering it will result in comparing the return
9683 // value of a libcall against zero, which is just what the rest of LowerBR_CC
9684 // is expecting to deal with.
9685 if (LHS.getValueType() == MVT::f128) {
9686 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9687
9688 // If softenSetCCOperands returned a scalar, we need to compare the result
9689 // against zero to select between true and false values.
9690 if (!RHS.getNode()) {
9691 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9692 CC = ISD::SETNE;
9693 }
9694 }
9695
9696 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
9697 // instruction.
9698 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
9699 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9700 // Only lower legal XALUO ops.
9701 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
9702 return SDValue();
9703
9704 // The actual operation with overflow check.
9705 AArch64CC::CondCode OFCC;
9706 SDValue Value, Overflow;
9707 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
9708
9709 if (CC == ISD::SETNE)
9710 OFCC = getInvertedCondCode(OFCC);
9711 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
9712
9713 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9714 Overflow);
9715 }
9716
9717 if (LHS.getValueType().isInteger()) {
9718 assert((LHS.getValueType() == RHS.getValueType()) &&
9719 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9720
9721 // If the RHS of the comparison is zero, we can potentially fold this
9722 // to a specialized branch.
9723 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9724 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
9725 if (CC == ISD::SETEQ) {
9726 // See if we can use a TBZ to fold in an AND as well.
9727 // TBZ has a smaller branch displacement than CBZ. If the offset is
9728 // out of bounds, a late MI-layer pass rewrites branches.
9729 // 403.gcc is an example that hits this case.
9730 if (LHS.getOpcode() == ISD::AND &&
9731 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9732 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9733 SDValue Test = LHS.getOperand(0);
9734 uint64_t Mask = LHS.getConstantOperandVal(1);
9735 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
9736 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9737 Dest);
9738 }
9739
9740 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
9741 } else if (CC == ISD::SETNE) {
9742 // See if we can use a TBZ to fold in an AND as well.
9743 // TBZ has a smaller branch displacement than CBZ. If the offset is
9744 // out of bounds, a late MI-layer pass rewrites branches.
9745 // 403.gcc is an example that hits this case.
9746 if (LHS.getOpcode() == ISD::AND &&
9747 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9748 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9749 SDValue Test = LHS.getOperand(0);
9750 uint64_t Mask = LHS.getConstantOperandVal(1);
9751 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
9752 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9753 Dest);
9754 }
9755
9756 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
9757 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
9758 // Don't combine AND since emitComparison converts the AND to an ANDS
9759 // (a.k.a. TST) and the test in the test bit and branch instruction
9760 // becomes redundant. This would also increase register pressure.
9761 uint64_t SignBitPos;
9762 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9763 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
9764 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9765 }
9766 }
9767 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
9768 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
9769 // Don't combine AND since emitComparison converts the AND to an ANDS
9770 // (a.k.a. TST) and the test in the test bit and branch instruction
9771 // becomes redundant. This would also increase register pressure.
9772 uint64_t SignBitPos;
9773 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9774 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
9775 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9776 }
9777
9778 SDValue CCVal;
9779 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
9780 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9781 Cmp);
9782 }
9783
9784 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
9785 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9786
9787 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9788 // clean. Some of them require two branches to implement.
9789 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9790 AArch64CC::CondCode CC1, CC2;
9791 changeFPCCToAArch64CC(CC, CC1, CC2);
9792 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9793 SDValue BR1 =
9794 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
9795 if (CC2 != AArch64CC::AL) {
9796 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9797 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
9798 Cmp);
9799 }
9800
9801 return BR1;
9802}
9803
9804SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
9805 SelectionDAG &DAG) const {
9806 if (!Subtarget->isNeonAvailable() &&
9807 !Subtarget->useSVEForFixedLengthVectors())
9808 return SDValue();
9809
9810 EVT VT = Op.getValueType();
9811 EVT IntVT = VT.changeTypeToInteger();
9812 SDLoc DL(Op);
9813
9814 SDValue In1 = Op.getOperand(0);
9815 SDValue In2 = Op.getOperand(1);
9816 EVT SrcVT = In2.getValueType();
9817
9818 if (!SrcVT.bitsEq(VT))
9819 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
9820
9821 if (VT.isScalableVector())
9822 IntVT =
9823 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
9824
9825 if (VT.isFixedLengthVector() &&
9826 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
9827 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
9828
9829 In1 = convertToScalableVector(DAG, ContainerVT, In1);
9830 In2 = convertToScalableVector(DAG, ContainerVT, In2);
9831
9832 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
9833 return convertFromScalableVector(DAG, VT, Res);
9834 }
9835
9836 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
9837 if (VT.isScalableVector())
9838 return getSVESafeBitCast(VT, Op, DAG);
9839
9840 return DAG.getBitcast(VT, Op);
9841 };
9842
9843 SDValue VecVal1, VecVal2;
9844 EVT VecVT;
9845 auto SetVecVal = [&](int Idx = -1) {
9846 if (!VT.isVector()) {
9847 VecVal1 =
9848 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
9849 VecVal2 =
9850 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
9851 } else {
9852 VecVal1 = BitCast(VecVT, In1, DAG);
9853 VecVal2 = BitCast(VecVT, In2, DAG);
9854 }
9855 };
9856 if (VT.isVector()) {
9857 VecVT = IntVT;
9858 SetVecVal();
9859 } else if (VT == MVT::f64) {
9860 VecVT = MVT::v2i64;
9861 SetVecVal(AArch64::dsub);
9862 } else if (VT == MVT::f32) {
9863 VecVT = MVT::v4i32;
9864 SetVecVal(AArch64::ssub);
9865 } else if (VT == MVT::f16 || VT == MVT::bf16) {
9866 VecVT = MVT::v8i16;
9867 SetVecVal(AArch64::hsub);
9868 } else {
9869 llvm_unreachable("Invalid type for copysign!");
9870 }
9871
9872 unsigned BitWidth = In1.getScalarValueSizeInBits();
9873 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
9874
9875 // We want to materialize a mask with every bit but the high bit set, but the
9876 // AdvSIMD immediate moves cannot materialize that in a single instruction for
9877 // 64-bit elements. Instead, materialize all bits set and then negate that.
9878 if (VT == MVT::f64 || VT == MVT::v2f64) {
9879 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
9880 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
9881 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
9882 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
9883 }
9884
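// BSP (bitwise select) computes, roughly, (Mask & Op1) | (~Mask & Op2). With
// every bit except the sign bit set in SignMaskV, the result keeps In1's
// magnitude and takes its sign bit from In2, which is exactly FCOPYSIGN.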
9885 SDValue BSP =
9886 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
9887 if (VT == MVT::f16 || VT == MVT::bf16)
9888 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
9889 if (VT == MVT::f32)
9890 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
9891 if (VT == MVT::f64)
9892 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
9893
9894 return BitCast(VT, BSP, DAG);
9895}
9896
9897SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
9898 SelectionDAG &DAG) const {
9899 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
9900 Attribute::NoImplicitFloat))
9901 return SDValue();
9902
9903 EVT VT = Op.getValueType();
9904 if (VT.isScalableVector() ||
9905 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
9906 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
9907
9908 if (!Subtarget->isNeonAvailable())
9909 return SDValue();
9910
9911 bool IsParity = Op.getOpcode() == ISD::PARITY;
9912 SDValue Val = Op.getOperand(0);
9913 SDLoc DL(Op);
9914
9915 // For i32 parity, the generic expansion using EORs is more efficient than
9916 // going through the floating-point/SIMD path below.
9917 if (VT == MVT::i32 && IsParity)
9918 return SDValue();
9919
9920 // If there is no CNT instruction available, GPR popcount can
9921 // be more efficiently lowered to the following sequence that uses
9922 // AdvSIMD registers/instructions as long as the copies to/from
9923 // the AdvSIMD registers are cheap.
9924 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
9925 // CNT V0.8B, V0.8B // 8xbyte pop-counts
9926 // ADDV B0, V0.8B // sum 8xbyte pop-counts
9927 // UMOV X0, V0.B[0] // copy byte result back to integer reg
9928 if (VT == MVT::i32 || VT == MVT::i64) {
9929 if (VT == MVT::i32)
9930 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
9931 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
9932
9933 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
9934 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9935 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9936 DAG.getConstant(0, DL, MVT::i64));
9937
9938 if (IsParity)
9939 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9940 DAG.getConstant(1, DL, MVT::i32));
9941
9942 if (VT == MVT::i64)
9943 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
9944 return UaddLV;
9945 } else if (VT == MVT::i128) {
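// The i128 path mirrors the 64-bit sequence above, just on a full 128-bit
// vector: bitcast to v16i8, CNT each byte, then UADDLV to sum the lanes.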
9946 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
9947
9948 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
9949 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9950 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9951 DAG.getConstant(0, DL, MVT::i64));
9952
9953 if (IsParity)
9954 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9955 DAG.getConstant(1, DL, MVT::i32));
9956
9957 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
9958 }
9959
9960 assert(!IsParity && "ISD::PARITY of vector types not supported");
9961
9962 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
9963 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
9964 "Unexpected type for custom ctpop lowering");
9965
9966 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
9967 Val = DAG.getBitcast(VT8Bit, Val);
9968 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
9969
9970 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
9971 unsigned EltSize = 8;
9972 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
9973 while (EltSize != VT.getScalarSizeInBits()) {
9974 EltSize *= 2;
9975 NumElts /= 2;
9976 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
9977 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
9978 }
9979
9980 return Val;
9981}
9982
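// CTTZ is lowered by reusing CTLZ: cttz(x) == ctlz(bitreverse(x)), which maps
// onto the target's RBIT/CLZ-style operations (or their predicated SVE forms).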
9983SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
9984 EVT VT = Op.getValueType();
9985 assert(VT.isScalableVector() ||
9986 useSVEForFixedLengthVectorVT(
9987 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
9988
9989 SDLoc DL(Op);
9990 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
9991 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
9992}
9993
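// Lower [SU]{MIN,MAX}: scalable vectors (and fixed-length vectors that are
// widened to SVE) use the predicated SVE nodes; everything else is expanded
// into a SETCC with the matching condition followed by a select.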
9994SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
9995 SelectionDAG &DAG) const {
9996
9997 EVT VT = Op.getValueType();
9998 SDLoc DL(Op);
9999 unsigned Opcode = Op.getOpcode();
10000 ISD::CondCode CC;
10001 switch (Opcode) {
10002 default:
10003 llvm_unreachable("Wrong instruction");
10004 case ISD::SMAX:
10005 CC = ISD::SETGT;
10006 break;
10007 case ISD::SMIN:
10008 CC = ISD::SETLT;
10009 break;
10010 case ISD::UMAX:
10011 CC = ISD::SETUGT;
10012 break;
10013 case ISD::UMIN:
10014 CC = ISD::SETULT;
10015 break;
10016 }
10017
10018 if (VT.isScalableVector() ||
10019 useSVEForFixedLengthVectorVT(
10020 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
10021 switch (Opcode) {
10022 default:
10023 llvm_unreachable("Wrong instruction");
10024 case ISD::SMAX:
10025 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
10026 case ISD::SMIN:
10027 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
10028 case ISD::UMAX:
10029 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
10030 case ISD::UMIN:
10031 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
10032 }
10033 }
10034
10035 SDValue Op0 = Op.getOperand(0);
10036 SDValue Op1 = Op.getOperand(1);
10037 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
10038 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
10039}
10040
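// For NEON, a full per-element BITREVERSE is composed from two pieces: a
// REV32/REV64 that reverses the bytes within each element, and a byte-wise
// ISD::BITREVERSE (vector RBIT) that reverses the bits inside each byte.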
10041SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
10042 SelectionDAG &DAG) const {
10043 EVT VT = Op.getValueType();
10044
10045 if (VT.isScalableVector() ||
10046 useSVEForFixedLengthVectorVT(
10047 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
10048 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
10049
10050 SDLoc DL(Op);
10051 SDValue REVB;
10052 MVT VST;
10053
10054 switch (VT.getSimpleVT().SimpleTy) {
10055 default:
10056 llvm_unreachable("Invalid type for bitreverse!");
10057
10058 case MVT::v2i32: {
10059 VST = MVT::v8i8;
10060 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
10061
10062 break;
10063 }
10064
10065 case MVT::v4i32: {
10066 VST = MVT::v16i8;
10067 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
10068
10069 break;
10070 }
10071
10072 case MVT::v1i64: {
10073 VST = MVT::v8i8;
10074 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
10075
10076 break;
10077 }
10078
10079 case MVT::v2i64: {
10080 VST = MVT::v16i8;
10081 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
10082
10083 break;
10084 }
10085 }
10086
10087 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
10088 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
10089}
10090
10091 // Check whether N is part of a continuous comparison sequence (a chain of ORs over XORs).
10092static bool
10093isOrXorChain(SDValue N, unsigned &Num,
10094 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
10095 if (Num == MaxXors)
10096 return false;
10097
10098 // Skip the one-use zext
10099 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
10100 N = N->getOperand(0);
10101
10102 // The leaf node must be XOR
10103 if (N->getOpcode() == ISD::XOR) {
10104 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
10105 Num++;
10106 return true;
10107 }
10108
10109 // All the non-leaf nodes must be OR.
10110 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
10111 return false;
10112
10113 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
10114 isOrXorChain(N->getOperand(1), Num, WorkList))
10115 return true;
10116 return false;
10117}
10118
10119 // Transform chains of ORs and XORs, which are usually outlined by memcmp/bcmp.
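// For example, for the SETEQ case the rewrite enabled here is (sketch):
//   setcc eq (or (xor a0, a1), (xor b0, b1)), 0
// into
//   and (setcc eq a0, a1), (setcc eq b0, b1)
// which later matches as a SUBS plus CCMP instead of a chain of EOR/ORR.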
10120 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
10121 SDValue LHS = N->getOperand(0);
10122 SDValue RHS = N->getOperand(1);
10123 SDLoc DL(N);
10124 EVT VT = N->getValueType(0);
10125 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
10126
10127 // Only handle integer compares.
10128 if (N->getOpcode() != ISD::SETCC)
10129 return SDValue();
10130
10131 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
10132 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
10133 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
10134 unsigned NumXors = 0;
10135 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
10136 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
10137 isOrXorChain(LHS, NumXors, WorkList)) {
10138 SDValue XOR0, XOR1;
10139 std::tie(XOR0, XOR1) = WorkList[0];
10140 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
10141 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
10142 for (unsigned I = 1; I < WorkList.size(); I++) {
10143 std::tie(XOR0, XOR1) = WorkList[I];
10144 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
10145 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
10146 }
10147
10148 // Exit early by inverting the condition, which helps reduce indentation.
10149 return Cmp;
10150 }
10151
10152 return SDValue();
10153}
10154
10155SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
10156
10157 if (Op.getValueType().isVector())
10158 return LowerVSETCC(Op, DAG);
10159
10160 bool IsStrict = Op->isStrictFPOpcode();
10161 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10162 unsigned OpNo = IsStrict ? 1 : 0;
10163 SDValue Chain;
10164 if (IsStrict)
10165 Chain = Op.getOperand(0);
10166 SDValue LHS = Op.getOperand(OpNo + 0);
10167 SDValue RHS = Op.getOperand(OpNo + 1);
10168 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
10169 SDLoc dl(Op);
10170
10171 // We chose ZeroOrOneBooleanContents, so use zero and one.
10172 EVT VT = Op.getValueType();
10173 SDValue TVal = DAG.getConstant(1, dl, VT);
10174 SDValue FVal = DAG.getConstant(0, dl, VT);
10175
10176 // Handle f128 first, since one possible outcome is a normal integer
10177 // comparison which gets picked up by the next if statement.
10178 if (LHS.getValueType() == MVT::f128) {
10179 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
10180 IsSignaling);
10181
10182 // If softenSetCCOperands returned a scalar, use it.
10183 if (!RHS.getNode()) {
10184 assert(LHS.getValueType() == Op.getValueType() &&
10185 "Unexpected setcc expansion!");
10186 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
10187 }
10188 }
10189
10190 if (LHS.getValueType().isInteger()) {
10191 SDValue CCVal;
10192 SDValue Cmp = getAArch64Cmp(
10193 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
10194
10195 // Note that we inverted the condition above, so we reverse the order of
10196 // the true and false operands here. This will allow the setcc to be
10197 // matched to a single CSINC instruction.
10198 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
10199 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
10200 }
10201
10202 // Now we know we're dealing with FP values.
10203 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
10204 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
10205
10206 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
10207 // and do the comparison.
10208 SDValue Cmp;
10209 if (IsStrict)
10210 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
10211 else
10212 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
10213
10214 AArch64CC::CondCode CC1, CC2;
10215 changeFPCCToAArch64CC(CC, CC1, CC2);
10216 SDValue Res;
10217 if (CC2 == AArch64CC::AL) {
10218 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
10219 CC2);
10220 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10221
10222 // Note that we inverted the condition above, so we reverse the order of
10223 // the true and false operands here. This will allow the setcc to be
10224 // matched to a single CSINC instruction.
10225 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
10226 } else {
10227 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
10228 // totally clean. Some of them require two CSELs to implement. As is in
10229 // this case, we emit the first CSEL and then emit a second using the output
10230 // of the first as the RHS. We're effectively OR'ing the two CC's together.
10231
10232 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
10233 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10234 SDValue CS1 =
10235 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
10236
10237 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
10238 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
10239 }
10240 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
10241}
10242
10243SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
10244 SelectionDAG &DAG) const {
10245
10246 SDValue LHS = Op.getOperand(0);
10247 SDValue RHS = Op.getOperand(1);
10248 EVT VT = LHS.getValueType();
10249 if (VT != MVT::i32 && VT != MVT::i64)
10250 return SDValue();
10251
10252 SDLoc DL(Op);
10253 SDValue Carry = Op.getOperand(2);
10254 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
10255 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
10256 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
10257 LHS, RHS, InvCarry);
10258
10259 EVT OpVT = Op.getValueType();
10260 SDValue TVal = DAG.getConstant(1, DL, OpVT);
10261 SDValue FVal = DAG.getConstant(0, DL, OpVT);
10262
10263 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10264 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
10265 SDValue CCVal =
10266 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
10267 // Inputs are swapped because the condition is inverted. This will allow
10268 // matching with a single CSINC instruction.
10269 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
10270 Cmp.getValue(1));
10271}
10272
10273SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
10274 SDValue RHS, SDValue TVal,
10275 SDValue FVal, const SDLoc &dl,
10276 SelectionDAG &DAG) const {
10277 // Handle f128 first, because it will result in a comparison of some RTLIB
10278 // call result against zero.
10279 if (LHS.getValueType() == MVT::f128) {
10280 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
10281
10282 // If softenSetCCOperands returned a scalar, we need to compare the result
10283 // against zero to select between true and false values.
10284 if (!RHS.getNode()) {
10285 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10286 CC = ISD::SETNE;
10287 }
10288 }
10289
10290 // Also handle f16, for which we need to do a f32 comparison.
10291 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
10292 LHS.getValueType() == MVT::bf16) {
10293 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
10294 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
10295 }
10296
10297 // Next, handle integers.
10298 if (LHS.getValueType().isInteger()) {
10299 assert((LHS.getValueType() == RHS.getValueType()) &&
10300 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
10301
10302 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
10303 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
10304 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
10305 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
10306 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
10307 // supported types.
10308 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
10309 CTVal->isOne() && CFVal->isAllOnes() &&
10310 LHS.getValueType() == TVal.getValueType()) {
10311 EVT VT = LHS.getValueType();
10312 SDValue Shift =
10313 DAG.getNode(ISD::SRA, dl, VT, LHS,
10314 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
10315 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
10316 }
10317
10318 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
10319 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
10320 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
10321 // Both require fewer instructions than a compare and conditional select.
10322 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
10323 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
10324 LHS.getValueType() == RHS.getValueType()) {
10325 EVT VT = LHS.getValueType();
10326 SDValue Shift =
10327 DAG.getNode(ISD::SRA, dl, VT, LHS,
10328 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
10329
10330 if (CC == ISD::SETGT)
10331 Shift = DAG.getNOT(dl, Shift, VT);
10332
10333 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
10334 }
10335
10336 unsigned Opcode = AArch64ISD::CSEL;
10337
10338 // If both the TVal and the FVal are constants, see if we can swap them in
10339 // order to form a CSINV or CSINC out of them.
10340 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
10341 std::swap(TVal, FVal);
10342 std::swap(CTVal, CFVal);
10343 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
10344 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
10345 std::swap(TVal, FVal);
10346 std::swap(CTVal, CFVal);
10347 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
10348 } else if (TVal.getOpcode() == ISD::XOR) {
10349 // If TVal is a NOT we want to swap TVal and FVal so that we can match
10350 // with a CSINV rather than a CSEL.
10351 if (isAllOnesConstant(TVal.getOperand(1))) {
10352 std::swap(TVal, FVal);
10353 std::swap(CTVal, CFVal);
10354 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
10355 }
10356 } else if (TVal.getOpcode() == ISD::SUB) {
10357 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
10358 // that we can match with a CSNEG rather than a CSEL.
10359 if (isNullConstant(TVal.getOperand(0))) {
10360 std::swap(TVal, FVal);
10361 std::swap(CTVal, CFVal);
10362 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
10363 }
10364 } else if (CTVal && CFVal) {
10365 const int64_t TrueVal = CTVal->getSExtValue();
10366 const int64_t FalseVal = CFVal->getSExtValue();
10367 bool Swap = false;
10368
10369 // If both TVal and FVal are constants, see if FVal is the
10370 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
10371 // instead of a CSEL in that case.
10372 if (TrueVal == ~FalseVal) {
10373 Opcode = AArch64ISD::CSINV;
10374 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
10375 TrueVal == -FalseVal) {
10376 Opcode = AArch64ISD::CSNEG;
10377 } else if (TVal.getValueType() == MVT::i32) {
10378 // If our operands are only 32-bit wide, make sure we use 32-bit
10379 // arithmetic for the check whether we can use CSINC. This ensures that
10380 // the addition in the check will wrap around properly in case there is
10381 // an overflow (which would not be the case if we do the check with
10382 // 64-bit arithmetic).
10383 const uint32_t TrueVal32 = CTVal->getZExtValue();
10384 const uint32_t FalseVal32 = CFVal->getZExtValue();
10385
10386 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
10387 Opcode = AArch64ISD::CSINC;
10388
10389 if (TrueVal32 > FalseVal32) {
10390 Swap = true;
10391 }
10392 }
10393 } else {
10394 // 64-bit check whether we can use CSINC.
10395 const uint64_t TrueVal64 = TrueVal;
10396 const uint64_t FalseVal64 = FalseVal;
10397
10398 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
10399 Opcode = AArch64ISD::CSINC;
10400
10401 if (TrueVal > FalseVal) {
10402 Swap = true;
10403 }
10404 }
10405 }
10406
10407 // Swap TVal and FVal if necessary.
10408 if (Swap) {
10409 std::swap(TVal, FVal);
10410 std::swap(CTVal, CFVal);
10411 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
10412 }
10413
10414 if (Opcode != AArch64ISD::CSEL) {
10415 // Drop FVal since we can get its value by simply inverting/negating
10416 // TVal.
10417 FVal = TVal;
10418 }
10419 }
10420
10421 // Avoid materializing a constant when possible by reusing a known value in
10422 // a register. However, don't perform this optimization if the known value
10423 // is one, zero or negative one in the case of a CSEL. We can always
10424 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
10425 // FVal, respectively.
10426 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
10427 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
10428 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
10430 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
10431 // "a != C ? x : a" to avoid materializing C.
10432 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
10433 TVal = LHS;
10434 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
10435 FVal = LHS;
10436 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
10437 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
10438 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
10439 // avoid materializing C.
10440 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
10441 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
10442 Opcode = AArch64ISD::CSINV;
10443 TVal = LHS;
10444 FVal = DAG.getConstant(0, dl, FVal.getValueType());
10445 }
10446 }
10447
10448 SDValue CCVal;
10449 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
10450 EVT VT = TVal.getValueType();
10451 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
10452 }
10453
10454 // Now we know we're dealing with FP values.
10455 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
10456 LHS.getValueType() == MVT::f64);
10457 assert(LHS.getValueType() == RHS.getValueType());
10458 EVT VT = TVal.getValueType();
10459 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
10460
10461 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10462 // clean. Some of them require two CSELs to implement.
10463 AArch64CC::CondCode CC1, CC2;
10464 changeFPCCToAArch64CC(CC, CC1, CC2);
10465
10466 if (DAG.getTarget().Options.UnsafeFPMath) {
10467 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
10468 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
10469 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
10470 if (RHSVal && RHSVal->isZero()) {
10471 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
10472 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
10473
10474 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
10475 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
10476 TVal = LHS;
10477 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
10478 CFVal && CFVal->isZero() &&
10479 FVal.getValueType() == LHS.getValueType())
10480 FVal = LHS;
10481 }
10482 }
10483
10484 // Emit first, and possibly only, CSEL.
10485 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10486 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
10487
10488 // If we need a second CSEL, emit it, using the output of the first as the
10489 // RHS. We're effectively OR'ing the two CC's together.
10490 if (CC2 != AArch64CC::AL) {
10491 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
10492 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
10493 }
10494
10495 // Otherwise, return the output of the first CSEL.
10496 return CS1;
10497}
10498
10499SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
10500 SelectionDAG &DAG) const {
10501 EVT Ty = Op.getValueType();
10502 auto Idx = Op.getConstantOperandAPInt(2);
10503 int64_t IdxVal = Idx.getSExtValue();
10504 assert(Ty.isScalableVector() &&
10505 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
10506
10507 // We can use the splice instruction for certain index values where we are
10508 // able to efficiently generate the correct predicate. The index will be
10509 // inverted and used directly as the input to the ptrue instruction, i.e.
10510 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
10511 // splice predicate. However, we can only do this if we can guarantee that
10512 // there are enough elements in the vector, hence we check the index <= min
10513 // number of elements.
10514 std::optional<unsigned> PredPattern;
10515 if (Ty.isScalableVector() && IdxVal < 0 &&
10516 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
10517 std::nullopt) {
10518 SDLoc DL(Op);
10519
10520 // Create a predicate where all but the last -IdxVal elements are false.
10521 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
10522 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
10523 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
10524
10525 // Now splice the two inputs together using the predicate.
10526 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
10527 Op.getOperand(1));
10528 }
10529
10530 // We can select to an EXT instruction when indexing the first 256 bytes.
10531 unsigned BlockSize = Ty.getScalarSizeInBits();
10532 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
10533 return Op;
10534
10535 return SDValue();
10536}
10537
10538SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
10539 SelectionDAG &DAG) const {
10540 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
10541 SDValue LHS = Op.getOperand(0);
10542 SDValue RHS = Op.getOperand(1);
10543 SDValue TVal = Op.getOperand(2);
10544 SDValue FVal = Op.getOperand(3);
10545 SDLoc DL(Op);
10546 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10547}
10548
10549SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
10550 SelectionDAG &DAG) const {
10551 SDValue CCVal = Op->getOperand(0);
10552 SDValue TVal = Op->getOperand(1);
10553 SDValue FVal = Op->getOperand(2);
10554 SDLoc DL(Op);
10555
10556 EVT Ty = Op.getValueType();
10557 if (Ty == MVT::aarch64svcount) {
10558 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
10559 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
10560 SDValue Sel =
10561 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
10562 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
10563 }
10564
10565 if (Ty.isScalableVector()) {
10566 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
10567 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
10568 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10569 }
10570
10571 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
10572 // FIXME: Ideally this would be the same as above using i1 types, however
10573 // for the moment we can't deal with fixed i1 vector types properly, so
10574 // instead extend the predicate to a result type sized integer vector.
10575 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
10576 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
10577 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
10578 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
10579 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10580 }
10581
10582 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
10583 // instruction.
10584 if (ISD::isOverflowIntrOpRes(CCVal)) {
10585 // Only lower legal XALUO ops.
10586 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
10587 return SDValue();
10588
10589 AArch64CC::CondCode OFCC;
10590 SDValue Value, Overflow;
10591 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
10592 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
10593
10594 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
10595 CCVal, Overflow);
10596 }
10597
10598 // Lower it the same way as we would lower a SELECT_CC node.
10599 ISD::CondCode CC;
10600 SDValue LHS, RHS;
10601 if (CCVal.getOpcode() == ISD::SETCC) {
10602 LHS = CCVal.getOperand(0);
10603 RHS = CCVal.getOperand(1);
10604 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
10605 } else {
10606 LHS = CCVal;
10607 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
10608 CC = ISD::SETNE;
10609 }
10610
10611 // If we are lowering an f16 and we do not have full fp16, convert to an f32
10612 // in order to use FCSELSrrr.
10613 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10614 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10615 DAG.getUNDEF(MVT::f32), TVal);
10616 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10617 DAG.getUNDEF(MVT::f32), FVal);
10618 }
10619
10620 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10621
10622 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10623 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
10624 }
10625
10626 return Res;
10627}
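// Illustrative example for the !hasFullFP16() path above: for
// 'select i1 %c, half %a, half %b' the half operands are inserted into the
// h-subregister of undef f32 values so that FCSELSrrr can be used, and the
// h-subregister of the result is extracted again afterwards.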
10628
10629SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
10630 SelectionDAG &DAG) const {
10631 // Jump table entries are emitted as PC-relative offsets. No additional
10632 // tweaking is necessary here. Just get the address of the jump table.
10633 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
10634
10635 CodeModel::Model CM = getTargetMachine().getCodeModel();
10636 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
10637 !Subtarget->isTargetMachO())
10638 return getAddrLarge(JT, DAG);
10639 if (CM == CodeModel::Tiny)
10640 return getAddrTiny(JT, DAG);
10641 return getAddr(JT, DAG);
10642}
10643
10644SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
10645 SelectionDAG &DAG) const {
10646 // Jump table entries are emitted as PC-relative offsets. No additional
10647 // tweaking is necessary here. Just get the address of the jump table.
10648 SDLoc DL(Op);
10649 SDValue JT = Op.getOperand(1);
10650 SDValue Entry = Op.getOperand(2);
10651 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
10652
10653 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10654 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
10655
10656 SDNode *Dest =
10657 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
10658 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
10659 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
10660 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
10661}
10662
10663SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
10664 SelectionDAG &DAG) const {
10665 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
10666 CodeModel::Model CM = getTargetMachine().getCodeModel();
10667 if (CM == CodeModel::Large) {
10668 // Use the GOT for the large code model on iOS.
10669 if (Subtarget->isTargetMachO()) {
10670 return getGOT(CP, DAG);
10671 }
10673 return getAddrLarge(CP, DAG);
10674 } else if (CM == CodeModel::Tiny) {
10675 return getAddrTiny(CP, DAG);
10676 }
10677 return getAddr(CP, DAG);
10678}
10679
10680SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
10681 SelectionDAG &DAG) const {
10682 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
10683 CodeModel::Model CM = getTargetMachine().getCodeModel();
10684 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
10686 return getAddrLarge(BA, DAG);
10687 } else if (CM == CodeModel::Tiny) {
10688 return getAddrTiny(BA, DAG);
10689 }
10690 return getAddr(BA, DAG);
10691}
10692
10693SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
10694 SelectionDAG &DAG) const {
10695 AArch64FunctionInfo *FuncInfo =
10696 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10697
10698 SDLoc DL(Op);
10699 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
10700 getPointerTy(DAG.getDataLayout()));
10701 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
10702 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10703 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10704 MachinePointerInfo(SV));
10705}
10706
10707SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
10708 SelectionDAG &DAG) const {
10709 MachineFunction &MF = DAG.getMachineFunction();
10710 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10711
10712 SDLoc DL(Op);
10713 SDValue FR;
10714 if (Subtarget->isWindowsArm64EC()) {
10715 // With the Arm64EC ABI, we compute the address of the varargs save area
10716 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
10717 // but calls from an entry thunk can pass in a different address.
10718 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
10719 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
10720 uint64_t StackOffset;
10721 if (FuncInfo->getVarArgsGPRSize() > 0)
10722 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
10723 else
10724 StackOffset = FuncInfo->getVarArgsStackOffset();
10725 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
10726 DAG.getConstant(StackOffset, DL, MVT::i64));
10727 } else {
10728 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
10729 ? FuncInfo->getVarArgsGPRIndex()
10730 : FuncInfo->getVarArgsStackIndex(),
10731 getPointerTy(DAG.getDataLayout()));
10732 }
10733 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10734 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10735 MachinePointerInfo(SV));
10736}
10737
10738SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
10739 SelectionDAG &DAG) const {
10740 // The layout of the va_list struct is specified in the AArch64 Procedure Call
10741 // Standard, section B.3.
10742 MachineFunction &MF = DAG.getMachineFunction();
10743 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10744 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10745 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10746 auto PtrVT = getPointerTy(DAG.getDataLayout());
10747 SDLoc DL(Op);
10748
10749 SDValue Chain = Op.getOperand(0);
10750 SDValue VAList = Op.getOperand(1);
10751 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10752 SmallVector<SDValue, 4> MemOps;
10753
10754 // void *__stack at offset 0
10755 unsigned Offset = 0;
10756 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
10757 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
10758 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
10759 MachinePointerInfo(SV), Align(PtrSize)));
10760
10761 // void *__gr_top at offset 8 (4 on ILP32)
10762 Offset += PtrSize;
10763 int GPRSize = FuncInfo->getVarArgsGPRSize();
10764 if (GPRSize > 0) {
10765 SDValue GRTop, GRTopAddr;
10766
10767 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10768 DAG.getConstant(Offset, DL, PtrVT));
10769
10770 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
10771 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
10772 DAG.getConstant(GPRSize, DL, PtrVT));
10773 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
10774
10775 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
10776 MachinePointerInfo(SV, Offset),
10777 Align(PtrSize)));
10778 }
10779
10780 // void *__vr_top at offset 16 (8 on ILP32)
10781 Offset += PtrSize;
10782 int FPRSize = FuncInfo->getVarArgsFPRSize();
10783 if (FPRSize > 0) {
10784 SDValue VRTop, VRTopAddr;
10785 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10786 DAG.getConstant(Offset, DL, PtrVT));
10787
10788 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
10789 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
10790 DAG.getConstant(FPRSize, DL, PtrVT));
10791 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
10792
10793 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
10794 MachinePointerInfo(SV, Offset),
10795 Align(PtrSize)));
10796 }
10797
10798 // int __gr_offs at offset 24 (12 on ILP32)
10799 Offset += PtrSize;
10800 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10801 DAG.getConstant(Offset, DL, PtrVT));
10802 MemOps.push_back(
10803 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
10804 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10805
10806 // int __vr_offs at offset 28 (16 on ILP32)
10807 Offset += 4;
10808 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10809 DAG.getConstant(Offset, DL, PtrVT));
10810 MemOps.push_back(
10811 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
10812 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10813
10814 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
10815}
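// The stores built above populate the AAPCS64 va_list; in C terms the layout
// being initialised is (offsets shown for LP64, see the comments above for
// ILP32):
//
//   struct va_list {
//     void *__stack;   // offset 0
//     void *__gr_top;  // offset 8
//     void *__vr_top;  // offset 16
//     int   __gr_offs; // offset 24, set to -GPRSize
//     int   __vr_offs; // offset 28, set to -FPRSize
//   };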
10816
10817SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
10818 SelectionDAG &DAG) const {
10819 MachineFunction &MF = DAG.getMachineFunction();
10820
10821 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
10822 return LowerWin64_VASTART(Op, DAG);
10823 else if (Subtarget->isTargetDarwin())
10824 return LowerDarwin_VASTART(Op, DAG);
10825 else
10826 return LowerAAPCS_VASTART(Op, DAG);
10827}
10828
10829SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
10830 SelectionDAG &DAG) const {
10831 // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single
10832 // pointer.
10833 SDLoc DL(Op);
10834 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10835 unsigned VaListSize =
10836 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
10837 ? PtrSize
10838 : Subtarget->isTargetILP32() ? 20 : 32;
10839 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
10840 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10841
10842 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
10843 DAG.getConstant(VaListSize, DL, MVT::i32),
10844 Align(PtrSize), false, false, false,
10845 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
10846}
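// Illustrative sizes for the memcpy above: 32 bytes for the five-field AAPCS64
// va_list (20 bytes on ILP32), but only a single pointer's worth (PtrSize) on
// Darwin and Windows, where va_list is just a char *.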
10847
10848SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
10849 assert(Subtarget->isTargetDarwin() &&
10850 "automatic va_arg instruction only works on Darwin");
10851
10852 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10853 EVT VT = Op.getValueType();
10854 SDLoc DL(Op);
10855 SDValue Chain = Op.getOperand(0);
10856 SDValue Addr = Op.getOperand(1);
10857 MaybeAlign Align(Op.getConstantOperandVal(3));
10858 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
10859 auto PtrVT = getPointerTy(DAG.getDataLayout());
10860 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10861 SDValue VAList =
10862 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
10863 Chain = VAList.getValue(1);
10864 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
10865
10866 if (VT.isScalableVector())
10867 report_fatal_error("Passing SVE types to variadic functions is "
10868 "currently not supported");
10869
10870 if (Align && *Align > MinSlotSize) {
10871 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10872 DAG.getConstant(Align->value() - 1, DL, PtrVT));
10873 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
10874 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
10875 }
10876
10877 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
10878 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
10879
10880 // Scalar integer and FP values smaller than 64 bits are implicitly extended
10881 // up to 64 bits. At the very least, we have to increase the striding of the
10882 // vaargs list to match this, and for FP values we need to introduce
10883 // FP_ROUND nodes as well.
10884 if (VT.isInteger() && !VT.isVector())
10885 ArgSize = std::max(ArgSize, MinSlotSize);
10886 bool NeedFPTrunc = false;
10887 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
10888 ArgSize = 8;
10889 NeedFPTrunc = true;
10890 }
10891
10892 // Increment the pointer, VAList, to the next vaarg
10893 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10894 DAG.getConstant(ArgSize, DL, PtrVT));
10895 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
10896
10897 // Store the incremented VAList to the legalized pointer
10898 SDValue APStore =
10899 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
10900
10901 // Load the actual argument out of the pointer VAList
10902 if (NeedFPTrunc) {
10903 // Load the value as an f64.
10904 SDValue WideFP =
10905 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
10906 // Round the value down to an f32.
10907 SDValue NarrowFP =
10908 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
10909 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
10910 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
10911 // Merge the rounded value with the chain output of the load.
10912 return DAG.getMergeValues(Ops, DL);
10913 }
10914
10915 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
10916}
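// Illustrative example for the NeedFPTrunc path above: a Darwin va_arg of
// 'float' consumes an 8-byte slot, so the value is loaded as an f64 and then
// rounded:
//
//   %wide   = load f64 from the current va_list pointer
//   %narrow = fp_round %wide to f32
//
// and the chain output of the load is merged with the rounded result.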
10917
10918SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
10919 SelectionDAG &DAG) const {
10920 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10921 MFI.setFrameAddressIsTaken(true);
10922
10923 EVT VT = Op.getValueType();
10924 SDLoc DL(Op);
10925 unsigned Depth = Op.getConstantOperandVal(0);
10926 SDValue FrameAddr =
10927 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
10928 while (Depth--)
10929 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
10930 MachinePointerInfo());
10931
10932 if (Subtarget->isTargetILP32())
10933 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
10934 DAG.getValueType(VT));
10935
10936 return FrameAddr;
10937}
10938
10939SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
10940 SelectionDAG &DAG) const {
10941 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10942
10943 EVT VT = getPointerTy(DAG.getDataLayout());
10944 SDLoc DL(Op);
10945 int FI = MFI.CreateFixedObject(4, 0, false);
10946 return DAG.getFrameIndex(FI, VT);
10947}
10948
10949#define GET_REGISTER_MATCHER
10950#include "AArch64GenAsmMatcher.inc"
10951
10952// FIXME? Maybe this could be a TableGen attribute on some registers and
10953// this table could be generated automatically from RegInfo.
10954Register AArch64TargetLowering::
10955getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
10956 Register Reg = MatchRegisterName(RegName);
10957 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
10958 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
10959 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
10960 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
10961 !MRI->isReservedReg(MF, Reg))
10962 Reg = 0;
10963 }
10964 if (Reg)
10965 return Reg;
10966 report_fatal_error(Twine("Invalid register name \""
10967 + StringRef(RegName) + "\"."));
10968}
10969
10970SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
10971 SelectionDAG &DAG) const {
10973
10974 EVT VT = Op.getValueType();
10975 SDLoc DL(Op);
10976
10977 SDValue FrameAddr =
10978 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
10979 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10980
10981 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
10982}
10983
10984SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
10985 SelectionDAG &DAG) const {
10986 MachineFunction &MF = DAG.getMachineFunction();
10987 MachineFrameInfo &MFI = MF.getFrameInfo();
10988 MFI.setReturnAddressIsTaken(true);
10989
10990 EVT VT = Op.getValueType();
10991 SDLoc DL(Op);
10992 unsigned Depth = Op.getConstantOperandVal(0);
10993 SDValue ReturnAddress;
10994 if (Depth) {
10995 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
10996 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10997 ReturnAddress = DAG.getLoad(
10998 VT, DL, DAG.getEntryNode(),
10999 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
11000 } else {
11001 // Return LR, which contains the return address. Mark it an implicit
11002 // live-in.
11003 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
11004 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
11005 }
11006
11007 // The XPACLRI instruction assembles to a hint-space instruction before
11008 // Armv8.3-A, so it can be used safely on any pre-Armv8.3-A architecture.
11009 // On Armv8.3-A and onwards XPACI is available, so use
11010 // that instead.
11011 SDNode *St;
11012 if (Subtarget->hasPAuth()) {
11013 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
11014 } else {
11015 // XPACLRI operates on LR therefore we must move the operand accordingly.
11016 SDValue Chain =
11017 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
11018 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
11019 }
11020 return SDValue(St, 0);
11021}
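// Illustrative output for __builtin_return_address(0) when pointer
// authentication is available (hasPAuth): 'xpaci x0' on the copied LR;
// without PAuth the same effect is achieved by moving the value into LR and
// issuing the hint-encoded 'xpaclri'.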
11022
11023/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which returns two
11024/// i32 values and take a 2 x i32 value to shift plus a shift amount.
11025SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
11026 SelectionDAG &DAG) const {
11027 SDValue Lo, Hi;
11028 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
11029 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
11030}
11031
11032 bool AArch64TargetLowering::isOffsetFoldingLegal(
11033 const GlobalAddressSDNode *GA) const {
11034 // Offsets are folded in the DAG combine rather than here so that we can
11035 // intelligently choose an offset based on the uses.
11036 return false;
11037}
11038
11039 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
11040 bool OptForSize) const {
11041 bool IsLegal = false;
11042 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
11043 // and for the 16-bit case when the target has full fp16 support.
11044 // We encode bf16 bit patterns as if they were fp16. This results in very
11045 // strange looking assembly but should populate the register with appropriate
11046 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
11047 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
11048 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
11049 // FIXME: We should be able to handle f128 as well with a clever lowering.
11050 const APInt ImmInt = Imm.bitcastToAPInt();
11051 if (VT == MVT::f64)
11052 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
11053 else if (VT == MVT::f32)
11054 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
11055 else if (VT == MVT::f16 || VT == MVT::bf16)
11056 IsLegal =
11057 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
11058 Imm.isPosZero();
11059
11060 // If we cannot materialize the value in the immediate field of an fmov, check
11061 // if it can be encoded as the immediate operand of a logical instruction.
11062 // The immediate value will be created with either MOVZ, MOVN, or ORR.
11063 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
11064 // generate that fmov.
11065 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
11066 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
11067 // however the mov+fmov sequence is always better because of the reduced
11068 // cache pressure. The timings are still the same if you consider
11069 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
11070 // movw+movk is fused). So we limit the sequence to at most 2 instructions.
11071 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
11072 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
11073 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
11074 IsLegal = Insn.size() <= Limit;
11075 }
11076
11077 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
11078 << " imm value: "; Imm.dump(););
11079 return IsLegal;
11080}
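// Illustrative examples of the rules above (assumed, not exhaustive): +0.0 is
// always legal via 'fmov dN, xzr'; 1.0 fits the 8-bit FP immediate encoding;
// an f32 constant such as 0.1f is not an fmov immediate, but may still be
// reported legal if expandMOVImm can build its bit pattern in few enough
// instructions to feed a 'mov w8, #...; fmov s0, w8' sequence.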
11081
11082//===----------------------------------------------------------------------===//
11083// AArch64 Optimization Hooks
11084//===----------------------------------------------------------------------===//
11085
11086static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
11087 SDValue Operand, SelectionDAG &DAG,
11088 int &ExtraSteps) {
11089 EVT VT = Operand.getValueType();
11090 if ((ST->hasNEON() &&
11091 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
11092 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
11093 VT == MVT::v4f32)) ||
11094 (ST->hasSVE() &&
11095 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
11096 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
11097 // For the reciprocal estimates, convergence is quadratic, so the number
11098 // of digits is doubled after each iteration. In ARMv8, the accuracy of
11099 // the initial estimate is 2^-8. Thus the number of extra steps to refine
11100 // the result for float (23 mantissa bits) is 2 and for double (52
11101 // mantissa bits) is 3.
11102 constexpr unsigned AccurateBits = 8;
11103 unsigned DesiredBits =
11104 APFloat::semanticsPrecision(DAG.EVTToAPFloatSemantics(VT));
11105 ExtraSteps = DesiredBits <= AccurateBits
11106 ? 0
11107 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
11108 }
11109
11110 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
11111 }
11112
11113 return SDValue();
11114}
11115
11116SDValue
11117AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
11118 const DenormalMode &Mode) const {
11119 SDLoc DL(Op);
11120 EVT VT = Op.getValueType();
11121 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
11122 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
11123 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
11124}
11125
11126SDValue
11127AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
11128 SelectionDAG &DAG) const {
11129 return Op;
11130}
11131
11132SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
11133 SelectionDAG &DAG, int Enabled,
11134 int &ExtraSteps,
11135 bool &UseOneConst,
11136 bool Reciprocal) const {
11137 if (Enabled == ReciprocalEstimate::Enabled ||
11138 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
11139 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
11140 DAG, ExtraSteps)) {
11141 SDLoc DL(Operand);
11142 EVT VT = Operand.getValueType();
11143
11144 SDNodeFlags Flags;
11145 Flags.setAllowReassociation(true);
11146
11147 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
11148 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
11149 for (int i = ExtraSteps; i > 0; --i) {
11150 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
11151 Flags);
11152 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
11153 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
11154 }
11155 if (!Reciprocal)
11156 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
11157
11158 ExtraSteps = 0;
11159 return Estimate;
11160 }
11161
11162 return SDValue();
11163}
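// Illustrative expansion for one refinement step of the estimate above
// (e.g. v4f32 with the rsqrt estimate enabled); each step is FMUL + FRSQRTS +
// FMUL:
//
//   frsqrte v1.4s, v0.4s
//   fmul    v2.4s, v1.4s, v1.4s
//   frsqrts v2.4s, v0.4s, v2.4s
//   fmul    v1.4s, v1.4s, v2.4s
//
// and the non-reciprocal form multiplies by the operand once at the end.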
11164
11165SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
11166 SelectionDAG &DAG, int Enabled,
11167 int &ExtraSteps) const {
11168 if (Enabled == ReciprocalEstimate::Enabled)
11169 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
11170 DAG, ExtraSteps)) {
11171 SDLoc DL(Operand);
11172 EVT VT = Operand.getValueType();
11173
11174 SDNodeFlags Flags;
11175 Flags.setAllowReassociation(true);
11176
11177 // Newton reciprocal iteration: E * (2 - X * E)
11178 // AArch64 reciprocal iteration instruction: (2 - M * N)
11179 for (int i = ExtraSteps; i > 0; --i) {
11180 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
11181 Estimate, Flags);
11182 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
11183 }
11184
11185 ExtraSteps = 0;
11186 return Estimate;
11187 }
11188
11189 return SDValue();
11190}
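// Illustrative refinement step for the reciprocal estimate above, e.g. for
// v2f64 with the recip estimate enabled:
//
//   frecpe v1.2d, v0.2d
//   frecps v2.2d, v0.2d, v1.2d
//   fmul   v1.2d, v1.2d, v2.2d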
11191
11192//===----------------------------------------------------------------------===//
11193// AArch64 Inline Assembly Support
11194//===----------------------------------------------------------------------===//
11195
11196// Table of Constraints
11197// TODO: This is the current set of constraints supported by ARM for the
11198// compiler, not all of them may make sense.
11199//
11200// r - A general register
11201// w - An FP/SIMD register of some size in the range v0-v31
11202// x - An FP/SIMD register of some size in the range v0-v15
11203// I - Constant that can be used with an ADD instruction
11204// J - Constant that can be used with a SUB instruction
11205// K - Constant that can be used with a 32-bit logical instruction
11206// L - Constant that can be used with a 64-bit logical instruction
11207// M - Constant that can be used as a 32-bit MOV immediate
11208// N - Constant that can be used as a 64-bit MOV immediate
11209// Q - A memory reference with base register and no offset
11210// S - A symbolic address
11211// Y - Floating point constant zero
11212// Z - Integer constant zero
11213//
11214// Note that general register operands will be output using their 64-bit x
11215// register name, whatever the size of the variable, unless the asm operand
11216// is prefixed by the %w modifier. Floating-point and SIMD register operands
11217// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
11218// %q modifier.
11219const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
11220 // At this point, we have to lower this constraint to something else, so we
11221 // lower it to an "r" or "w". However, by doing this we will force the result
11222 // to be in register, while the X constraint is much more permissive.
11223 //
11224 // Although we are correct (we are free to emit anything, without
11225 // constraints), we might break use cases that would expect us to be more
11226 // efficient and emit something else.
11227 if (!Subtarget->hasFPARMv8())
11228 return "r";
11229
11230 if (ConstraintVT.isFloatingPoint())
11231 return "w";
11232
11233 if (ConstraintVT.isVector() &&
11234 (ConstraintVT.getSizeInBits() == 64 ||
11235 ConstraintVT.getSizeInBits() == 128))
11236 return "w";
11237
11238 return "r";
11239}
11240
11241 enum class PredicateConstraint { Uph, Upl, Upa };
11242
11243static std::optional<PredicateConstraint>
11244 parsePredicateConstraint(StringRef Constraint) {
11245 return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
11246 .Case("Uph", PredicateConstraint::Uph)
11247 .Case("Upl", PredicateConstraint::Upl)
11248 .Case("Upa", PredicateConstraint::Upa)
11249 .Default(std::nullopt);
11250}
11251
11252static const TargetRegisterClass *
11253 getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
11254 if (VT != MVT::aarch64svcount &&
11255 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
11256 return nullptr;
11257
11258 switch (Constraint) {
11259 case PredicateConstraint::Uph:
11260 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
11261 : &AArch64::PPR_p8to15RegClass;
11262 case PredicateConstraint::Upl:
11263 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
11264 : &AArch64::PPR_3bRegClass;
11265 case PredicateConstraint::Upa:
11266 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
11267 : &AArch64::PPRRegClass;
11268 }
11269
11270 llvm_unreachable("Missing PredicateConstraint!");
11271}
11272
11273 enum class ReducedGprConstraint { Uci, Ucj };
11274
11275static std::optional<ReducedGprConstraint>
11276 parseReducedGprConstraint(StringRef Constraint) {
11277 return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
11278 .Case("Uci", ReducedGprConstraint::Uci)
11279 .Case("Ucj", ReducedGprConstraint::Ucj)
11280 .Default(std::nullopt);
11281}
11282
11283static const TargetRegisterClass *
11284 getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
11285 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
11286 return nullptr;
11287
11288 switch (Constraint) {
11289 case ReducedGprConstraint::Uci:
11290 return &AArch64::MatrixIndexGPR32_8_11RegClass;
11291 case ReducedGprConstraint::Ucj:
11292 return &AArch64::MatrixIndexGPR32_12_15RegClass;
11293 }
11294
11295 llvm_unreachable("Missing ReducedGprConstraint!");
11296}
11297
11298// The set of cc code supported is from
11299// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
11300 static AArch64CC::CondCode parseConstraintCode(StringRef Constraint) {
11301 AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
11302 .Case("{@cchi}", AArch64CC::HI)
11303 .Case("{@cccs}", AArch64CC::HS)
11304 .Case("{@cclo}", AArch64CC::LO)
11305 .Case("{@ccls}", AArch64CC::LS)
11306 .Case("{@cccc}", AArch64CC::LO)
11307 .Case("{@cceq}", AArch64CC::EQ)
11308 .Case("{@ccgt}", AArch64CC::GT)
11309 .Case("{@ccge}", AArch64CC::GE)
11310 .Case("{@cclt}", AArch64CC::LT)
11311 .Case("{@ccle}", AArch64CC::LE)
11312 .Case("{@cchs}", AArch64CC::HS)
11313 .Case("{@ccne}", AArch64CC::NE)
11314 .Case("{@ccvc}", AArch64CC::VC)
11315 .Case("{@ccpl}", AArch64CC::PL)
11316 .Case("{@ccvs}", AArch64CC::VS)
11317 .Case("{@ccmi}", AArch64CC::MI)
11318 .Default(AArch64CC::Invalid);
11319 return Cond;
11320}
11321
11322/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
11323/// WZR, invert(<cond>)'.
11324 static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
11325 SelectionDAG &DAG) {
11326 return DAG.getNode(
11327 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
11328 DAG.getConstant(0, DL, MVT::i32),
11329 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
11330}
11331
11332// Lower @cc flag output via getSETCC.
11333SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
11334 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
11335 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
11336 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
11337 if (Cond == AArch64CC::Invalid)
11338 return SDValue();
11339 // The output variable should be a scalar integer.
11340 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
11341 OpInfo.ConstraintVT.getSizeInBits() < 8)
11342 report_fatal_error("Flag output operand is of invalid type");
11343
11344 // Get NZCV register. Only update chain when copyfrom is glued.
11345 if (Glue.getNode()) {
11346 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
11347 Chain = Glue.getValue(1);
11348 } else
11349 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
11350 // Extract CC code.
11351 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
11352
11353 SDValue Result;
11354
11355 // Truncate or ZERO_EXTEND based on value types.
11356 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
11357 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
11358 else
11359 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
11360
11361 return Result;
11362}
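// Illustrative (hypothetical) use of the flag-output constraints handled
// above:
//
//   int eq;
//   asm("cmp %w1, %w2" : "=@cceq"(eq) : "r"(a), "r"(b));
//
// NZCV is copied out of the asm and materialised with a CSET (the CSINC form
// built by getSETCC), then truncated or zero-extended to the type of 'eq'.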
11363
11364/// getConstraintType - Given a constraint letter, return the type of
11365/// constraint it is for this target.
11366 AArch64TargetLowering::ConstraintType
11367AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
11368 if (Constraint.size() == 1) {
11369 switch (Constraint[0]) {
11370 default:
11371 break;
11372 case 'x':
11373 case 'w':
11374 case 'y':
11375 return C_RegisterClass;
11376 // An address with a single base register. Due to the way we
11377 // currently handle addresses it is the same as 'r'.
11378 case 'Q':
11379 return C_Memory;
11380 case 'I':
11381 case 'J':
11382 case 'K':
11383 case 'L':
11384 case 'M':
11385 case 'N':
11386 case 'Y':
11387 case 'Z':
11388 return C_Immediate;
11389 case 'z':
11390 case 'S': // A symbol or label reference with a constant offset
11391 return C_Other;
11392 }
11393 } else if (parsePredicateConstraint(Constraint))
11394 return C_RegisterClass;
11395 else if (parseReducedGprConstraint(Constraint))
11396 return C_RegisterClass;
11397 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
11398 return C_Other;
11399 return TargetLowering::getConstraintType(Constraint);
11400}
11401
11402/// Examine constraint type and operand type and determine a weight value.
11403/// This object must already have been set up with the operand type
11404/// and the current alternative constraint selected.
11406AArch64TargetLowering::getSingleConstraintMatchWeight(
11407 AsmOperandInfo &info, const char *constraint) const {
11409 Value *CallOperandVal = info.CallOperandVal;
11410 // If we don't have a value, we can't do a match,
11411 // but allow it at the lowest weight.
11412 if (!CallOperandVal)
11413 return CW_Default;
11414 Type *type = CallOperandVal->getType();
11415 // Look at the constraint type.
11416 switch (*constraint) {
11417 default:
11419 break;
11420 case 'x':
11421 case 'w':
11422 case 'y':
11423 if (type->isFloatingPointTy() || type->isVectorTy())
11424 weight = CW_Register;
11425 break;
11426 case 'z':
11427 weight = CW_Constant;
11428 break;
11429 case 'U':
11430 if (parsePredicateConstraint(constraint) ||
11431 parseReducedGprConstraint(constraint))
11432 weight = CW_Register;
11433 break;
11434 }
11435 return weight;
11436}
11437
11438std::pair<unsigned, const TargetRegisterClass *>
11439AArch64TargetLowering::getRegForInlineAsmConstraint(
11440 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
11441 if (Constraint.size() == 1) {
11442 switch (Constraint[0]) {
11443 case 'r':
11444 if (VT.isScalableVector())
11445 return std::make_pair(0U, nullptr);
11446 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
11447 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
11448 if (VT.getFixedSizeInBits() == 64)
11449 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
11450 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
11451 case 'w': {
11452 if (!Subtarget->hasFPARMv8())
11453 break;
11454 if (VT.isScalableVector()) {
11455 if (VT.getVectorElementType() != MVT::i1)
11456 return std::make_pair(0U, &AArch64::ZPRRegClass);
11457 return std::make_pair(0U, nullptr);
11458 }
11459 uint64_t VTSize = VT.getFixedSizeInBits();
11460 if (VTSize == 16)
11461 return std::make_pair(0U, &AArch64::FPR16RegClass);
11462 if (VTSize == 32)
11463 return std::make_pair(0U, &AArch64::FPR32RegClass);
11464 if (VTSize == 64)
11465 return std::make_pair(0U, &AArch64::FPR64RegClass);
11466 if (VTSize == 128)
11467 return std::make_pair(0U, &AArch64::FPR128RegClass);
11468 break;
11469 }
11470 // The instructions that this constraint is designed for can
11471 // only take 128-bit registers so just use that regclass.
11472 case 'x':
11473 if (!Subtarget->hasFPARMv8())
11474 break;
11475 if (VT.isScalableVector())
11476 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
11477 if (VT.getSizeInBits() == 128)
11478 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
11479 break;
11480 case 'y':
11481 if (!Subtarget->hasFPARMv8())
11482 break;
11483 if (VT.isScalableVector())
11484 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
11485 break;
11486 }
11487 } else {
11488 if (const auto PC = parsePredicateConstraint(Constraint))
11489 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
11490 return std::make_pair(0U, RegClass);
11491
11492 if (const auto RGC = parseReducedGprConstraint(Constraint))
11493 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
11494 return std::make_pair(0U, RegClass);
11495 }
11496 if (StringRef("{cc}").equals_insensitive(Constraint) ||
11498 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
11499
11500 if (Constraint == "{za}") {
11501 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
11502 }
11503
11504 if (Constraint == "{zt0}") {
11505 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
11506 }
11507
11508 // Use the default implementation in TargetLowering to convert the register
11509 // constraint into a member of a register class.
11510 std::pair<unsigned, const TargetRegisterClass *> Res;
11512
11513 // Not found as a standard register?
11514 if (!Res.second) {
11515 unsigned Size = Constraint.size();
11516 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
11517 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
11518 int RegNo;
11519 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
11520 if (!Failed && RegNo >= 0 && RegNo <= 31) {
11521 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
11522 // By default we'll emit v0-v31 for this unless there's a modifier, in which
11523 // case we'll emit the correctly sized register.
11524 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
11525 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
11526 Res.second = &AArch64::FPR64RegClass;
11527 } else {
11528 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
11529 Res.second = &AArch64::FPR128RegClass;
11530 }
11531 }
11532 }
11533 }
11534
11535 if (Res.second && !Subtarget->hasFPARMv8() &&
11536 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
11537 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
11538 return std::make_pair(0U, nullptr);
11539
11540 return Res;
11541}
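// Illustrative mappings performed above (assuming FP/SIMD is available):
//   "w" with an f32 operand        -> FPR32 (s-registers)
//   "x" with a 128-bit vector      -> FPR128_lo (v0-v15 only)
//   "y" with a scalable vector     -> ZPR_3b (z0-z7)
//   "{v7}" with a 64-bit vector    -> d7 in FPR64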
11542
11544 llvm::Type *Ty,
11545 bool AllowUnknown) const {
11546 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
11547 return EVT(MVT::i64x8);
11548
11549 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
11550}
11551
11552/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
11553/// vector. If it is invalid, don't add anything to Ops.
11554void AArch64TargetLowering::LowerAsmOperandForConstraint(
11555 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
11556 SelectionDAG &DAG) const {
11557 SDValue Result;
11558
11559 // Currently only support length 1 constraints.
11560 if (Constraint.size() != 1)
11561 return;
11562
11563 char ConstraintLetter = Constraint[0];
11564 switch (ConstraintLetter) {
11565 default:
11566 break;
11567
11568 // This set of constraints deal with valid constants for various instructions.
11569 // Validate and return a target constant for them if we can.
11570 case 'z': {
11571 // 'z' maps to xzr or wzr so it needs an input of 0.
11572 if (!isNullConstant(Op))
11573 return;
11574
11575 if (Op.getValueType() == MVT::i64)
11576 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
11577 else
11578 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
11579 break;
11580 }
11581 case 'S':
11582 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
11583 // supported for PIC while "s" isn't, making "s" less useful. We implement
11584 // "S" but not "s".
11586 break;
11587
11588 case 'I':
11589 case 'J':
11590 case 'K':
11591 case 'L':
11592 case 'M':
11593 case 'N':
11594 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
11595 if (!C)
11596 return;
11597
11598 // Grab the value and do some validation.
11599 uint64_t CVal = C->getZExtValue();
11600 switch (ConstraintLetter) {
11601 // The I constraint applies only to simple ADD or SUB immediate operands:
11602 // i.e. 0 to 4095 with optional shift by 12
11603 // The J constraint applies only to ADD or SUB immediates that would be
11604 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
11605 // instruction [or vice versa], in other words -1 to -4095 with optional
11606 // left shift by 12.
11607 case 'I':
11608 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
11609 break;
11610 return;
11611 case 'J': {
11612 uint64_t NVal = -C->getSExtValue();
11613 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
11614 CVal = C->getSExtValue();
11615 break;
11616 }
11617 return;
11618 }
11619 // The K and L constraints apply *only* to logical immediates, including
11620 // what used to be the MOVI alias for ORR (though the MOVI alias has now
11621 // been removed and MOV should be used). So these constraints have to
11622 // distinguish between bit patterns that are valid 32-bit or 64-bit
11623 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
11624 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
11625 // versa.
11626 case 'K':
11627 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11628 break;
11629 return;
11630 case 'L':
11631 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11632 break;
11633 return;
11634 // The M and N constraints are a superset of K and L respectively, for use
11635 // with the MOV (immediate) alias. As well as the logical immediates they
11636 // also match 32 or 64-bit immediates that can be loaded either using a
11637 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
11638 // (M) or 64-bit 0x1234000000000000 (N) etc.
11639 // As a note some of this code is liberally stolen from the asm parser.
11640 case 'M': {
11641 if (!isUInt<32>(CVal))
11642 return;
11643 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11644 break;
11645 if ((CVal & 0xFFFF) == CVal)
11646 break;
11647 if ((CVal & 0xFFFF0000ULL) == CVal)
11648 break;
11649 uint64_t NCVal = ~(uint32_t)CVal;
11650 if ((NCVal & 0xFFFFULL) == NCVal)
11651 break;
11652 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11653 break;
11654 return;
11655 }
11656 case 'N': {
11657 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11658 break;
11659 if ((CVal & 0xFFFFULL) == CVal)
11660 break;
11661 if ((CVal & 0xFFFF0000ULL) == CVal)
11662 break;
11663 if ((CVal & 0xFFFF00000000ULL) == CVal)
11664 break;
11665 if ((CVal & 0xFFFF000000000000ULL) == CVal)
11666 break;
11667 uint64_t NCVal = ~CVal;
11668 if ((NCVal & 0xFFFFULL) == NCVal)
11669 break;
11670 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11671 break;
11672 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
11673 break;
11674 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
11675 break;
11676 return;
11677 }
11678 default:
11679 return;
11680 }
11681
11682 // All assembler immediates are 64-bit integers.
11683 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
11684 break;
11685 }
11686
11687 if (Result.getNode()) {
11688 Ops.push_back(Result);
11689 return;
11690 }
11691
11692 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
11693}
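// Illustrative immediates accepted by the validation above (not exhaustive):
//   'I' : 0..4095, optionally shifted left by 12 (e.g. 4095, 0xfff000)
//   'J' : the negated forms of 'I' (e.g. -4095)
//   'K' : 32-bit logical immediates such as 0xaaaaaaaa
//   'L' : 64-bit logical immediates such as 0xaaaaaaaaaaaaaaaa
//   'M'/'N': additionally anything a single 32/64-bit MOVZ or MOVN can build,
//            e.g. 0x12340000 (M) or 0x1234000000000000 (N)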
11694
11695//===----------------------------------------------------------------------===//
11696// AArch64 Advanced SIMD Support
11697//===----------------------------------------------------------------------===//
11698
11699/// WidenVector - Given a value in the V64 register class, produce the
11700/// equivalent value in the V128 register class.
11702 EVT VT = V64Reg.getValueType();
11703 unsigned NarrowSize = VT.getVectorNumElements();
11704 MVT EltTy = VT.getVectorElementType().getSimpleVT();
11705 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
11706 SDLoc DL(V64Reg);
11707
11708 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
11709 V64Reg, DAG.getConstant(0, DL, MVT::i64));
11710}
11711
11712/// getExtFactor - Determine the adjustment factor for the position when
11713/// generating an "extract from vector registers" instruction.
11714static unsigned getExtFactor(SDValue &V) {
11715 EVT EltType = V.getValueType().getVectorElementType();
11716 return EltType.getSizeInBits() / 8;
11717}
11718
11719// Check if a vector is built from one vector via extracted elements of
11720// another together with an AND mask, ensuring that all elements fit
11721// within range. This can be reconstructed using AND and NEON's TBL1.
11723 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11724 SDLoc dl(Op);
11725 EVT VT = Op.getValueType();
11726 assert(!VT.isScalableVector() &&
11727 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11728
11729 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
11730 // directly to TBL1.
11731 if (VT != MVT::v16i8 && VT != MVT::v8i8)
11732 return SDValue();
11733
11734 unsigned NumElts = VT.getVectorNumElements();
11735 assert((NumElts == 8 || NumElts == 16) &&
11736 "Need to have exactly 8 or 16 elements in vector.");
11737
11738 SDValue SourceVec;
11739 SDValue MaskSourceVec;
11740 SmallVector<SDValue, 16> AndMaskConstants;
11741
11742 for (unsigned i = 0; i < NumElts; ++i) {
11743 SDValue V = Op.getOperand(i);
11744 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11745 return SDValue();
11746
11747 SDValue OperandSourceVec = V.getOperand(0);
11748 if (!SourceVec)
11749 SourceVec = OperandSourceVec;
11750 else if (SourceVec != OperandSourceVec)
11751 return SDValue();
11752
11753 // This only looks at shuffles with elements that are
11754 // a) truncated by a constant AND mask extracted from a mask vector, or
11755 // b) extracted directly from a mask vector.
11756 SDValue MaskSource = V.getOperand(1);
11757 if (MaskSource.getOpcode() == ISD::AND) {
11758 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
11759 return SDValue();
11760
11761 AndMaskConstants.push_back(MaskSource.getOperand(1));
11762 MaskSource = MaskSource->getOperand(0);
11763 } else if (!AndMaskConstants.empty()) {
11764 // Either all or no operands should have an AND mask.
11765 return SDValue();
11766 }
11767
11768 // An ANY_EXTEND may be inserted between the AND and the source vector
11769 // extraction. We don't care about that, so we can just skip it.
11770 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
11771 MaskSource = MaskSource.getOperand(0);
11772
11773 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11774 return SDValue();
11775
11776 SDValue MaskIdx = MaskSource.getOperand(1);
11777 if (!isa<ConstantSDNode>(MaskIdx) ||
11778 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
11779 return SDValue();
11780
11781 // We only apply this if all elements come from the same vector with the
11782 // same vector type.
11783 if (!MaskSourceVec) {
11784 MaskSourceVec = MaskSource->getOperand(0);
11785 if (MaskSourceVec.getValueType() != VT)
11786 return SDValue();
11787 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
11788 return SDValue();
11789 }
11790 }
11791
11792 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
11793 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
11794 // insert, we know that the index in the mask must be smaller than the number
11795 // of elements in the source, or we would have an out-of-bounds access.
11796 if (NumElts == 8)
11797 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
11798 DAG.getUNDEF(VT));
11799
11800 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
11801 if (!AndMaskConstants.empty())
11802 MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
11803 DAG.getBuildVector(VT, dl, AndMaskConstants));
11804
11805 return DAG.getNode(
11807 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
11808 MaskSourceVec);
11809}
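// Illustrative input handled above, assuming a v16i8 build_vector where lane i
// is extractelement(Src, and(extractelement(MaskVec, i), 15)): the whole
// pattern becomes an AND of MaskVec with the splatted constant followed by a
// single 'tbl v0.16b, { v1.16b }, v2.16b', with the masked MaskVec supplying
// the table indices.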
11810
11811// Gather data to see if the operation can be modelled as a
11812// shuffle in combination with VEXTs.
11814 SelectionDAG &DAG) const {
11815 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11816 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
11817 SDLoc dl(Op);
11818 EVT VT = Op.getValueType();
11819 assert(!VT.isScalableVector() &&
11820 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11821 unsigned NumElts = VT.getVectorNumElements();
11822
11823 struct ShuffleSourceInfo {
11824 SDValue Vec;
11825 unsigned MinElt;
11826 unsigned MaxElt;
11827
11828 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
11829 // be compatible with the shuffle we intend to construct. As a result
11830 // ShuffleVec will be some sliding window into the original Vec.
11831 SDValue ShuffleVec;
11832
11833 // Code should guarantee that element i in Vec starts at element
11834 // "WindowBase + i * WindowScale" in ShuffleVec.
11835 int WindowBase;
11836 int WindowScale;
11837
11838 ShuffleSourceInfo(SDValue Vec)
11839 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
11840 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
11841
11842 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
11843 };
11844
11845 // First gather all vectors used as an immediate source for this BUILD_VECTOR
11846 // node.
11848 for (unsigned i = 0; i < NumElts; ++i) {
11849 SDValue V = Op.getOperand(i);
11850 if (V.isUndef())
11851 continue;
11852 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11853 !isa<ConstantSDNode>(V.getOperand(1)) ||
11854 V.getOperand(0).getValueType().isScalableVector()) {
11855 LLVM_DEBUG(
11856 dbgs() << "Reshuffle failed: "
11857 "a shuffle can only come from building a vector from "
11858 "various elements of other fixed-width vectors, provided "
11859 "their indices are constant\n");
11860 return SDValue();
11861 }
11862
11863 // Add this element source to the list if it's not already there.
11864 SDValue SourceVec = V.getOperand(0);
11865 auto Source = find(Sources, SourceVec);
11866 if (Source == Sources.end())
11867 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
11868
11869 // Update the minimum and maximum lane number seen.
11870 unsigned EltNo = V.getConstantOperandVal(1);
11871 Source->MinElt = std::min(Source->MinElt, EltNo);
11872 Source->MaxElt = std::max(Source->MaxElt, EltNo);
11873 }
11874
11875 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
11876 // better than moving to/from gpr registers for larger vectors.
11877 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
11878 // Construct a mask for the tbl. We may need to adjust the index for types
11879 // larger than i8.
11881 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
11882 for (unsigned I = 0; I < NumElts; ++I) {
11883 SDValue V = Op.getOperand(I);
11884 if (V.isUndef()) {
11885 for (unsigned OF = 0; OF < OutputFactor; OF++)
11886 Mask.push_back(-1);
11887 continue;
11888 }
11889 // Set the Mask lanes adjusted for the size of the input and output
11890 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
11891 // output element, adjusted in their positions per input and output types.
11892 unsigned Lane = V.getConstantOperandVal(1);
11893 for (unsigned S = 0; S < Sources.size(); S++) {
11894 if (V.getOperand(0) == Sources[S].Vec) {
11895 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
11896 unsigned InputBase = 16 * S + Lane * InputSize / 8;
11897 for (unsigned OF = 0; OF < OutputFactor; OF++)
11898 Mask.push_back(InputBase + OF);
11899 break;
11900 }
11901 }
11902 }
11903
11904 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
11905 // v16i8, and the TBLMask
11906 SmallVector<SDValue, 16> TBLOperands;
11907 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
11908 ? Intrinsic::aarch64_neon_tbl3
11909 : Intrinsic::aarch64_neon_tbl4,
11910 dl, MVT::i32));
11911 for (unsigned i = 0; i < Sources.size(); i++) {
11912 SDValue Src = Sources[i].Vec;
11913 EVT SrcVT = Src.getValueType();
11914 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
11915 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
11916 "Expected a legally typed vector");
11917 if (SrcVT.is64BitVector())
11918 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
11919 DAG.getUNDEF(MVT::v8i8));
11920 TBLOperands.push_back(Src);
11921 }
11922
11924 for (unsigned i = 0; i < Mask.size(); i++)
11925 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
11926 assert((Mask.size() == 8 || Mask.size() == 16) &&
11927 "Expected a v8i8 or v16i8 Mask");
11928 TBLOperands.push_back(
11929 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
11930
11931 SDValue Shuffle =
11933 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
11934 return DAG.getBitcast(VT, Shuffle);
11935 }
11936
11937 if (Sources.size() > 2) {
11938 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
11939 << "sensible when at most two source vectors are "
11940 << "involved\n");
11941 return SDValue();
11942 }
11943
11944 // Find out the smallest element size among result and two sources, and use
11945 // it as element size to build the shuffle_vector.
11946 EVT SmallestEltTy = VT.getVectorElementType();
11947 for (auto &Source : Sources) {
11948 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
11949 if (SrcEltTy.bitsLT(SmallestEltTy)) {
11950 SmallestEltTy = SrcEltTy;
11951 }
11952 }
11953 unsigned ResMultiplier =
11954 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11955 uint64_t VTSize = VT.getFixedSizeInBits();
11956 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
11957 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
11958
11959 // If the source vector is too wide or too narrow, we may nevertheless be able
11960 // to construct a compatible shuffle either by concatenating it with UNDEF or
11961 // extracting a suitable range of elements.
11962 for (auto &Src : Sources) {
11963 EVT SrcVT = Src.ShuffleVec.getValueType();
11964
11965 TypeSize SrcVTSize = SrcVT.getSizeInBits();
11966 if (SrcVTSize == TypeSize::getFixed(VTSize))
11967 continue;
11968
11969 // This stage of the search produces a source with the same element type as
11970 // the original, but with a total width matching the BUILD_VECTOR output.
11971 EVT EltVT = SrcVT.getVectorElementType();
11972 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
11973 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
11974
11975 if (SrcVTSize.getFixedValue() < VTSize) {
11976 assert(2 * SrcVTSize == VTSize);
11977 // We can pad out the smaller vector for free by concatenating it with
11978 // UNDEF, so do that and carry on with the shuffle.
11979 Src.ShuffleVec =
11980 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
11981 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
11982 continue;
11983 }
11984
11985 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
11986 LLVM_DEBUG(
11987 dbgs() << "Reshuffle failed: result vector too small to extract\n");
11988 return SDValue();
11989 }
11990
11991 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
11992 LLVM_DEBUG(
11993 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
11994 return SDValue();
11995 }
11996
11997 if (Src.MinElt >= NumSrcElts) {
11998 // The extraction can just take the second half
11999 Src.ShuffleVec =
12000 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12001 DAG.getConstant(NumSrcElts, dl, MVT::i64));
12002 Src.WindowBase = -NumSrcElts;
12003 } else if (Src.MaxElt < NumSrcElts) {
12004 // The extraction can just take the first half
12005 Src.ShuffleVec =
12006 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12007 DAG.getConstant(0, dl, MVT::i64));
12008 } else {
12009 // An actual VEXT is needed
12010 SDValue VEXTSrc1 =
12011 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12012 DAG.getConstant(0, dl, MVT::i64));
12013 SDValue VEXTSrc2 =
12014 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12015 DAG.getConstant(NumSrcElts, dl, MVT::i64));
12016 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
12017
12018 if (!SrcVT.is64BitVector()) {
12019 LLVM_DEBUG(
12020 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
12021 "for SVE vectors.");
12022 return SDValue();
12023 }
12024
12025 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
12026 VEXTSrc2,
12027 DAG.getConstant(Imm, dl, MVT::i32));
12028 Src.WindowBase = -Src.MinElt;
12029 }
12030 }
12031
12032 // Another possible incompatibility occurs from the vector element types. We
12033 // can fix this by bitcasting the source vectors to the same type we intend
12034 // for the shuffle.
12035 for (auto &Src : Sources) {
12036 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
12037 if (SrcEltTy == SmallestEltTy)
12038 continue;
12039 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
12040 if (DAG.getDataLayout().isBigEndian()) {
12041 Src.ShuffleVec =
12042 DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
12043 } else {
12044 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
12045 }
12046 Src.WindowScale =
12047 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
12048 Src.WindowBase *= Src.WindowScale;
12049 }
12050
12051 // Final check before we try to actually produce a shuffle.
12052 LLVM_DEBUG(for (auto Src
12053 : Sources)
12054 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
12055
12056 // The stars all align, our next step is to produce the mask for the shuffle.
12057 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
12058 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
12059 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
12060 SDValue Entry = Op.getOperand(i);
12061 if (Entry.isUndef())
12062 continue;
12063
12064 auto Src = find(Sources, Entry.getOperand(0));
12065 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
12066
12067 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
12068 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
12069 // segment.
12070 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
12071 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
12072 VT.getScalarSizeInBits());
12073 int LanesDefined = BitsDefined / BitsPerShuffleLane;
12074
12075 // This source is expected to fill ResMultiplier lanes of the final shuffle,
12076 // starting at the appropriate offset.
12077 int *LaneMask = &Mask[i * ResMultiplier];
12078
12079 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
12080 ExtractBase += NumElts * (Src - Sources.begin());
12081 for (int j = 0; j < LanesDefined; ++j)
12082 LaneMask[j] = ExtractBase + j;
12083 }
12084
12085 // Final check before we try to produce nonsense...
12086 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
12087 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
12088 return SDValue();
12089 }
12090
12091 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
12092 for (unsigned i = 0; i < Sources.size(); ++i)
12093 ShuffleOps[i] = Sources[i].ShuffleVec;
12094
12095 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
12096 ShuffleOps[1], Mask);
12097 SDValue V;
12098 if (DAG.getDataLayout().isBigEndian()) {
12099 V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
12100 } else {
12101 V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
12102 }
12103
12104 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
12105 dbgs() << "Reshuffle, creating node: "; V.dump(););
12106
12107 return V;
12108}
12109
12110// check if an EXT instruction can handle the shuffle mask when the
12111// vector sources of the shuffle are the same.
12112static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
12113 unsigned NumElts = VT.getVectorNumElements();
12114
12115 // Assume that the first shuffle index is not UNDEF. Fail if it is.
12116 if (M[0] < 0)
12117 return false;
12118
12119 Imm = M[0];
12120
12121 // If this is a VEXT shuffle, the immediate value is the index of the first
12122 // element. The other shuffle indices must be the successive elements after
12123 // the first one.
12124 unsigned ExpectedElt = Imm;
12125 for (unsigned i = 1; i < NumElts; ++i) {
12126 // Increment the expected index. If it wraps around, just follow it
12127 // back to index zero and keep going.
12128 ++ExpectedElt;
12129 if (ExpectedElt == NumElts)
12130 ExpectedElt = 0;
12131
12132 if (M[i] < 0)
12133 continue; // ignore UNDEF indices
12134 if (ExpectedElt != static_cast<unsigned>(M[i]))
12135 return false;
12136 }
12137
12138 return true;
12139}
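As a side note, the mask convention used above is easy to model outside of SelectionDAG. The standalone sketch below (plain C++ with a hypothetical isRotationMask helper; illustrative only, not part of this file) applies the same rule: a single-source EXT mask is a rotation of <0, ..., NumElts-1>, with -1 entries treated as don't-care.

#include <cassert>
#include <vector>

// Returns true if M is a rotation of 0..M.size()-1 (with -1 allowed as
// undef), writing the rotation amount to Imm. Mirrors the check above,
// but over a plain std::vector for illustration.
static bool isRotationMask(const std::vector<int> &M, unsigned &Imm) {
  if (M.empty() || M[0] < 0)
    return false;
  const unsigned NumElts = static_cast<unsigned>(M.size());
  Imm = static_cast<unsigned>(M[0]);
  unsigned Expected = Imm;
  for (unsigned i = 1; i < NumElts; ++i) {
    if (++Expected == NumElts)
      Expected = 0; // wrap back to lane 0
    if (M[i] >= 0 && static_cast<unsigned>(M[i]) != Expected)
      return false;
  }
  return true;
}

int main() {
  unsigned Imm = 0;
  assert(isRotationMask({1, 2, 3, 0}, Imm) && Imm == 1);  // EXT #1
  assert(isRotationMask({3, -1, 1, 2}, Imm) && Imm == 3); // undef tolerated
  assert(!isRotationMask({0, 2, 1, 3}, Imm));             // not a rotation
  return 0;
}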
12140
12141// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
12142// v4i32s. This is really a truncate, which we can construct out of (legal)
12143// concats and truncate nodes.
12144 static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
12145 if (V.getValueType() != MVT::v16i8)
12146 return SDValue();
12147 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
12148
12149 for (unsigned X = 0; X < 4; X++) {
12150 // Check the first item in each group is an extract from lane 0 of a v4i32
12151 // or v4i16.
12152 SDValue BaseExt = V.getOperand(X * 4);
12153 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12154 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
12155 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
12156 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
12157 BaseExt.getConstantOperandVal(1) != 0)
12158 return SDValue();
12159 SDValue Base = BaseExt.getOperand(0);
12160 // And check the other items are extracts from the same vector.
12161 for (unsigned Y = 1; Y < 4; Y++) {
12162 SDValue Ext = V.getOperand(X * 4 + Y);
12163 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12164 Ext.getOperand(0) != Base ||
12165 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
12166 Ext.getConstantOperandVal(1) != Y)
12167 return SDValue();
12168 }
12169 }
12170
12171 // Turn the buildvector into a series of truncates and concats, which will
12172 // become uzp1 instructions. Any v4i32s we found get truncated to v4i16, which
12173 // are concatenated together to produce two v8i16s. These are both truncated
12174 // and concatenated together.
12175 SDLoc DL(V);
12176 SDValue Trunc[4] = {
12177 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
12178 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
12179 for (SDValue &V : Trunc)
12180 if (V.getValueType() == MVT::v4i32)
12181 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
12182 SDValue Concat0 =
12183 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
12184 SDValue Concat1 =
12185 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
12186 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
12187 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
12188 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
12189}
12190
12191/// Check if a vector shuffle corresponds to a DUP instructions with a larger
12192/// element width than the vector lane type. If that is the case the function
12193/// returns true and writes the value of the DUP instruction lane operand into
12194/// DupLaneOp
12195static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
12196 unsigned &DupLaneOp) {
12197 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
12198 "Only possible block sizes for wide DUP are: 16, 32, 64");
12199
12200 if (BlockSize <= VT.getScalarSizeInBits())
12201 return false;
12202 if (BlockSize % VT.getScalarSizeInBits() != 0)
12203 return false;
12204 if (VT.getSizeInBits() % BlockSize != 0)
12205 return false;
12206
12207 size_t SingleVecNumElements = VT.getVectorNumElements();
12208 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
12209 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
12210
12211 // We are looking for masks like
12212 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
12213 // might be replaced by 'undefined'. BlockElts will eventually contain
12214 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
12215 // for the above examples).
12216 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
12217 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
12218 for (size_t I = 0; I < NumEltsPerBlock; I++) {
12219 int Elt = M[BlockIndex * NumEltsPerBlock + I];
12220 if (Elt < 0)
12221 continue;
12222 // For now we don't support shuffles that use the second operand
12223 if ((unsigned)Elt >= SingleVecNumElements)
12224 return false;
12225 if (BlockElts[I] < 0)
12226 BlockElts[I] = Elt;
12227 else if (BlockElts[I] != Elt)
12228 return false;
12229 }
12230
12231 // We found a candidate block (possibly with some undefs). It must be a
12232 // sequence of consecutive integers starting with a value divisible by
12233 // NumEltsPerBlock with some values possibly replaced by undef-s.
12234
12235 // Find first non-undef element
12236 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
12237 assert(FirstRealEltIter != BlockElts.end() &&
12238 "Shuffle with all-undefs must have been caught by previous cases, "
12239 "e.g. isSplat()");
12240 if (FirstRealEltIter == BlockElts.end()) {
12241 DupLaneOp = 0;
12242 return true;
12243 }
12244
12245 // Index of FirstRealElt in BlockElts
12246 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
12247
12248 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
12249 return false;
12250 // BlockElts[0] must have the following value if it isn't undef:
12251 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
12252
12253 // Check the first element
12254 if (Elt0 % NumEltsPerBlock != 0)
12255 return false;
12256 // Check that the sequence indeed consists of consecutive integers (modulo
12257 // undefs)
12258 for (size_t I = 0; I < NumEltsPerBlock; I++)
12259 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
12260 return false;
12261
12262 DupLaneOp = Elt0 / NumEltsPerBlock;
12263 return true;
12264}
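A short worked example of the block arithmetic above (standalone C++, illustrative only, not part of this file): for a v8i16 shuffle the mask [4,5,6,7,4,5,6,7] repeats one 64-bit block, so it can be lowered as a wide DUP of lane 1.

#include <cassert>

int main() {
  const unsigned EltBits = 16, NumElts = 8, BlockSize = 64;
  const unsigned NumEltsPerBlock = BlockSize / EltBits;       // 4 lanes per block
  const unsigned NumBlocks = (NumElts * EltBits) / BlockSize; // 2 blocks
  const int Mask[8] = {4, 5, 6, 7, 4, 5, 6, 7};
  // Every block of the mask repeats lanes 4..7 of the first source.
  for (unsigned B = 0; B < NumBlocks; ++B)
    for (unsigned I = 0; I < NumEltsPerBlock; ++I)
      assert(Mask[B * NumEltsPerBlock + I] == static_cast<int>(4 + I));
  // The first lane of the repeated block (4) is block-aligned, so:
  const unsigned DupLaneOp = 4 / NumEltsPerBlock;
  assert(DupLaneOp == 1); // DUP of 64-bit lane 1
  return 0;
}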
12265
12266// check if an EXT instruction can handle the shuffle mask when the
12267// vector sources of the shuffle are different.
12268static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
12269 unsigned &Imm) {
12270 // Look for the first non-undef element.
12271 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
12272
12273 // Benefit from APInt to handle overflow when calculating expected element.
12274 unsigned NumElts = VT.getVectorNumElements();
12275 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
12276 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
12277 // The following shuffle indices must be the successive elements after the
12278 // first real element.
12279 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
12280 return Elt != ExpectedElt++ && Elt != -1;
12281 });
12282 if (FoundWrongElt)
12283 return false;
12284
12285 // The index of an EXT is the first element if it is not UNDEF.
12286 // Watch out for the beginning UNDEFs. The EXT index should be the expected
12287 // value of the first element. E.g.
12288 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
12289 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
12290 // ExpectedElt is the last mask index plus 1.
12291 Imm = ExpectedElt.getZExtValue();
12292
12293 // There are two different cases that require reversing the input vectors.
12294 // For example, for vector <4 x i32> we have the following cases,
12295 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
12296 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
12297 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
12298 // to reverse two input vectors.
12299 if (Imm < NumElts)
12300 ReverseEXT = true;
12301 else
12302 Imm -= NumElts;
12303
12304 return true;
12305}
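The immediate computation above, including its wrap-around treatment of leading undefs, can be reproduced with plain integer arithmetic. The standalone sketch below (hypothetical extImmediate helper, not part of this file) checks the two documented cases.

#include <cassert>
#include <vector>

// Toy model of the EXT-immediate computation above for a two-input shuffle of
// NumElts-wide vectors: walk past the first defined index, expecting
// consecutive values modulo 2*NumElts, and return the wrapped "start" index.
static bool extImmediate(const std::vector<int> &M, unsigned NumElts,
                         bool &ReverseEXT, unsigned &Imm) {
  unsigned i = 0;
  while (i < M.size() && M[i] < 0)
    ++i;
  if (i == M.size())
    return false;
  unsigned Expected = (static_cast<unsigned>(M[i]) + 1) % (2 * NumElts);
  for (++i; i < M.size(); ++i) {
    if (M[i] >= 0 && static_cast<unsigned>(M[i]) != Expected)
      return false;
    Expected = (Expected + 1) % (2 * NumElts);
  }
  ReverseEXT = false;
  Imm = Expected; // one past the last index == implied first index (mod 2N)
  if (Imm < NumElts)
    ReverseEXT = true; // start lies in the second input: swap the operands
  else
    Imm -= NumElts;
  return true;
}

int main() {
  bool Rev = false;
  unsigned Imm = 0;
  // <-1,-1,0,1> behaves like <6,7,0,1>: EXT of (V2,V1) starting at lane 2.
  assert(extImmediate({-1, -1, 0, 1}, 4, Rev, Imm) && Rev && Imm == 2);
  // <1,2,3,4> is an EXT of (V1,V2) starting at lane 1.
  assert(extImmediate({1, 2, 3, 4}, 4, Rev, Imm) && !Rev && Imm == 1);
  return 0;
}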
12306
12307/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
12308/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
12309/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
12310static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
12311 unsigned NumElts = VT.getVectorNumElements();
12312 if (NumElts % 2 != 0)
12313 return false;
12314 WhichResult = (M[0] == 0 ? 0 : 1);
12315 unsigned Idx = WhichResult * NumElts / 2;
12316 for (unsigned i = 0; i != NumElts; i += 2) {
12317 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
12318 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
12319 return false;
12320 Idx += 1;
12321 }
12322
12323 return true;
12324}
12325
12326/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
12327/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
12328 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
12329static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
12330 unsigned Half = VT.getVectorNumElements() / 2;
12331 WhichResult = (M[0] == 0 ? 0 : 1);
12332 for (unsigned j = 0; j != 2; ++j) {
12333 unsigned Idx = WhichResult;
12334 for (unsigned i = 0; i != Half; ++i) {
12335 int MIdx = M[i + j * Half];
12336 if (MIdx >= 0 && (unsigned)MIdx != Idx)
12337 return false;
12338 Idx += 2;
12339 }
12340 }
12341
12342 return true;
12343}
12344
12345/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
12346/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
12347/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
12348static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
12349 unsigned NumElts = VT.getVectorNumElements();
12350 if (NumElts % 2 != 0)
12351 return false;
12352 WhichResult = (M[0] == 0 ? 0 : 1);
12353 for (unsigned i = 0; i < NumElts; i += 2) {
12354 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
12355 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
12356 return false;
12357 }
12358 return true;
12359}
12360
12361static bool isINSMask(ArrayRef<int> M, int NumInputElements,
12362 bool &DstIsLeft, int &Anomaly) {
12363 if (M.size() != static_cast<size_t>(NumInputElements))
12364 return false;
12365
12366 int NumLHSMatch = 0, NumRHSMatch = 0;
12367 int LastLHSMismatch = -1, LastRHSMismatch = -1;
12368
12369 for (int i = 0; i < NumInputElements; ++i) {
12370 if (M[i] == -1) {
12371 ++NumLHSMatch;
12372 ++NumRHSMatch;
12373 continue;
12374 }
12375
12376 if (M[i] == i)
12377 ++NumLHSMatch;
12378 else
12379 LastLHSMismatch = i;
12380
12381 if (M[i] == i + NumInputElements)
12382 ++NumRHSMatch;
12383 else
12384 LastRHSMismatch = i;
12385 }
12386
12387 if (NumLHSMatch == NumInputElements - 1) {
12388 DstIsLeft = true;
12389 Anomaly = LastLHSMismatch;
12390 return true;
12391 } else if (NumRHSMatch == NumInputElements - 1) {
12392 DstIsLeft = false;
12393 Anomaly = LastRHSMismatch;
12394 return true;
12395 }
12396
12397 return false;
12398}
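For a concrete instance of the pattern above (standalone C++, illustrative only, not part of this file): in the mask <0,1,6,3> over two 4-element inputs, every lane but one is the identity of the left operand, and the single anomaly reads lane 2 of the right operand, so the shuffle is a single lane insert.

#include <cassert>
#include <vector>

int main() {
  const int N = 4;
  const std::vector<int> M = {0, 1, 6, 3};
  int LHSMatches = 0, Anomaly = -1;
  for (int i = 0; i < N; ++i) {
    if (M[i] == -1 || M[i] == i)
      ++LHSMatches; // identity (or don't-care) lane of the left operand
    else
      Anomaly = i;  // the one lane that comes from somewhere else
  }
  assert(LHSMatches == N - 1 && Anomaly == 2);
  // The anomalous lane reads element 6, i.e. lane 2 of the right operand.
  assert(M[Anomaly] - N == 2);
  return 0;
}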
12399
12400static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
12401 if (VT.getSizeInBits() != 128)
12402 return false;
12403
12404 unsigned NumElts = VT.getVectorNumElements();
12405
12406 for (int I = 0, E = NumElts / 2; I != E; I++) {
12407 if (Mask[I] != I)
12408 return false;
12409 }
12410
12411 int Offset = NumElts / 2;
12412 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
12413 if (Mask[I] != I + SplitLHS * Offset)
12414 return false;
12415 }
12416
12417 return true;
12418}
12419
12420 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
12421 SDLoc DL(Op);
12422 EVT VT = Op.getValueType();
12423 SDValue V0 = Op.getOperand(0);
12424 SDValue V1 = Op.getOperand(1);
12425 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12426
12427 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
12428 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
12429 return SDValue();
12430
12431 bool SplitV0 = V0.getValueSizeInBits() == 128;
12432
12433 if (!isConcatMask(Mask, VT, SplitV0))
12434 return SDValue();
12435
12436 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
12437 if (SplitV0) {
12438 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
12439 DAG.getConstant(0, DL, MVT::i64));
12440 }
12441 if (V1.getValueSizeInBits() == 128) {
12442 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
12443 DAG.getConstant(0, DL, MVT::i64));
12444 }
12445 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
12446}
12447
12448/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
12449/// the specified operations to build the shuffle. ID is the perfect-shuffle
12450/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
12451/// table entry and LHS/RHS are the immediate inputs for this stage of the
12452/// shuffle.
12453 static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
12454 SDValue V2, unsigned PFEntry, SDValue LHS,
12455 SDValue RHS, SelectionDAG &DAG,
12456 const SDLoc &dl) {
12457 unsigned OpNum = (PFEntry >> 26) & 0x0F;
12458 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
12459 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
12460
12461 enum {
12462 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
12463 OP_VREV,
12464 OP_VDUP0,
12465 OP_VDUP1,
12466 OP_VDUP2,
12467 OP_VDUP3,
12468 OP_VEXT1,
12469 OP_VEXT2,
12470 OP_VEXT3,
12471 OP_VUZPL, // VUZP, left result
12472 OP_VUZPR, // VUZP, right result
12473 OP_VZIPL, // VZIP, left result
12474 OP_VZIPR, // VZIP, right result
12475 OP_VTRNL, // VTRN, left result
12476 OP_VTRNR, // VTRN, right result
12477 OP_MOVLANE // Move lane. RHSID is the lane to move into
12478 };
12479
12480 if (OpNum == OP_COPY) {
12481 if (LHSID == (1 * 9 + 2) * 9 + 3)
12482 return LHS;
12483 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
12484 return RHS;
12485 }
12486
12487 if (OpNum == OP_MOVLANE) {
12488 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
12489 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
12490 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
12491 Elt = 3 - Elt;
12492 while (Elt > 0) {
12493 ID /= 9;
12494 Elt--;
12495 }
12496 return (ID % 9 == 8) ? -1 : ID % 9;
12497 };
12498
12499 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
12500 // get the lane to move from the PFID, which is always from the
12501 // original vectors (V1 or V2).
12502 SDValue OpLHS = GeneratePerfectShuffle(
12503 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
12504 EVT VT = OpLHS.getValueType();
12505 assert(RHSID < 8 && "Expected a lane index for RHSID!");
12506 unsigned ExtLane = 0;
12507 SDValue Input;
12508
12509 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
12510 // convert into a higher type.
12511 if (RHSID & 0x4) {
12512 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
12513 if (MaskElt == -1)
12514 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
12515 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12516 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
12517 Input = MaskElt < 2 ? V1 : V2;
12518 if (VT.getScalarSizeInBits() == 16) {
12519 Input = DAG.getBitcast(MVT::v2f32, Input);
12520 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
12521 } else {
12522 assert(VT.getScalarSizeInBits() == 32 &&
12523 "Expected 16 or 32 bit shuffle elemements");
12524 Input = DAG.getBitcast(MVT::v2f64, Input);
12525 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
12526 }
12527 } else {
12528 int MaskElt = getPFIDLane(ID, RHSID);
12529 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12530 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
12531 Input = MaskElt < 4 ? V1 : V2;
12532 // Be careful about creating illegal types. Use f16 instead of i16.
12533 if (VT == MVT::v4i16) {
12534 Input = DAG.getBitcast(MVT::v4f16, Input);
12535 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
12536 }
12537 }
12538 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12539 Input.getValueType().getVectorElementType(),
12540 Input, DAG.getVectorIdxConstant(ExtLane, dl));
12541 SDValue Ins =
12542 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
12543 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
12544 return DAG.getBitcast(VT, Ins);
12545 }
12546
12547 SDValue OpLHS, OpRHS;
12548 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
12549 RHS, DAG, dl);
12550 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
12551 RHS, DAG, dl);
12552 EVT VT = OpLHS.getValueType();
12553
12554 switch (OpNum) {
12555 default:
12556 llvm_unreachable("Unknown shuffle opcode!");
12557 case OP_VREV:
12558 // VREV divides the vector in half and swaps within the half.
12559 if (VT.getVectorElementType() == MVT::i32 ||
12560 VT.getVectorElementType() == MVT::f32)
12561 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
12562 // vrev <4 x i16> -> REV32
12563 if (VT.getVectorElementType() == MVT::i16 ||
12564 VT.getVectorElementType() == MVT::f16 ||
12565 VT.getVectorElementType() == MVT::bf16)
12566 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
12567 // vrev <4 x i8> -> REV16
12568 assert(VT.getVectorElementType() == MVT::i8);
12569 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
12570 case OP_VDUP0:
12571 case OP_VDUP1:
12572 case OP_VDUP2:
12573 case OP_VDUP3: {
12574 EVT EltTy = VT.getVectorElementType();
12575 unsigned Opcode;
12576 if (EltTy == MVT::i8)
12577 Opcode = AArch64ISD::DUPLANE8;
12578 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
12579 Opcode = AArch64ISD::DUPLANE16;
12580 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
12581 Opcode = AArch64ISD::DUPLANE32;
12582 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
12583 Opcode = AArch64ISD::DUPLANE64;
12584 else
12585 llvm_unreachable("Invalid vector element type?");
12586
12587 if (VT.getSizeInBits() == 64)
12588 OpLHS = WidenVector(OpLHS, DAG);
12589 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
12590 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
12591 }
12592 case OP_VEXT1:
12593 case OP_VEXT2:
12594 case OP_VEXT3: {
12595 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
12596 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
12597 DAG.getConstant(Imm, dl, MVT::i32));
12598 }
12599 case OP_VUZPL:
12600 return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
12601 case OP_VUZPR:
12602 return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
12603 case OP_VZIPL:
12604 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
12605 case OP_VZIPR:
12606 return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
12607 case OP_VTRNL:
12608 return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
12609 case OP_VTRNR:
12610 return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
12611 }
12612}
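The perfect-shuffle entries decoded above pack an opcode and two operand IDs into 32 bits, and each ID encodes a 4-lane mask in base 9 (digit 8 standing for undef). The standalone sketch below (hypothetical decodePFEntry/pfidLane helpers, not part of this file) mirrors that layout and the OP_COPY identity check.

#include <cassert>
#include <cstdint>

// Bit layout used above: opcode in bits [29:26], LHSID in [25:13], RHSID in [12:0].
struct PFEntryFields {
  uint32_t OpNum, LHSID, RHSID;
};

static PFEntryFields decodePFEntry(uint32_t PFEntry) {
  return {(PFEntry >> 26) & 0x0F, (PFEntry >> 13) & 0x1FFF, PFEntry & 0x1FFF};
}

// Digit `Elt` (0..3) of a base-9 encoded 4-lane mask; -1 for undef.
static int pfidLane(unsigned ID, int Elt) {
  for (int i = 3 - Elt; i > 0; --i)
    ID /= 9;
  return (ID % 9 == 8) ? -1 : static_cast<int>(ID % 9);
}

int main() {
  // The identity mask <0,1,2,3> is encoded as ((0*9+1)*9+2)*9+3 == 102,
  // which is the LHSID the OP_COPY case above compares against.
  const unsigned Identity = ((0 * 9 + 1) * 9 + 2) * 9 + 3;
  assert(pfidLane(Identity, 0) == 0 && pfidLane(Identity, 3) == 3);
  PFEntryFields F = decodePFEntry((0u << 26) | (Identity << 13) | 0u);
  assert(F.OpNum == 0 && F.LHSID == Identity && F.RHSID == 0);
  return 0;
}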
12613
12614 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
12615 SelectionDAG &DAG) {
12616 // Check to see if we can use the TBL instruction.
12617 SDValue V1 = Op.getOperand(0);
12618 SDValue V2 = Op.getOperand(1);
12619 SDLoc DL(Op);
12620
12621 EVT EltVT = Op.getValueType().getVectorElementType();
12622 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
12623
12624 bool Swap = false;
12625 if (V1.isUndef() || isZerosVector(V1.getNode())) {
12626 std::swap(V1, V2);
12627 Swap = true;
12628 }
12629
12630 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
12631 // out of range values with 0s. We do need to make sure that any out-of-range
12632 // values are really out-of-range for a v16i8 vector.
12633 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
12634 MVT IndexVT = MVT::v8i8;
12635 unsigned IndexLen = 8;
12636 if (Op.getValueSizeInBits() == 128) {
12637 IndexVT = MVT::v16i8;
12638 IndexLen = 16;
12639 }
12640
12641 SmallVector<SDValue, 8> TBLMask;
12642 for (int Val : ShuffleMask) {
12643 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
12644 unsigned Offset = Byte + Val * BytesPerElt;
12645 if (Swap)
12646 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
12647 if (IsUndefOrZero && Offset >= IndexLen)
12648 Offset = 255;
12649 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
12650 }
12651 }
12652
12653 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
12654 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
12655
12656 SDValue Shuffle;
12657 if (IsUndefOrZero) {
12658 if (IndexLen == 8)
12659 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
12660 Shuffle = DAG.getNode(
12661 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12662 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12663 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12664 } else {
12665 if (IndexLen == 8) {
12666 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
12667 Shuffle = DAG.getNode(
12668 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12669 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12670 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12671 } else {
12672 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
12673 // cannot currently represent the register constraints on the input
12674 // table registers.
12675 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
12676 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
12677 // IndexLen));
12678 Shuffle = DAG.getNode(
12679 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12680 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
12681 V2Cst,
12682 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12683 }
12684 }
12685 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
12686}
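The TBL mask expansion above turns each element-level index into BytesPerElt consecutive byte indices and sends out-of-range bytes to 255, which TBL defines to read as zero. A standalone sketch of that expansion (hypothetical expandToTBLMask helper, illustrative only, not part of this file):

#include <cassert>
#include <cstdint>
#include <vector>

// Expand an element shuffle mask into TBL byte indices, mapping anything past
// the usable table length to 255 when the second source is undef or zero.
static std::vector<uint8_t> expandToTBLMask(const std::vector<int> &Mask,
                                            unsigned BytesPerElt,
                                            unsigned IndexLen,
                                            bool SecondIsZeroOrUndef) {
  std::vector<uint8_t> Bytes;
  for (int Val : Mask)
    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
      unsigned Offset = Byte + static_cast<unsigned>(Val) * BytesPerElt;
      if (SecondIsZeroOrUndef && Offset >= IndexLen)
        Offset = 255; // TBL yields 0 for out-of-range indices
      Bytes.push_back(static_cast<uint8_t>(Offset));
    }
  return Bytes;
}

int main() {
  // v4i16 shuffle <0,3,4,1> with an undef second source and an 8-byte table:
  // lane 4 is out of range and becomes 255/255, i.e. zero after TBL1.
  std::vector<uint8_t> B = expandToTBLMask({0, 3, 4, 1}, 2, 8, true);
  std::vector<uint8_t> Expected = {0, 1, 6, 7, 255, 255, 2, 3};
  assert(B == Expected);
  return 0;
}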
12687
12688static unsigned getDUPLANEOp(EVT EltType) {
12689 if (EltType == MVT::i8)
12690 return AArch64ISD::DUPLANE8;
12691 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
12692 return AArch64ISD::DUPLANE16;
12693 if (EltType == MVT::i32 || EltType == MVT::f32)
12694 return AArch64ISD::DUPLANE32;
12695 if (EltType == MVT::i64 || EltType == MVT::f64)
12696 return AArch64ISD::DUPLANE64;
12697
12698 llvm_unreachable("Invalid vector element type?");
12699}
12700
12701static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
12702 unsigned Opcode, SelectionDAG &DAG) {
12703 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
12704 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
12705 // Match: dup (bitcast (extract_subv X, C)), LaneC
12706 if (BitCast.getOpcode() != ISD::BITCAST ||
12707 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
12708 return false;
12709
12710 // The extract index must align in the destination type. That may not
12711 // happen if the bitcast is from narrow to wide type.
12712 SDValue Extract = BitCast.getOperand(0);
12713 unsigned ExtIdx = Extract.getConstantOperandVal(1);
12714 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
12715 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
12716 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
12717 if (ExtIdxInBits % CastedEltBitWidth != 0)
12718 return false;
12719
12720 // Can't handle cases where vector size is not 128-bit
12721 if (!Extract.getOperand(0).getValueType().is128BitVector())
12722 return false;
12723
12724 // Update the lane value by offsetting with the scaled extract index.
12725 LaneC += ExtIdxInBits / CastedEltBitWidth;
12726
12727 // Determine the casted vector type of the wide vector input.
12728 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
12729 // Examples:
12730 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
12731 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
12732 unsigned SrcVecNumElts =
12733 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
12734 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
12735 SrcVecNumElts);
12736 return true;
12737 };
12738 MVT CastVT;
12739 if (getScaledOffsetDup(V, Lane, CastVT)) {
12740 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
12741 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12742 V.getOperand(0).getValueType().is128BitVector()) {
12743 // The lane is incremented by the index of the extract.
12744 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
12745 Lane += V.getConstantOperandVal(1);
12746 V = V.getOperand(0);
12747 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
12748 // The lane is decremented if we are splatting from the 2nd operand.
12749 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
12750 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
12751 Lane -= Idx * VT.getVectorNumElements() / 2;
12752 V = WidenVector(V.getOperand(Idx), DAG);
12753 } else if (VT.getSizeInBits() == 64) {
12754 // Widen the operand to 128-bit register with undef.
12755 V = WidenVector(V, DAG);
12756 }
12757 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
12758}
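The lane re-basing above can be checked by hand. The standalone snippet below (illustrative only, not part of this file) reproduces the second example from the comment inside getScaledOffsetDup: dup (bitcast (extract_subv v16i8 X, 8) to v4i16), lane 1 becomes a dup of lane 5 of the wide v8i16 vector.

#include <cassert>

int main() {
  const unsigned ExtIdx = 8, SrcEltBits = 8, CastedEltBits = 16;
  const unsigned ExtIdxInBits = ExtIdx * SrcEltBits;       // 64-bit offset
  assert(ExtIdxInBits % CastedEltBits == 0);               // lane-aligned offset
  const unsigned LaneC = 1 + ExtIdxInBits / CastedEltBits; // 1 + 4
  assert(LaneC == 5); // dup v8i16 X, lane 5
  return 0;
}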
12759
12760// Return true if we can get a new shuffle mask by checking the parameter mask
12761// array to test whether every two adjacent mask values are continuous and
12762// starting from an even number.
12763 static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
12764 SmallVectorImpl<int> &NewMask) {
12765 unsigned NumElts = VT.getVectorNumElements();
12766 if (NumElts % 2 != 0)
12767 return false;
12768
12769 NewMask.clear();
12770 for (unsigned i = 0; i < NumElts; i += 2) {
12771 int M0 = M[i];
12772 int M1 = M[i + 1];
12773
12774 // If both elements are undef, new mask is undef too.
12775 if (M0 == -1 && M1 == -1) {
12776 NewMask.push_back(-1);
12777 continue;
12778 }
12779
12780 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
12781 NewMask.push_back(M1 / 2);
12782 continue;
12783 }
12784
12785 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
12786 NewMask.push_back(M0 / 2);
12787 continue;
12788 }
12789
12790 NewMask.clear();
12791 return false;
12792 }
12793
12794 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
12795 return true;
12796}
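The pairing rule above is easy to model directly. The standalone sketch below (hypothetical widenMask helper, not part of this file) collapses each even-aligned pair (2k, 2k+1) to k and rejects misaligned pairs, matching the <6,7,2,3> to <3,1> example in the comment that follows.

#include <cassert>
#include <vector>

// Collapse adjacent mask pairs into a half-length mask for a double-width
// element type; -1 entries may stand in for either element of a pair.
static bool widenMask(const std::vector<int> &M, std::vector<int> &Wide) {
  Wide.clear();
  for (size_t i = 0; i + 1 < M.size(); i += 2) {
    int M0 = M[i], M1 = M[i + 1];
    if (M0 == -1 && M1 == -1)
      Wide.push_back(-1);
    else if (M0 == -1 && (M1 % 2) == 1)
      Wide.push_back(M1 / 2);
    else if (M0 != -1 && (M0 % 2) == 0 && (M1 == M0 + 1 || M1 == -1))
      Wide.push_back(M0 / 2);
    else
      return false;
  }
  return true;
}

int main() {
  std::vector<int> Wide;
  assert(widenMask({6, 7, 2, 3}, Wide) && Wide == std::vector<int>({3, 1}));
  assert(!widenMask({1, 2, 3, 4}, Wide)); // pairs are not even-aligned
  return 0;
}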
12797
12798// Try to widen element type to get a new mask value for a better permutation
12799// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
12800// UZP1/2, TRN1/2, REV, INS, etc.
12801// For example:
12802// shufflevector <4 x i32> %a, <4 x i32> %b,
12803// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
12804// is equivalent to:
12805// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
12806// Finally, we can get:
12807// mov v0.d[0], v1.d[1]
12808 static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
12809 SDLoc DL(Op);
12810 EVT VT = Op.getValueType();
12811 EVT ScalarVT = VT.getVectorElementType();
12812 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
12813 SDValue V0 = Op.getOperand(0);
12814 SDValue V1 = Op.getOperand(1);
12815 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12816
12817 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
12818 // We need to make sure the wider element type is legal. Thus, ElementSize
12819 // should be not larger than 32 bits, and i1 type should also be excluded.
12820 if (ElementSize > 32 || ElementSize == 1)
12821 return SDValue();
12822
12823 SmallVector<int, 8> NewMask;
12824 if (isWideTypeMask(Mask, VT, NewMask)) {
12825 MVT NewEltVT = VT.isFloatingPoint()
12826 ? MVT::getFloatingPointVT(ElementSize * 2)
12827 : MVT::getIntegerVT(ElementSize * 2);
12828 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
12829 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
12830 V0 = DAG.getBitcast(NewVT, V0);
12831 V1 = DAG.getBitcast(NewVT, V1);
12832 return DAG.getBitcast(VT,
12833 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
12834 }
12835 }
12836
12837 return SDValue();
12838}
12839
12840// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
12841 static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
12842 ArrayRef<int> ShuffleMask,
12843 SelectionDAG &DAG) {
12844 SDValue Tbl1 = Op->getOperand(0);
12845 SDValue Tbl2 = Op->getOperand(1);
12846 SDLoc dl(Op);
12847 SDValue Tbl2ID =
12848 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
12849
12850 EVT VT = Op.getValueType();
12851 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12852 Tbl1->getOperand(0) != Tbl2ID ||
12853 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12854 Tbl2->getOperand(0) != Tbl2ID)
12855 return SDValue();
12856
12857 if (Tbl1->getValueType(0) != MVT::v16i8 ||
12858 Tbl2->getValueType(0) != MVT::v16i8)
12859 return SDValue();
12860
12861 SDValue Mask1 = Tbl1->getOperand(3);
12862 SDValue Mask2 = Tbl2->getOperand(3);
12863 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
12864 for (unsigned I = 0; I < 16; I++) {
12865 if (ShuffleMask[I] < 16)
12866 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
12867 else {
12868 auto *C =
12869 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
12870 if (!C)
12871 return SDValue();
12872 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
12873 }
12874 }
12875
12876 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
12877 SDValue ID =
12878 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
12879
12880 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
12881 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
12882 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
12883}
12884
12885// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
12886// but we don't have an appropriate instruction,
12887// so custom-lower it as ZIP1-with-zeros.
12888SDValue
12889AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
12890 SelectionDAG &DAG) const {
12891 SDLoc dl(Op);
12892 EVT VT = Op.getValueType();
12893 SDValue SrcOp = Op.getOperand(0);
12894 EVT SrcVT = SrcOp.getValueType();
12895 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
12896 "Unexpected extension factor.");
12897 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
12898 // FIXME: support multi-step zipping?
12899 if (Scale != 2)
12900 return SDValue();
12901 SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
12902 return DAG.getBitcast(VT,
12903 DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
12904}
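Why ZIP1 with a zero vector implements an in-register zero-extension is easiest to see byte-wise on a little-endian layout. The standalone snippet below (host-side illustration that assumes a little-endian machine; not part of this file) interleaves a v8i8 with zeros and reinterprets the bytes as v8i16.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint8_t Src[8] = {1, 2, 3, 250, 5, 6, 7, 8};
  uint8_t Zipped[16];
  for (int i = 0; i < 8; ++i) {
    Zipped[2 * i] = Src[i]; // low byte: the original lane
    Zipped[2 * i + 1] = 0;  // high byte: the blended-in zero
  }
  uint16_t Wide[8];
  std::memcpy(Wide, Zipped, sizeof(Wide)); // reinterpret as v8i16 lanes
  for (int i = 0; i < 8; ++i)
    assert(Wide[i] == Src[i]); // each lane was zero-extended
  return 0;
}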
12905
12906SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
12907 SelectionDAG &DAG) const {
12908 SDLoc dl(Op);
12909 EVT VT = Op.getValueType();
12910
12911 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
12912
12913 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12914 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
12915
12916 // Convert shuffles that are directly supported on NEON to target-specific
12917 // DAG nodes, instead of keeping them as shuffles and matching them again
12918 // during code selection. This is more efficient and avoids the possibility
12919 // of inconsistencies between legalization and selection.
12920 ArrayRef<int> ShuffleMask = SVN->getMask();
12921
12922 SDValue V1 = Op.getOperand(0);
12923 SDValue V2 = Op.getOperand(1);
12924
12925 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
12926 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
12927 "Unexpected VECTOR_SHUFFLE mask size!");
12928
12929 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
12930 return Res;
12931
12932 if (SVN->isSplat()) {
12933 int Lane = SVN->getSplatIndex();
12934 // If this is undef splat, generate it via "just" vdup, if possible.
12935 if (Lane == -1)
12936 Lane = 0;
12937
12938 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
12939 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
12940 V1.getOperand(0));
12941 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
12942 // constant. If so, we can just reference the lane's definition directly.
12943 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
12944 !isa<ConstantSDNode>(V1.getOperand(Lane)))
12945 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
12946
12947 // Otherwise, duplicate from the lane of the input vector.
12948 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
12949 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
12950 }
12951
12952 // Check if the mask matches a DUP for a wider element
12953 for (unsigned LaneSize : {64U, 32U, 16U}) {
12954 unsigned Lane = 0;
12955 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
12956 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
12957 : LaneSize == 32 ? AArch64ISD::DUPLANE32
12958 : AArch64ISD::DUPLANE16;
12959 // Cast V1 to an integer vector with required lane size
12960 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
12961 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
12962 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
12963 V1 = DAG.getBitcast(NewVecTy, V1);
12964 // Construct the DUP instruction
12965 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
12966 // Cast back to the original type
12967 return DAG.getBitcast(VT, V1);
12968 }
12969 }
12970
12971 unsigned NumElts = VT.getVectorNumElements();
12972 unsigned EltSize = VT.getScalarSizeInBits();
12973 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
12974 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
12975 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
12976 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
12977 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
12978 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
12979
12980 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
12981 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
12982 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
12983 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
12984 DAG.getConstant(8, dl, MVT::i32));
12985 }
12986
12987 bool ReverseEXT = false;
12988 unsigned Imm;
12989 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
12990 if (ReverseEXT)
12991 std::swap(V1, V2);
12992 Imm *= getExtFactor(V1);
12993 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
12994 DAG.getConstant(Imm, dl, MVT::i32));
12995 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
12996 Imm *= getExtFactor(V1);
12997 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
12998 DAG.getConstant(Imm, dl, MVT::i32));
12999 }
13000
13001 unsigned WhichResult;
13002 if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
13003 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
13004 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13005 }
13006 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
13007 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
13008 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13009 }
13010 if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
13011 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
13012 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13013 }
13014
13015 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13016 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
13017 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13018 }
13019 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13020 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
13021 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13022 }
13023 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13024 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
13025 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13026 }
13027
13028 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
13029 return Concat;
13030
13031 bool DstIsLeft;
13032 int Anomaly;
13033 int NumInputElements = V1.getValueType().getVectorNumElements();
13034 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
13035 SDValue DstVec = DstIsLeft ? V1 : V2;
13036 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
13037
13038 SDValue SrcVec = V1;
13039 int SrcLane = ShuffleMask[Anomaly];
13040 if (SrcLane >= NumInputElements) {
13041 SrcVec = V2;
13042 SrcLane -= NumElts;
13043 }
13044 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
13045
13046 EVT ScalarVT = VT.getVectorElementType();
13047
13048 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
13049 ScalarVT = MVT::i32;
13050
13051 return DAG.getNode(
13052 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
13053 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
13054 DstLaneV);
13055 }
13056
13057 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
13058 return NewSD;
13059
13060 // If the shuffle is not directly supported and it has 4 elements, use
13061 // the PerfectShuffle-generated table to synthesize it from other shuffles.
13062 if (NumElts == 4) {
13063 unsigned PFIndexes[4];
13064 for (unsigned i = 0; i != 4; ++i) {
13065 if (ShuffleMask[i] < 0)
13066 PFIndexes[i] = 8;
13067 else
13068 PFIndexes[i] = ShuffleMask[i];
13069 }
13070
13071 // Compute the index in the perfect shuffle table.
13072 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
13073 PFIndexes[2] * 9 + PFIndexes[3];
13074 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
13075 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
13076 dl);
13077 }
13078
13079 return GenerateTBL(Op, ShuffleMask, DAG);
13080}
13081
13082SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
13083 SelectionDAG &DAG) const {
13084 EVT VT = Op.getValueType();
13085
13086 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13087 return LowerToScalableOp(Op, DAG);
13088
13089 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
13090 "Unexpected vector type!");
13091
13092 // We can handle the constant cases during isel.
13093 if (isa<ConstantSDNode>(Op.getOperand(0)))
13094 return Op;
13095
13096 // There isn't a natural way to handle the general i1 case, so we use some
13097 // trickery with whilelo.
13098 SDLoc DL(Op);
13099 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
13100 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
13101 DAG.getValueType(MVT::i1));
13102 SDValue ID =
13103 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
13104 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
13105 if (VT == MVT::nxv1i1)
13106 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
13107 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
13108 Zero, SplatVal),
13109 Zero);
13110 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
13111}
13112
13113SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
13114 SelectionDAG &DAG) const {
13115 SDLoc DL(Op);
13116
13117 EVT VT = Op.getValueType();
13118 if (!isTypeLegal(VT) || !VT.isScalableVector())
13119 return SDValue();
13120
13121 // Current lowering only supports the SVE-ACLE types.
13122 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
13123 return SDValue();
13124
13125 // The DUPQ operation is independent of element type, so normalise to i64s.
13126 SDValue Idx128 = Op.getOperand(2);
13127
13128 // DUPQ can be used when idx is in range.
13129 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
13130 if (CIdx && (CIdx->getZExtValue() <= 3)) {
13131 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
13132 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
13133 }
13134
13135 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
13136
13137 // The ACLE says this must produce the same result as:
13138 // svtbl(data, svadd_x(svptrue_b64(),
13139 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
13140 // index * 2))
13141 SDValue One = DAG.getConstant(1, DL, MVT::i64);
13142 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
13143
13144 // create the vector 0,1,0,1,...
13145 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
13146 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
13147
13148 // create the vector idx64,idx64+1,idx64,idx64+1,...
13149 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
13150 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
13151 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
13152
13153 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
13154 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
13155 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
13156}
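For the TBL fallback above, the index vector works out to (step & 1) + 2 * idx128, i.e. the two 64-bit lane numbers of the selected quadword repeated across the vector. A standalone worked example for idx128 == 1 (illustrative only, not part of this file):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Idx128 = 1;
  uint64_t Indices[4];
  for (uint64_t I = 0; I < 4; ++I)
    Indices[I] = (I & 1) + 2 * Idx128; // 2,3,2,3: lanes of quadword 1
  assert(Indices[0] == 2 && Indices[1] == 3 &&
         Indices[2] == 2 && Indices[3] == 3);
  return 0;
}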
13157
13158
13159static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
13160 APInt &UndefBits) {
13161 EVT VT = BVN->getValueType(0);
13162 APInt SplatBits, SplatUndef;
13163 unsigned SplatBitSize;
13164 bool HasAnyUndefs;
13165 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
13166 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
13167
13168 for (unsigned i = 0; i < NumSplats; ++i) {
13169 CnstBits <<= SplatBitSize;
13170 UndefBits <<= SplatBitSize;
13171 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
13172 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
13173 }
13174
13175 return true;
13176 }
13177
13178 return false;
13179}
13180
13181// Try 64-bit splatted SIMD immediate.
13182static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13183 const APInt &Bits) {
13184 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
13185 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
13186 EVT VT = Op.getValueType();
13187 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
13188
13188
13189 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
13190 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
13191
13192 SDLoc dl(Op);
13193 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
13194 DAG.getConstant(Value, dl, MVT::i32));
13195 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13196 }
13197 }
13198
13199 return SDValue();
13200}
13201
13202// Try 32-bit splatted SIMD immediate.
13203static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13204 const APInt &Bits,
13205 const SDValue *LHS = nullptr) {
13206 EVT VT = Op.getValueType();
13207 if (VT.isFixedLengthVector() &&
13208 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
13209 return SDValue();
13210
13211 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
13212 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
13213 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
13214 bool isAdvSIMDModImm = false;
13215 uint64_t Shift;
13216
13217 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
13218 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
13219 Shift = 0;
13220 }
13221 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
13222 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
13223 Shift = 8;
13224 }
13225 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
13226 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
13227 Shift = 16;
13228 }
13229 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
13230 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
13231 Shift = 24;
13232 }
13233
13234 if (isAdvSIMDModImm) {
13235 SDLoc dl(Op);
13236 SDValue Mov;
13237
13238 if (LHS)
13239 Mov = DAG.getNode(NewOp, dl, MovTy,
13240 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
13241 DAG.getConstant(Value, dl, MVT::i32),
13242 DAG.getConstant(Shift, dl, MVT::i32));
13243 else
13244 Mov = DAG.getNode(NewOp, dl, MovTy,
13245 DAG.getConstant(Value, dl, MVT::i32),
13246 DAG.getConstant(Shift, dl, MVT::i32));
13247
13248 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13249 }
13250 }
13251
13252 return SDValue();
13253}
13254
13255// Try 16-bit splatted SIMD immediate.
13256static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13257 const APInt &Bits,
13258 const SDValue *LHS = nullptr) {
13259 EVT VT = Op.getValueType();
13260 if (VT.isFixedLengthVector() &&
13261 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
13262 return SDValue();
13263
13264 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
13265 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
13266 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
13267 bool isAdvSIMDModImm = false;
13268 uint64_t Shift;
13269
13270 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
13271 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
13272 Shift = 0;
13273 }
13274 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
13275 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
13276 Shift = 8;
13277 }
13278
13279 if (isAdvSIMDModImm) {
13280 SDLoc dl(Op);
13281 SDValue Mov;
13282
13283 if (LHS)
13284 Mov = DAG.getNode(NewOp, dl, MovTy,
13285 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
13286 DAG.getConstant(Value, dl, MVT::i32),
13287 DAG.getConstant(Shift, dl, MVT::i32));
13288 else
13289 Mov = DAG.getNode(NewOp, dl, MovTy,
13290 DAG.getConstant(Value, dl, MVT::i32),
13291 DAG.getConstant(Shift, dl, MVT::i32));
13292
13293 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13294 }
13295 }
13296
13297 return SDValue();
13298}
13299
13300// Try 32-bit splatted SIMD immediate with shifted ones.
13301 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
13302 SelectionDAG &DAG, const APInt &Bits) {
13303 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
13304 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
13305 EVT VT = Op.getValueType();
13306 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
13307 bool isAdvSIMDModImm = false;
13308 uint64_t Shift;
13309
13310 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
13311 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
13312 Shift = 264;
13313 }
13314 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
13315 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
13316 Shift = 272;
13317 }
13318
13319 if (isAdvSIMDModImm) {
13320 SDLoc dl(Op);
13321 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
13322 DAG.getConstant(Value, dl, MVT::i32),
13323 DAG.getConstant(Shift, dl, MVT::i32));
13324 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13325 }
13326 }
13327
13328 return SDValue();
13329}
13330
13331// Try 8-bit splatted SIMD immediate.
13332static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13333 const APInt &Bits) {
13334 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
13335 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
13336 EVT VT = Op.getValueType();
13337 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
13338
13339 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
13340 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
13341
13342 SDLoc dl(Op);
13343 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
13344 DAG.getConstant(Value, dl, MVT::i32));
13345 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13346 }
13347 }
13348
13349 return SDValue();
13350}
13351
13352// Try FP splatted SIMD immediate.
13353static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13354 const APInt &Bits) {
13355 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
13356 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
13357 EVT VT = Op.getValueType();
13358 bool isWide = (VT.getSizeInBits() == 128);
13359 MVT MovTy;
13360 bool isAdvSIMDModImm = false;
13361
13362 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
13363 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
13364 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
13365 }
13366 else if (isWide &&
13367 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
13368 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
13369 MovTy = MVT::v2f64;
13370 }
13371
13372 if (isAdvSIMDModImm) {
13373 SDLoc dl(Op);
13374 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
13375 DAG.getConstant(Value, dl, MVT::i32));
13376 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13377 }
13378 }
13379
13380 return SDValue();
13381}
13382
13383// Specialized code to quickly find if PotentialBVec is a BuildVector that
13384// consists of only the same constant int value, returned in reference arg
13385// ConstVal
13386static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
13387 uint64_t &ConstVal) {
13388 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
13389 if (!Bvec)
13390 return false;
13391 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
13392 if (!FirstElt)
13393 return false;
13394 EVT VT = Bvec->getValueType(0);
13395 unsigned NumElts = VT.getVectorNumElements();
13396 for (unsigned i = 1; i < NumElts; ++i)
13397 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
13398 return false;
13399 ConstVal = FirstElt->getZExtValue();
13400 return true;
13401}
13402
13403 static bool isAllInactivePredicate(SDValue N) {
13404 // Look through cast.
13405 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
13406 N = N.getOperand(0);
13407
13408 return ISD::isConstantSplatVectorAllZeros(N.getNode());
13409}
13410
13411 static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
13412 unsigned NumElts = N.getValueType().getVectorMinNumElements();
13413
13414 // Look through cast.
13415 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
13416 N = N.getOperand(0);
13417 // When reinterpreting from a type with fewer elements the "new" elements
13418 // are not active, so bail if they're likely to be used.
13419 if (N.getValueType().getVectorMinNumElements() < NumElts)
13420 return false;
13421 }
13422
13423 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
13424 return true;
13425
13426 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
13427 // or smaller than the implicit element type represented by N.
13428 // NOTE: A larger element count implies a smaller element type.
13429 if (N.getOpcode() == AArch64ISD::PTRUE &&
13430 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
13431 return N.getValueType().getVectorMinNumElements() >= NumElts;
13432
13433 // If we're compiling for a specific vector-length, we can check if the
13434 // pattern's VL equals that of the scalable vector at runtime.
13435 if (N.getOpcode() == AArch64ISD::PTRUE) {
13436 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
13437 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
13438 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
13439 if (MaxSVESize && MinSVESize == MaxSVESize) {
13440 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
13441 unsigned PatNumElts =
13442 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
13443 return PatNumElts == (NumElts * VScale);
13444 }
13445 }
13446
13447 return false;
13448}
13449
13450// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
13451// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
13452// BUILD_VECTORs with constant element C1, C2 is a constant, and:
13453// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
13454// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
13455// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
13456 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
13457 EVT VT = N->getValueType(0);
13458
13459 if (!VT.isVector())
13460 return SDValue();
13461
13462 SDLoc DL(N);
13463
13464 SDValue And;
13465 SDValue Shift;
13466
13467 SDValue FirstOp = N->getOperand(0);
13468 unsigned FirstOpc = FirstOp.getOpcode();
13469 SDValue SecondOp = N->getOperand(1);
13470 unsigned SecondOpc = SecondOp.getOpcode();
13471
13472 // Is one of the operands an AND or a BICi? The AND may have been optimised to
13473 // a BICi in order to use an immediate instead of a register.
13474 // Is the other operand a shl or lshr? This will have been turned into:
13475 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
13476 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
13477 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
13478 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
13479 SecondOpc == AArch64ISD::SHL_PRED ||
13480 SecondOpc == AArch64ISD::SRL_PRED)) {
13481 And = FirstOp;
13482 Shift = SecondOp;
13483
13484 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
13485 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
13486 FirstOpc == AArch64ISD::SHL_PRED ||
13487 FirstOpc == AArch64ISD::SRL_PRED)) {
13488 And = SecondOp;
13489 Shift = FirstOp;
13490 } else
13491 return SDValue();
13492
13493 bool IsAnd = And.getOpcode() == ISD::AND;
13494 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
13495 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13496 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
13497 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13498
13499 // Is the shift amount constant and are all lanes active?
13500 uint64_t C2;
13501 if (ShiftHasPredOp) {
13502 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
13503 return SDValue();
13504 APInt C;
13505 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
13506 return SDValue();
13507 C2 = C.getZExtValue();
13508 } else if (ConstantSDNode *C2node =
13509 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
13510 C2 = C2node->getZExtValue();
13511 else
13512 return SDValue();
13513
13514 APInt C1AsAPInt;
13515 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
13516 if (IsAnd) {
13517 // Is the and mask vector all constant?
13518 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
13519 return SDValue();
13520 } else {
13521 // Reconstruct the corresponding AND immediate from the two BICi immediates.
13522 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
13523 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
13524 assert(C1nodeImm && C1nodeShift);
13525 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
13526 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
13527 }
13528
13529 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
13530 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
13531 // how much one can shift elements of a particular size?
13532 if (C2 > ElemSizeInBits)
13533 return SDValue();
13534
13535 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
13536 : APInt::getLowBitsSet(ElemSizeInBits, C2);
13537 if (C1AsAPInt != RequiredC1)
13538 return SDValue();
13539
13540 SDValue X = And.getOperand(0);
13541 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
13542 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
13543 : Shift.getOperand(1);
13544
13545 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
13546 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
13547
13548 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
13549 LLVM_DEBUG(N->dump(&DAG));
13550 LLVM_DEBUG(dbgs() << "into: \n");
13551 LLVM_DEBUG(ResultSLI->dump(&DAG));
13552
13553 ++NumShiftInserts;
13554 return ResultSLI;
13555}
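The C1/C2 relationship checked above is plain bit arithmetic: for the shift-left (SLI) case the AND mask must keep exactly the low C2 bits that the shifted value cannot touch. The standalone snippet below (hypothetical sliLane helper, not part of this file) verifies this for one 8-bit lane.

#include <cassert>
#include <cstdint>

// Shift-left-and-insert of Y into X for a single 8-bit lane.
static uint8_t sliLane(uint8_t X, uint8_t Y, unsigned C2) {
  return static_cast<uint8_t>((Y << C2) | (X & ~(0xFFu << C2)));
}

int main() {
  const unsigned C2 = 3;
  const uint8_t C1 = static_cast<uint8_t>(~(0xFFu << C2)); // low-bits mask 0x07
  const uint8_t X = 0xAB, Y = 0x15;
  // (or (and X, C1), (shl Y, C2)) is exactly the SLI result when C1 keeps only
  // the low C2 bits of X.
  const uint8_t OrForm = static_cast<uint8_t>((X & C1) | (Y << C2));
  assert(OrForm == sliLane(X, Y, C2));
  assert(C1 == 0x07);
  return 0;
}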
13556
13557SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
13558 SelectionDAG &DAG) const {
13559 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13560 !Subtarget->isNeonAvailable()))
13561 return LowerToScalableOp(Op, DAG);
13562
13563 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
13564 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
13565 return Res;
13566
13567 EVT VT = Op.getValueType();
13568 if (VT.isScalableVector())
13569 return Op;
13570
13571 SDValue LHS = Op.getOperand(0);
13572 BuildVectorSDNode *BVN =
13573 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
13574 if (!BVN) {
13575 // OR commutes, so try swapping the operands.
13576 LHS = Op.getOperand(1);
13577 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
13578 }
13579 if (!BVN)
13580 return Op;
13581
13582 APInt DefBits(VT.getSizeInBits(), 0);
13583 APInt UndefBits(VT.getSizeInBits(), 0);
13584 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13585 SDValue NewOp;
13586
13587 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13588 DefBits, &LHS)) ||
13589 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13590 DefBits, &LHS)))
13591 return NewOp;
13592
13593 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13594 UndefBits, &LHS)) ||
13595 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13596 UndefBits, &LHS)))
13597 return NewOp;
13598 }
13599
13600 // We can always fall back to a non-immediate OR.
13601 return Op;
13602}
13603
13604// Normalize the operands of BUILD_VECTOR. The value of constant operands will
13605// be truncated to fit element width.
13606 static SDValue NormalizeBuildVector(SDValue Op,
13607 SelectionDAG &DAG) {
13608 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13609 SDLoc dl(Op);
13610 EVT VT = Op.getValueType();
13611 EVT EltTy= VT.getVectorElementType();
13612
13613 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
13614 return Op;
13615
13616 SmallVector<SDValue, 16> Ops;
13617 for (SDValue Lane : Op->ops()) {
13618 // For integer vectors, type legalization would have promoted the
13619 // operands already. Otherwise, if Op is a floating-point splat
13620 // (with operands cast to integers), then the only possibilities
13621 // are constants and UNDEFs.
13622 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
13623 APInt LowBits(EltTy.getSizeInBits(),
13624 CstLane->getZExtValue());
13625 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
13626 } else if (Lane.getNode()->isUndef()) {
13627 Lane = DAG.getUNDEF(MVT::i32);
13628 } else {
13629 assert(Lane.getValueType() == MVT::i32 &&
13630 "Unexpected BUILD_VECTOR operand type");
13631 }
13632 Ops.push_back(Lane);
13633 }
13634 return DAG.getBuildVector(VT, dl, Ops);
13635}
13636
13637 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
13638 const AArch64Subtarget *ST) {
13639 EVT VT = Op.getValueType();
13640 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
13641 "Expected a legal NEON vector");
13642
13643 APInt DefBits(VT.getSizeInBits(), 0);
13644 APInt UndefBits(VT.getSizeInBits(), 0);
13645 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13646 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13647 auto TryMOVIWithBits = [&](APInt DefBits) {
13648 SDValue NewOp;
13649 if ((NewOp =
13650 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
13651 (NewOp =
13652 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13653 (NewOp =
13654 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
13655 (NewOp =
13656 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13657 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
13658 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
13659 return NewOp;
13660
13661 APInt NotDefBits = ~DefBits;
13662 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
13663 NotDefBits)) ||
13664 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
13665 NotDefBits)) ||
13666 (NewOp =
13667 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
13668 return NewOp;
13669 return SDValue();
13670 };
13671 if (SDValue R = TryMOVIWithBits(DefBits))
13672 return R;
13673 if (SDValue R = TryMOVIWithBits(UndefBits))
13674 return R;
13675
13676 // See if a fneg of the constant can be materialized with a MOVI, etc.
13677 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
13678 // FNegate each sub-element of the constant
13679 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
13680 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
13681 .zext(VT.getSizeInBits());
13682 APInt NegBits(VT.getSizeInBits(), 0);
13683 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
13684 for (unsigned i = 0; i < NumElts; i++)
13685 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
13686 NegBits = DefBits ^ NegBits;
13687
13688 // Try to create the new constants with MOVI, and if so generate a fneg
13689 // for it.
13690 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
13691 SDLoc DL(Op);
13692 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
13693 return DAG.getNode(
13694 AArch64ISD::NVCAST, DL, VT,
13695 DAG.getNode(ISD::FNEG, DL, VFVT,
13696 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
13697 }
13698 return SDValue();
13699 };
13700 SDValue R;
13701 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
13702 (R = TryWithFNeg(DefBits, MVT::f64)) ||
13703 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
13704 return R;
13705 }
13706
13707 return SDValue();
13708}
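// A hedged example of the FNEG path above (the chosen constant is an
// assumption, not verbatim codegen output): a v2f64 splat of -0.0 has
// DefBits of 0x8000000000000000 per lane, which no MOVI/MVNI form encodes
// directly. XOR-ing in the per-lane sign bits gives NegBits == 0, which MOVI
// can materialize, so the constant can be built roughly as
//   movi v0.2d, #0000000000000000
//   fneg v0.2d, v0.2d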
13709
13710SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
13711 SelectionDAG &DAG) const {
13712 EVT VT = Op.getValueType();
13713
13714 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
13715 if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
13716 SDLoc DL(Op);
13717 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
13718 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
13719 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
13720 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
13721 return convertFromScalableVector(DAG, Op.getValueType(), Seq);
13722 }
13723
13724 // Revert to common legalisation for all other variants.
13725 return SDValue();
13726 }
13727
13728 // Try to build a simple constant vector.
13729 Op = NormalizeBuildVector(Op, DAG);
13730 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
13731 // abort.
13732 if (Op.getOpcode() != ISD::BUILD_VECTOR)
13733 return SDValue();
13734
13735 // Certain vector constants, used to express things like logical NOT and
13736 // arithmetic NEG, are passed through unmodified. This allows special
13737 // patterns for these operations to match, which will lower these constants
13738 // to whatever is proven necessary.
13739 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13740 if (BVN->isConstant()) {
13741 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
13742 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
13743 APInt Val(BitSize,
13744 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
13745 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
13746 return Op;
13747 }
13748 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
13749 if (Const->isZero() && !Const->isNegative())
13750 return Op;
13751 }
13752
13753 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
13754 return V;
13755
13756 // Scan through the operands to find some interesting properties we can
13757 // exploit:
13758 // 1) If only one value is used, we can use a DUP, or
13759 // 2) if only the low element is not undef, we can just insert that, or
13760 // 3) if only one constant value is used (w/ some non-constant lanes),
13761 // we can splat the constant value into the whole vector then fill
13762 // in the non-constant lanes.
13763 // 4) FIXME: If different constant values are used, but we can intelligently
13764 // select the values we'll be overwriting for the non-constant
13765 // lanes such that we can directly materialize the vector
13766 // some other way (MOVI, e.g.), we can be sneaky.
13767 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
13768 SDLoc dl(Op);
13769 unsigned NumElts = VT.getVectorNumElements();
13770 bool isOnlyLowElement = true;
13771 bool usesOnlyOneValue = true;
13772 bool usesOnlyOneConstantValue = true;
13773 bool isConstant = true;
13774 bool AllLanesExtractElt = true;
13775 unsigned NumConstantLanes = 0;
13776 unsigned NumDifferentLanes = 0;
13777 unsigned NumUndefLanes = 0;
13778 SDValue Value;
13779 SDValue ConstantValue;
13780 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
13781 unsigned ConsecutiveValCount = 0;
13782 SDValue PrevVal;
13783 for (unsigned i = 0; i < NumElts; ++i) {
13784 SDValue V = Op.getOperand(i);
13785 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13786 AllLanesExtractElt = false;
13787 if (V.isUndef()) {
13788 ++NumUndefLanes;
13789 continue;
13790 }
13791 if (i > 0)
13792 isOnlyLowElement = false;
13793 if (!isIntOrFPConstant(V))
13794 isConstant = false;
13795
13796 if (isIntOrFPConstant(V)) {
13797 ++NumConstantLanes;
13798 if (!ConstantValue.getNode())
13799 ConstantValue = V;
13800 else if (ConstantValue != V)
13801 usesOnlyOneConstantValue = false;
13802 }
13803
13804 if (!Value.getNode())
13805 Value = V;
13806 else if (V != Value) {
13807 usesOnlyOneValue = false;
13808 ++NumDifferentLanes;
13809 }
13810
13811 if (PrevVal != V) {
13812 ConsecutiveValCount = 0;
13813 PrevVal = V;
13814 }
13815
13816 // Keep each different value and its last consecutive count. For example,
13817 //
13818 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13819 // t24, t24, t24, t24, t24, t24, t24, t24
13820 // t23 = consecutive count 8
13821 // t24 = consecutive count 8
13822 // ------------------------------------------------------------------
13823 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
13824 // t24, t24, t24, t24, t24, t24, t24, t24
13825 // t23 = consecutive count 5
13826 // t24 = consecutive count 9
13827 DifferentValueMap[V] = ++ConsecutiveValCount;
13828 }
13829
13830 if (!Value.getNode()) {
13831 LLVM_DEBUG(
13832 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
13833 return DAG.getUNDEF(VT);
13834 }
13835
13836 // Convert BUILD_VECTOR where all elements but the lowest are undef into
13837 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
13838 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
13839 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
13840 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
13841 "SCALAR_TO_VECTOR node\n");
13842 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
13843 }
13844
13845 if (AllLanesExtractElt) {
13846 SDNode *Vector = nullptr;
13847 bool Even = false;
13848 bool Odd = false;
13849 // Check whether the extract elements match the Even pattern <0,2,4,...> or
13850 // the Odd pattern <1,3,5,...>.
13851 for (unsigned i = 0; i < NumElts; ++i) {
13852 SDValue V = Op.getOperand(i);
13853 const SDNode *N = V.getNode();
13854 if (!isa<ConstantSDNode>(N->getOperand(1))) {
13855 Even = false;
13856 Odd = false;
13857 break;
13858 }
13859 SDValue N0 = N->getOperand(0);
13860
13861 // All elements are extracted from the same vector.
13862 if (!Vector) {
13863 Vector = N0.getNode();
13864 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
13865 // BUILD_VECTOR.
13866 if (VT.getVectorElementType() !=
13867 N0.getValueType().getVectorElementType())
13868 break;
13869 } else if (Vector != N0.getNode()) {
13870 Odd = false;
13871 Even = false;
13872 break;
13873 }
13874
13875 // Extracted values are either at Even indices <0,2,4,...> or at Odd
13876 // indices <1,3,5,...>.
13877 uint64_t Val = N->getConstantOperandVal(1);
13878 if (Val == 2 * i) {
13879 Even = true;
13880 continue;
13881 }
13882 if (Val - 1 == 2 * i) {
13883 Odd = true;
13884 continue;
13885 }
13886
13887 // Something does not match: abort.
13888 Odd = false;
13889 Even = false;
13890 break;
13891 }
13892 if (Even || Odd) {
13893 SDValue LHS =
13894 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13895 DAG.getConstant(0, dl, MVT::i64));
13896 SDValue RHS =
13897 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13898 DAG.getConstant(NumElts, dl, MVT::i64));
13899
13900 if (Even && !Odd)
13901 return DAG.getNode(AArch64ISD::UZP1, dl, VT, LHS, RHS);
13902 if (Odd && !Even)
13903 return DAG.getNode(AArch64ISD::UZP2, dl, VT, LHS, RHS);
13904 }
13905 }
13906
13907 // Use DUP for non-constant splats. For floating-point constant splats,
13908 // reduce to the equivalent integer type and try again.
13909 if (usesOnlyOneValue) {
13910 if (!isConstant) {
13911 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13912 Value.getValueType() != VT) {
13913 LLVM_DEBUG(
13914 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
13915 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
13916 }
13917
13918 // This is actually a DUPLANExx operation, which keeps everything in vector registers.
13919
13920 SDValue Lane = Value.getOperand(1);
13921 Value = Value.getOperand(0);
13922 if (Value.getValueSizeInBits() == 64) {
13923 LLVM_DEBUG(
13924 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
13925 "widening it\n");
13926 Value = WidenVector(Value, DAG);
13927 }
13928
13929 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
13930 return DAG.getNode(Opcode, dl, VT, Value, Lane);
13931 }
13932
13933 if (VT.getVectorElementType().isFloatingPoint()) {
13934 SmallVector<SDValue, 8> Ops;
13935 EVT EltTy = VT.getVectorElementType();
13936 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
13937 EltTy == MVT::f64) && "Unsupported floating-point vector type");
13938 LLVM_DEBUG(
13939 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
13940 "BITCASTS, and try again\n");
13941 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
13942 for (unsigned i = 0; i < NumElts; ++i)
13943 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
13944 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
13945 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
13946 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
13947 Val.dump(););
13948 Val = LowerBUILD_VECTOR(Val, DAG);
13949 if (Val.getNode())
13950 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
13951 }
13952 }
13953
13954 // If we need to insert a small number of different non-constant elements and
13955 // the vector width is sufficiently large, prefer using DUP with the common
13956 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
13957 // skip the constant lane handling below.
13958 bool PreferDUPAndInsert =
13959 !isConstant && NumDifferentLanes >= 1 &&
13960 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
13961 NumDifferentLanes >= NumConstantLanes;
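// A rough example of when this heuristic fires (the lane layout is an
// assumption): a v8i16 whose lanes are {X,X,Y,X,X,Z,X,X} with X, Y, Z all
// non-constant has NumDifferentLanes == 2 < (8 - 0) / 2, so it is later
// lowered as a DUP of X followed by two INSERT_VECTOR_ELTs, roughly
//   dup v0.8h, w0 ; mov v0.h[2], w1 ; mov v0.h[5], w2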
13962
13963 // If only one constant value was used, but across more than one lane,
13964 // start by splatting that value, then replace the non-constant lanes. This
13965 // is better than the default, which will perform a separate initialization
13966 // for each lane.
13967 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
13968 // Firstly, try to materialize the splat constant.
13969 SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue);
13970 unsigned BitSize = VT.getScalarSizeInBits();
13971 APInt ConstantValueAPInt(1, 0);
13972 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
13973 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
13974 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
13975 !ConstantValueAPInt.isAllOnes()) {
13976 Val = ConstantBuildVector(Val, DAG, Subtarget);
13977 if (!Val)
13978 // Otherwise, materialize the constant and splat it.
13979 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
13980 }
13981
13982 // Now insert the non-constant lanes.
13983 for (unsigned i = 0; i < NumElts; ++i) {
13984 SDValue V = Op.getOperand(i);
13985 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13986 if (!isIntOrFPConstant(V))
13987 // Note that type legalization likely mucked about with the VT of the
13988 // source operand, so we may have to convert it here before inserting.
13989 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
13990 }
13991 return Val;
13992 }
13993
13994 // This will generate a load from the constant pool.
13995 if (isConstant) {
13996 LLVM_DEBUG(
13997 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
13998 "expansion\n");
13999 return SDValue();
14000 }
14001
14002 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
14003 // v4i32s. This is really a truncate, which we can construct out of (legal)
14004 // concats and truncate nodes.
14005 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
14006 return M;
14007
14008 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
14009 if (NumElts >= 4) {
14010 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
14011 return Shuffle;
14012
14013 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
14014 return Shuffle;
14015 }
14016
14017 if (PreferDUPAndInsert) {
14018 // First, build a constant vector with the common element.
14019 SmallVector<SDValue, 8> Ops(NumElts, Value);
14020 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
14021 // Next, insert the elements that do not match the common value.
14022 for (unsigned I = 0; I < NumElts; ++I)
14023 if (Op.getOperand(I) != Value)
14024 NewVector =
14025 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
14026 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
14027
14028 return NewVector;
14029 }
14030
14031 // If vector consists of two different values, try to generate two DUPs and
14032 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
14033 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
14034 SmallVector<SDValue, 2> Vals;
14035 // Check whether each value's consecutive count is half the number of vector
14036 // elements. In this case, we can use CONCAT_VECTORS. For example,
14037 //
14038 // canUseVECTOR_CONCAT = true;
14039 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
14040 // t24, t24, t24, t24, t24, t24, t24, t24
14041 //
14042 // canUseVECTOR_CONCAT = false;
14043 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
14044 // t24, t24, t24, t24, t24, t24, t24, t24
14045 bool canUseVECTOR_CONCAT = true;
14046 for (auto Pair : DifferentValueMap) {
14047 // Check that the different values have the same run length, namely NumElts / 2.
14048 if (Pair.second != NumElts / 2)
14049 canUseVECTOR_CONCAT = false;
14050 Vals.push_back(Pair.first);
14051 }
14052
14053 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
14054 // CONCAT_VECTORs. For example,
14055 //
14056 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
14057 // t24, t24, t24, t24, t24, t24, t24, t24
14058 // ==>
14059 // t26: v8i8 = AArch64ISD::DUP t23
14060 // t28: v8i8 = AArch64ISD::DUP t24
14061 // t29: v16i8 = concat_vectors t26, t28
14062 if (canUseVECTOR_CONCAT) {
14063 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14064 if (isTypeLegal(SubVT) && SubVT.isVector() &&
14065 SubVT.getVectorNumElements() >= 2) {
14066 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
14067 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
14068 SDValue DUP1 =
14069 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
14070 SDValue DUP2 =
14071 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
14072 SDValue CONCAT_VECTORS =
14073 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
14074 return CONCAT_VECTORS;
14075 }
14076 }
14077
14078 // Let's try to generate VECTOR_SHUFFLE. For example,
14079 //
14080 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
14081 // ==>
14082 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
14083 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
14084 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
14085 if (NumElts >= 8) {
14086 SmallVector<int, 16> MaskVec;
14087 // Build mask for VECTOR_SHUFFLE.
14088 SDValue FirstLaneVal = Op.getOperand(0);
14089 for (unsigned i = 0; i < NumElts; ++i) {
14090 SDValue Val = Op.getOperand(i);
14091 if (FirstLaneVal == Val)
14092 MaskVec.push_back(i);
14093 else
14094 MaskVec.push_back(i + NumElts);
14095 }
14096
14097 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
14098 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
14099 SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1);
14100 SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2);
14101 SDValue VECTOR_SHUFFLE =
14102 DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec);
14103 return VECTOR_SHUFFLE;
14104 }
14105 }
14106
14107 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
14108 // know the default expansion would otherwise fall back on something even
14109 // worse. For a vector with one or two non-undef values, that's
14110 // scalar_to_vector for the elements followed by a shuffle (provided the
14111 // shuffle is valid for the target) and materialization element by element
14112 // on the stack followed by a load for everything else.
14113 if (!isConstant && !usesOnlyOneValue) {
14114 LLVM_DEBUG(
14115 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
14116 "of INSERT_VECTOR_ELT\n");
14117
14118 SDValue Vec = DAG.getUNDEF(VT);
14119 SDValue Op0 = Op.getOperand(0);
14120 unsigned i = 0;
14121
14122 // Use SCALAR_TO_VECTOR for lane zero to
14123 // a) Avoid a RMW dependency on the full vector register, and
14124 // b) Allow the register coalescer to fold away the copy if the
14125 // value is already in an S or D register, and we're forced to emit an
14126 // INSERT_SUBREG that we can't fold anywhere.
14127 //
14128 // We also allow types like i8 and i16 which are illegal scalar but legal
14129 // vector element types. After type-legalization the inserted value is
14130 // extended (i32) and it is safe to cast them to the vector type by ignoring
14131 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
14132 if (!Op0.isUndef()) {
14133 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
14134 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
14135 ++i;
14136 }
14137 LLVM_DEBUG(if (i < NumElts) dbgs()
14138 << "Creating nodes for the other vector elements:\n";);
14139 for (; i < NumElts; ++i) {
14140 SDValue V = Op.getOperand(i);
14141 if (V.isUndef())
14142 continue;
14143 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
14144 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
14145 }
14146 return Vec;
14147 }
14148
14149 LLVM_DEBUG(
14150 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
14151 "better alternative\n");
14152 return SDValue();
14153}
14154
14155SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
14156 SelectionDAG &DAG) const {
14157 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
14158 !Subtarget->isNeonAvailable()))
14159 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
14160
14161 assert(Op.getValueType().isScalableVector() &&
14162 isTypeLegal(Op.getValueType()) &&
14163 "Expected legal scalable vector type!");
14164
14165 if (isTypeLegal(Op.getOperand(0).getValueType())) {
14166 unsigned NumOperands = Op->getNumOperands();
14167 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
14168 "Unexpected number of operands in CONCAT_VECTORS");
14169
14170 if (NumOperands == 2)
14171 return Op;
14172
14173 // Concat each pair of subvectors and pack into the lower half of the array.
14174 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
14175 while (ConcatOps.size() > 1) {
14176 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
14177 SDValue V1 = ConcatOps[I];
14178 SDValue V2 = ConcatOps[I + 1];
14179 EVT SubVT = V1.getValueType();
14180 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
14181 ConcatOps[I / 2] =
14182 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
14183 }
14184 ConcatOps.resize(ConcatOps.size() / 2);
14185 }
14186 return ConcatOps[0];
14187 }
14188
14189 return SDValue();
14190}
14191
14192SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
14193 SelectionDAG &DAG) const {
14194 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
14195
14196 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
14197 !Subtarget->isNeonAvailable()))
14198 return LowerFixedLengthInsertVectorElt(Op, DAG);
14199
14200 EVT VT = Op.getOperand(0).getValueType();
14201
14202 if (VT.getScalarType() == MVT::i1) {
14203 EVT VectorVT = getPromotedVTForPredicate(VT);
14204 SDLoc DL(Op);
14205 SDValue ExtendedVector =
14206 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
14207 SDValue ExtendedValue =
14208 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
14209 VectorVT.getScalarType().getSizeInBits() < 32
14210 ? MVT::i32
14211 : VectorVT.getScalarType());
14212 ExtendedVector =
14213 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
14214 ExtendedValue, Op.getOperand(2));
14215 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
14216 }
14217
14218 // Check for non-constant or out of range lane.
14219 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
14220 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
14221 return SDValue();
14222
14223 return Op;
14224}
14225
14226SDValue
14227AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
14228 SelectionDAG &DAG) const {
14229 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
14230 EVT VT = Op.getOperand(0).getValueType();
14231
14232 if (VT.getScalarType() == MVT::i1) {
14233 // We can't directly extract from an SVE predicate; extend it first.
14234 // (This isn't the only possible lowering, but it's straightforward.)
14235 EVT VectorVT = getPromotedVTForPredicate(VT);
14236 SDLoc DL(Op);
14237 SDValue Extend =
14238 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
14239 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
14240 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
14241 Extend, Op.getOperand(1));
14242 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
14243 }
14244
14245 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14246 return LowerFixedLengthExtractVectorElt(Op, DAG);
14247
14248 // Check for non-constant or out of range lane.
14249 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
14250 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
14251 return SDValue();
14252
14253 // Insertion/extraction are legal for V128 types.
14254 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
14255 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
14256 VT == MVT::v8f16 || VT == MVT::v8bf16)
14257 return Op;
14258
14259 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
14260 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
14261 VT != MVT::v4bf16)
14262 return SDValue();
14263
14264 // For V64 types, we perform extraction by expanding the value
14265 // to a V128 type and performing the extraction on that.
14266 SDLoc DL(Op);
14267 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
14268 EVT WideTy = WideVec.getValueType();
14269
14270 EVT ExtrTy = WideTy.getVectorElementType();
14271 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
14272 ExtrTy = MVT::i32;
14273
14274 // For extractions, we just return the result directly.
14275 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
14276 Op.getOperand(1));
14277}
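// A minimal sketch of the V64 widening above (register choices are
// assumptions): extracting lane 1 of a v4i16 value widens the source to
// v8i16 and extracts an i32, which instruction selection then matches as
//   umov w0, v0.h[1]
// The extra width is harmless because only the low 16 bits are used.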
14278
14279SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
14280 SelectionDAG &DAG) const {
14281 EVT VT = Op.getValueType();
14283 "Only cases that extract a fixed length vector are supported!");
14284 EVT InVT = Op.getOperand(0).getValueType();
14285
14286 // If we don't have legal types yet, do nothing
14287 if (!isTypeLegal(InVT))
14288 return SDValue();
14289
14290 if (InVT.is128BitVector()) {
14291 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
14292 unsigned Idx = Op.getConstantOperandVal(1);
14293
14294 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
14295 if (Idx == 0)
14296 return Op;
14297
14298 // If this is extracting the upper 64-bits of a 128-bit vector, we match
14299 // that directly.
14300 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
14301 return Op;
14302 }
14303
14304 if (InVT.isScalableVector() ||
14305 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
14306 SDLoc DL(Op);
14307 SDValue Vec = Op.getOperand(0);
14308 SDValue Idx = Op.getOperand(1);
14309
14310 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
14311 if (PackedVT != InVT) {
14312 // Pack input into the bottom part of an SVE register and try again.
14313 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
14314 DAG.getUNDEF(PackedVT), Vec,
14315 DAG.getVectorIdxConstant(0, DL));
14316 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
14317 }
14318
14319 // This will get matched by custom code during ISelDAGToDAG.
14320 if (isNullConstant(Idx))
14321 return Op;
14322
14323 assert(InVT.isScalableVector() && "Unexpected vector type!");
14324 // Move requested subvector to the start of the vector and try again.
14325 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
14326 return convertFromScalableVector(DAG, VT, Splice);
14327 }
14328
14329 return SDValue();
14330}
14331
14332SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
14333 SelectionDAG &DAG) const {
14334 assert(Op.getValueType().isScalableVector() &&
14335 "Only expect to lower inserts into scalable vectors!");
14336
14337 EVT InVT = Op.getOperand(1).getValueType();
14338 unsigned Idx = Op.getConstantOperandVal(2);
14339
14340 SDValue Vec0 = Op.getOperand(0);
14341 SDValue Vec1 = Op.getOperand(1);
14342 SDLoc DL(Op);
14343 EVT VT = Op.getValueType();
14344
14345 if (InVT.isScalableVector()) {
14346 if (!isTypeLegal(VT))
14347 return SDValue();
14348
14349 // Break down insert_subvector into simpler parts.
14350 if (VT.getVectorElementType() == MVT::i1) {
14351 unsigned NumElts = VT.getVectorMinNumElements();
14352 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14353
14354 SDValue Lo, Hi;
14355 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
14356 DAG.getVectorIdxConstant(0, DL));
14357 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
14358 DAG.getVectorIdxConstant(NumElts / 2, DL));
14359 if (Idx < (NumElts / 2))
14360 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
14361 DAG.getVectorIdxConstant(Idx, DL));
14362 else
14363 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
14364 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
14365
14366 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
14367 }
14368
14369 // Ensure the subvector is half the size of the main vector.
14370 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
14371 return SDValue();
14372
14373 // Here narrow and wide refer to the vector element types. After "casting",
14374 // both vectors must have the same bit length, and so because the subvector
14375 // has fewer elements, those elements need to be bigger.
14376 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
14377 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
14378
14379 // NOP cast operands to the largest legal vector of the same element count.
14380 if (VT.isFloatingPoint()) {
14381 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
14382 Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
14383 } else {
14384 // Legal integer vectors are already their largest so Vec0 is fine as is.
14385 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
14386 }
14387
14388 // To replace the top/bottom half of vector V with vector SubV we widen the
14389 // preserved half of V, concatenate this to SubV (the order depending on the
14390 // half being replaced) and then narrow the result.
14391 SDValue Narrow;
14392 if (Idx == 0) {
14393 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
14394 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
14395 } else {
14397 "Invalid subvector index!");
14398 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
14399 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
14400 }
14401
14402 return getSVESafeBitCast(VT, Narrow, DAG);
14403 }
14404
14405 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
14406 // This will be matched by custom code during ISelDAGToDAG.
14407 if (Vec0.isUndef())
14408 return Op;
14409
14410 std::optional<unsigned> PredPattern =
14411 getSVEPredPatternFromNumElements(VT.getVectorNumElements());
14412 auto PredTy = VT.changeVectorElementType(MVT::i1);
14413 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
14414 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
14415 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
14416 }
14417
14418 return SDValue();
14419}
14420
14421static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
14422 if (Op.getOpcode() != AArch64ISD::DUP &&
14423 Op.getOpcode() != ISD::SPLAT_VECTOR &&
14424 Op.getOpcode() != ISD::BUILD_VECTOR)
14425 return false;
14426
14427 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
14428 !isAllConstantBuildVector(Op, SplatVal))
14429 return false;
14430
14431 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
14432 !isa<ConstantSDNode>(Op->getOperand(0)))
14433 return false;
14434
14435 SplatVal = Op->getConstantOperandVal(0);
14436 if (Op.getValueType().getVectorElementType() != MVT::i64)
14437 SplatVal = (int32_t)SplatVal;
14438
14439 Negated = false;
14440 if (isPowerOf2_64(SplatVal))
14441 return true;
14442
14443 Negated = true;
14444 if (isPowerOf2_64(-SplatVal)) {
14445 SplatVal = -SplatVal;
14446 return true;
14447 }
14448
14449 return false;
14450}
14451
14452SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
14453 EVT VT = Op.getValueType();
14454 SDLoc dl(Op);
14455
14456 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
14457 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
14458
14459 assert(VT.isScalableVector() && "Expected a scalable vector.");
14460
14461 bool Signed = Op.getOpcode() == ISD::SDIV;
14462 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
14463
14464 bool Negated;
14465 uint64_t SplatVal;
14466 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
14467 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
14468 SDValue Res =
14469 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
14470 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
14471 if (Negated)
14472 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
14473
14474 return Res;
14475 }
14476
14477 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
14478 return LowerToPredicatedOp(Op, DAG, PredOpcode);
14479
14480 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
14481 // operations, and truncate the result.
14482 EVT WidenedVT;
14483 if (VT == MVT::nxv16i8)
14484 WidenedVT = MVT::nxv8i16;
14485 else if (VT == MVT::nxv8i16)
14486 WidenedVT = MVT::nxv4i32;
14487 else
14488 llvm_unreachable("Unexpected Custom DIV operation");
14489
14490 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
14491 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
14492 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
14493 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
14494 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
14495 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
14496 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
14497 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
14498 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
14499}
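// A hedged example of the power-of-two path above (the IR shape and register
// names are assumptions): sdiv <vscale x 4 x i32> %x, splat(8) becomes a
// SRAD_MERGE_OP1 with shift amount 3 and is selected roughly as
//   ptrue p0.s
//   asrd  z0.s, p0/m, z0.s, #3
// Dividing by splat(-8) uses the same asrd followed by a negate of the result.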
14500
14501bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
14502 EVT VT, unsigned DefinedValues) const {
14503 if (!Subtarget->isNeonAvailable())
14504 return false;
14505 return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
14506}
14507
14508bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
14509 // Currently no fixed length shuffles that require SVE are legal.
14510 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14511 return false;
14512
14513 if (VT.getVectorNumElements() == 4 &&
14514 (VT.is128BitVector() || VT.is64BitVector())) {
14515 unsigned Cost = getPerfectShuffleCost(M);
14516 if (Cost <= 1)
14517 return true;
14518 }
14519
14520 bool DummyBool;
14521 int DummyInt;
14522 unsigned DummyUnsigned;
14523
14524 unsigned EltSize = VT.getScalarSizeInBits();
14525 unsigned NumElts = VT.getVectorNumElements();
14526 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
14527 isREVMask(M, EltSize, NumElts, 64) ||
14528 isREVMask(M, EltSize, NumElts, 32) ||
14529 isREVMask(M, EltSize, NumElts, 16) ||
14530 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
14531 isTRNMask(M, NumElts, DummyUnsigned) ||
14532 isUZPMask(M, NumElts, DummyUnsigned) ||
14533 isZIPMask(M, NumElts, DummyUnsigned) ||
14534 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
14535 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
14536 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
14537 isINSMask(M, NumElts, DummyBool, DummyInt) ||
14538 isConcatMask(M, VT, VT.getSizeInBits() == 128));
14539}
14540
14541bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
14542 EVT VT) const {
14543 // Just delegate to the generic legality, clear masks aren't special.
14544 return isShuffleMaskLegal(M, VT);
14545}
14546
14547/// getVShiftImm - Check if this is a valid build_vector for the immediate
14548/// operand of a vector shift operation, where all the elements of the
14549/// build_vector must have the same constant integer value.
14550static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
14551 // Ignore bit_converts.
14552 while (Op.getOpcode() == ISD::BITCAST)
14553 Op = Op.getOperand(0);
14554 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
14555 APInt SplatBits, SplatUndef;
14556 unsigned SplatBitSize;
14557 bool HasAnyUndefs;
14558 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
14559 HasAnyUndefs, ElementBits) ||
14560 SplatBitSize > ElementBits)
14561 return false;
14562 Cnt = SplatBits.getSExtValue();
14563 return true;
14564}
14565
14566/// isVShiftLImm - Check if this is a valid build_vector for the immediate
14567/// operand of a vector shift left operation. That value must be in the range:
14568/// 0 <= Value < ElementBits for a left shift; or
14569/// 0 <= Value <= ElementBits for a long left shift.
14570static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
14571 assert(VT.isVector() && "vector shift count is not a vector type");
14572 int64_t ElementBits = VT.getScalarSizeInBits();
14573 if (!getVShiftImm(Op, ElementBits, Cnt))
14574 return false;
14575 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
14576}
14577
14578/// isVShiftRImm - Check if this is a valid build_vector for the immediate
14579/// operand of a vector shift right operation. The value must be in the range:
14580 /// 1 <= Value <= ElementBits for a right shift, or 1 <= Value <= ElementBits/2 for a narrowing right shift.
14581static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
14582 assert(VT.isVector() && "vector shift count is not a vector type");
14583 int64_t ElementBits = VT.getScalarSizeInBits();
14584 if (!getVShiftImm(Op, ElementBits, Cnt))
14585 return false;
14586 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
14587}
14588
14589SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
14590 SelectionDAG &DAG) const {
14591 EVT VT = Op.getValueType();
14592
14593 if (VT.getScalarType() == MVT::i1) {
14594 // Lower i1 truncate to `(x & 1) != 0`.
14595 SDLoc dl(Op);
14596 EVT OpVT = Op.getOperand(0).getValueType();
14597 SDValue Zero = DAG.getConstant(0, dl, OpVT);
14598 SDValue One = DAG.getConstant(1, dl, OpVT);
14599 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
14600 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
14601 }
14602
14603 if (!VT.isVector() || VT.isScalableVector())
14604 return SDValue();
14605
14606 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14607 !Subtarget->isNeonAvailable()))
14608 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
14609
14610 return SDValue();
14611}
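// For the i1 case above, a small worked example (the source type is chosen
// only for illustration): truncating an i32 %x to i1 becomes
//   setcc (and %x, 1), 0, ne
// i.e. the boolean is taken from bit 0 rather than from a real truncate.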
14612
14613 // Check if we can lower this SRL to a rounding shift instruction. ResVT is
14614 // possibly a truncated type; it tells how many bits of the value are to be
14615 // used.
14616 static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
14617 SelectionDAG &DAG,
14618 unsigned &ShiftValue,
14619 SDValue &RShOperand) {
14620 if (Shift->getOpcode() != ISD::SRL)
14621 return false;
14622
14623 EVT VT = Shift.getValueType();
14624 assert(VT.isScalableVT());
14625
14626 auto ShiftOp1 =
14627 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
14628 if (!ShiftOp1)
14629 return false;
14630
14631 ShiftValue = ShiftOp1->getZExtValue();
14632 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
14633 return false;
14634
14635 SDValue Add = Shift->getOperand(0);
14636 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
14637 return false;
14638
14640 "ResVT must be truncated or same type as the shift.");
14641 // Check if an overflow can lead to incorrect results.
14642 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
14643 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
14644 return false;
14645
14646 auto AddOp1 =
14647 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
14648 if (!AddOp1)
14649 return false;
14650 uint64_t AddValue = AddOp1->getZExtValue();
14651 if (AddValue != 1ULL << (ShiftValue - 1))
14652 return false;
14653
14654 RShOperand = Add->getOperand(0);
14655 return true;
14656}
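// A sketch of the pattern this recognizes (the concrete constants are
// assumptions): with ResVT == nxv8i16,
//   srl (add X, splat(16)), splat(5)
// has rounding bias 16 == 1 << (5 - 1), so on SVE2 it can be selected as the
// rounding shift
//   urshr z0.h, p0/m, z0.h, #5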
14657
14658SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
14659 SelectionDAG &DAG) const {
14660 EVT VT = Op.getValueType();
14661 SDLoc DL(Op);
14662 int64_t Cnt;
14663
14664 if (!Op.getOperand(1).getValueType().isVector())
14665 return Op;
14666 unsigned EltSize = VT.getScalarSizeInBits();
14667
14668 switch (Op.getOpcode()) {
14669 case ISD::SHL:
14670 if (VT.isScalableVector() ||
14671 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14672 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
14673
14674 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
14675 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
14676 DAG.getConstant(Cnt, DL, MVT::i32));
14677 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14678 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
14679 MVT::i32),
14680 Op.getOperand(0), Op.getOperand(1));
14681 case ISD::SRA:
14682 case ISD::SRL:
14683 if (VT.isScalableVector() &&
14684 (Subtarget->hasSVE2() ||
14685 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
14686 SDValue RShOperand;
14687 unsigned ShiftValue;
14688 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
14689 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
14690 getPredicateForVector(DAG, DL, VT), RShOperand,
14691 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
14692 }
14693
14694 if (VT.isScalableVector() ||
14695 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
14696 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
14697 : AArch64ISD::SRL_PRED;
14698 return LowerToPredicatedOp(Op, DAG, Opc);
14699 }
14700
14701 // Right shift immediate
14702 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
14703 unsigned Opc =
14704 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
14705 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
14706 DAG.getConstant(Cnt, DL, MVT::i32), Op->getFlags());
14707 }
14708
14709 // Right shift by register. Note that there is no right-shift-by-register
14710 // instruction; instead, the shift-left-by-register instruction takes a
14711 // signed value, where negative amounts specify a right shift.
14712 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
14713 : Intrinsic::aarch64_neon_ushl;
14714 // Negate the shift amount.
14715 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
14716 Op.getOperand(1));
14717 SDValue NegShiftLeft =
14718 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14719 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
14720 NegShift);
14721 return NegShiftLeft;
14722 }
14723
14724 llvm_unreachable("unexpected shift opcode");
14725}
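// A rough example of the variable-shift fallback above (register names are
// assumptions): a v4i32 srl by a non-constant vector amount becomes
//   neg  v1.4s, v1.4s
//   ushl v0.4s, v0.4s, v1.4s
// since ushl with a negative per-lane amount performs the right shift.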
14726
14727 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
14728 AArch64CC::CondCode CC, bool NoNans, EVT VT,
14729 const SDLoc &dl, SelectionDAG &DAG) {
14730 EVT SrcVT = LHS.getValueType();
14731 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
14732 "function only supposed to emit natural comparisons");
14733
14734 APInt SplatValue;
14735 APInt SplatUndef;
14736 unsigned SplatBitSize = 0;
14737 bool HasAnyUndefs;
14738
14739 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
14740 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
14741 SplatBitSize, HasAnyUndefs);
14742
14743 bool IsZero = IsCnst && SplatValue == 0;
14744 bool IsOne =
14745 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
14746 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
14747
14748 if (SrcVT.getVectorElementType().isFloatingPoint()) {
14749 switch (CC) {
14750 default:
14751 return SDValue();
14752 case AArch64CC::NE: {
14753 SDValue Fcmeq;
14754 if (IsZero)
14755 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14756 else
14757 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14758 return DAG.getNOT(dl, Fcmeq, VT);
14759 }
14760 case AArch64CC::EQ:
14761 if (IsZero)
14762 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14763 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14764 case AArch64CC::GE:
14765 if (IsZero)
14766 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
14767 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
14768 case AArch64CC::GT:
14769 if (IsZero)
14770 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
14771 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
14772 case AArch64CC::LE:
14773 if (!NoNans)
14774 return SDValue();
14775 // If we ignore NaNs then we can use the LS implementation.
14776 [[fallthrough]];
14777 case AArch64CC::LS:
14778 if (IsZero)
14779 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
14780 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
14781 case AArch64CC::LT:
14782 if (!NoNans)
14783 return SDValue();
14784 // If we ignore NaNs then we can use the MI implementation.
14785 [[fallthrough]];
14786 case AArch64CC::MI:
14787 if (IsZero)
14788 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
14789 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
14790 }
14791 }
14792
14793 switch (CC) {
14794 default:
14795 return SDValue();
14796 case AArch64CC::NE: {
14797 SDValue Cmeq;
14798 if (IsZero)
14799 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14800 else
14801 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14802 return DAG.getNOT(dl, Cmeq, VT);
14803 }
14804 case AArch64CC::EQ:
14805 if (IsZero)
14806 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14807 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14808 case AArch64CC::GE:
14809 if (IsZero)
14810 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
14811 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
14812 case AArch64CC::GT:
14813 if (IsZero)
14814 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
14815 if (IsMinusOne)
14816 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS, RHS);
14817 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
14818 case AArch64CC::LE:
14819 if (IsZero)
14820 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14821 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
14822 case AArch64CC::LS:
14823 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
14824 case AArch64CC::LO:
14825 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
14826 case AArch64CC::LT:
14827 if (IsZero)
14828 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
14829 if (IsOne)
14830 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14831 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
14832 case AArch64CC::HI:
14833 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
14834 case AArch64CC::HS:
14835 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
14836 }
14837}
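// As a hedged illustration of the NE handling above: comparing a v4i32 value
// against zero with SETNE emits CMEQz followed by a NOT, which typically ends
// up as something like
//   cmeq v0.4s, v0.4s, #0
//   mvn  v0.16b, v0.16b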
14838
14839SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
14840 SelectionDAG &DAG) const {
14841 if (Op.getValueType().isScalableVector())
14842 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
14843
14844 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14845 !Subtarget->isNeonAvailable()))
14846 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
14847
14848 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
14849 SDValue LHS = Op.getOperand(0);
14850 SDValue RHS = Op.getOperand(1);
14851 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
14852 SDLoc dl(Op);
14853
14854 if (LHS.getValueType().getVectorElementType().isInteger()) {
14855 assert(LHS.getValueType() == RHS.getValueType());
14856 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
14857 SDValue Cmp =
14858 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
14859 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14860 }
14861
14862 // Lower isnan(x) | isnan(never-nan) to x != x.
14863 // Lower !isnan(x) & !isnan(never-nan) to x == x.
14864 if (CC == ISD::SETUO || CC == ISD::SETO) {
14865 bool OneNaN = false;
14866 if (LHS == RHS) {
14867 OneNaN = true;
14868 } else if (DAG.isKnownNeverNaN(RHS)) {
14869 OneNaN = true;
14870 RHS = LHS;
14871 } else if (DAG.isKnownNeverNaN(LHS)) {
14872 OneNaN = true;
14873 LHS = RHS;
14874 }
14875 if (OneNaN) {
14876 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
14877 }
14878 }
14879
14880 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
14881
14882 // Make v4f16 (only) fcmp operations utilise vector instructions
14883 // v8f16 support will be a little more complicated
14884 if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
14885 LHS.getValueType().getVectorElementType() == MVT::bf16) {
14886 if (LHS.getValueType().getVectorNumElements() == 4) {
14887 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
14888 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
14889 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
14890 DAG.ReplaceAllUsesWith(Op, NewSetcc);
14891 CmpVT = MVT::v4i32;
14892 } else
14893 return SDValue();
14894 }
14895
14896 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
14897 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
14898 LHS.getValueType().getVectorElementType() != MVT::f128);
14899
14900 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
14901 // clean. Some of them require two branches to implement.
14902 AArch64CC::CondCode CC1, CC2;
14903 bool ShouldInvert;
14904 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
14905
14906 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
14907 SDValue Cmp =
14908 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
14909 if (!Cmp.getNode())
14910 return SDValue();
14911
14912 if (CC2 != AArch64CC::AL) {
14913 SDValue Cmp2 =
14914 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
14915 if (!Cmp2.getNode())
14916 return SDValue();
14917
14918 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
14919 }
14920
14921 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14922
14923 if (ShouldInvert)
14924 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
14925
14926 return Cmp;
14927}
14928
14929static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
14930 SelectionDAG &DAG) {
14931 SDValue VecOp = ScalarOp.getOperand(0);
14932 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
14933 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
14934 DAG.getConstant(0, DL, MVT::i64));
14935}
14936
14937static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
14938 SDLoc DL, SelectionDAG &DAG) {
14939 unsigned ScalarOpcode;
14940 switch (Opcode) {
14941 case ISD::VECREDUCE_AND:
14942 ScalarOpcode = ISD::AND;
14943 break;
14944 case ISD::VECREDUCE_OR:
14945 ScalarOpcode = ISD::OR;
14946 break;
14947 case ISD::VECREDUCE_XOR:
14948 ScalarOpcode = ISD::XOR;
14949 break;
14950 default:
14951 llvm_unreachable("Expected bitwise vector reduction");
14952 return SDValue();
14953 }
14954
14955 EVT VecVT = Vec.getValueType();
14956 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
14957 "Expected power-of-2 length vector");
14958
14959 EVT ElemVT = VecVT.getVectorElementType();
14960
14961 SDValue Result;
14962 unsigned NumElems = VecVT.getVectorNumElements();
14963
14964 // Special case for boolean reductions
14965 if (ElemVT == MVT::i1) {
14966 // Split large vectors into smaller ones
14967 if (NumElems > 16) {
14968 SDValue Lo, Hi;
14969 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14970 EVT HalfVT = Lo.getValueType();
14971 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
14972 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
14973 }
14974
14975 // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
14976 // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
14977 // this element size leads to the best codegen, since e.g. setcc results
14978 // might need to be truncated otherwise.
14979 EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
14980
14981 // any_ext doesn't work with umin/umax, so only use it for uadd.
14982 unsigned ExtendOp =
14983 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
14984 SDValue Extended = DAG.getNode(
14985 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
14986 switch (ScalarOpcode) {
14987 case ISD::AND:
14988 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
14989 break;
14990 case ISD::OR:
14991 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
14992 break;
14993 case ISD::XOR:
14994 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
14995 break;
14996 default:
14997 llvm_unreachable("Unexpected Opcode");
14998 }
14999
15000 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
15001 } else {
15002 // Iteratively split the vector in half and combine using the bitwise
15003 // operation until it fits in a 64 bit register.
15004 while (VecVT.getSizeInBits() > 64) {
15005 SDValue Lo, Hi;
15006 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
15007 VecVT = Lo.getValueType();
15008 NumElems = VecVT.getVectorNumElements();
15009 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
15010 }
15011
15012 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
15013
15014 // Do the remaining work on a scalar since it allows the code generator to
15015 // combine the shift and bitwise operation into one instruction and since
15016 // integer instructions can have higher throughput than vector instructions.
15017 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
15018
15019 // Iteratively combine the lower and upper halves of the scalar using the
15020 // bitwise operation, halving the relevant region of the scalar in each
15021 // iteration, until the relevant region is just one element of the original
15022 // vector.
15023 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
15024 SDValue ShiftAmount =
15025 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
15026 SDValue Shifted =
15027 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
15028 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
15029 }
15030
15031 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
15032 }
15033
15034 return DAG.getAnyExtOrTrunc(Result, DL, VT);
15035}
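// A minimal sketch of the non-boolean path above for vecreduce_and on v4i32
// (the exact instruction selection is an assumption): the vector is split and
// combined until it is 64 bits wide, then finished on the scalar side, roughly
//   ext  v1.16b, v0.16b, v0.16b, #8   // bring the high half down
//   and  v0.8b, v0.8b, v1.8b          // 128 -> 64 bits
//   fmov x8, d0
//   lsr  x9, x8, #32
//   and  w0, w8, w9                   // final 32-bit result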
15036
15037SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
15038 SelectionDAG &DAG) const {
15039 SDValue Src = Op.getOperand(0);
15040
15041 // Try to lower fixed length reductions to SVE.
15042 EVT SrcVT = Src.getValueType();
15043 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
15044 Op.getOpcode() == ISD::VECREDUCE_AND ||
15045 Op.getOpcode() == ISD::VECREDUCE_OR ||
15046 Op.getOpcode() == ISD::VECREDUCE_XOR ||
15047 Op.getOpcode() == ISD::VECREDUCE_FADD ||
15048 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
15049 SrcVT.getVectorElementType() == MVT::i64);
15050 if (SrcVT.isScalableVector() ||
15051 useSVEForFixedLengthVectorVT(
15052 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
15053
15054 if (SrcVT.getVectorElementType() == MVT::i1)
15055 return LowerPredReductionToSVE(Op, DAG);
15056
15057 switch (Op.getOpcode()) {
15058 case ISD::VECREDUCE_ADD:
15059 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
15060 case ISD::VECREDUCE_AND:
15061 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
15062 case ISD::VECREDUCE_OR:
15063 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
15064 case ISD::VECREDUCE_SMAX:
15065 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
15066 case ISD::VECREDUCE_SMIN:
15067 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
15068 case ISD::VECREDUCE_UMAX:
15069 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
15070 case ISD::VECREDUCE_UMIN:
15071 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
15072 case ISD::VECREDUCE_XOR:
15073 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
15074 case ISD::VECREDUCE_FADD:
15075 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
15076 case ISD::VECREDUCE_FMAX:
15077 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
15078 case ISD::VECREDUCE_FMIN:
15079 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
15080 case ISD::VECREDUCE_FMAXIMUM:
15081 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
15082 case ISD::VECREDUCE_FMINIMUM:
15083 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
15084 default:
15085 llvm_unreachable("Unhandled fixed length reduction");
15086 }
15087 }
15088
15089 // Lower NEON reductions.
15090 SDLoc dl(Op);
15091 switch (Op.getOpcode()) {
15092 case ISD::VECREDUCE_AND:
15093 case ISD::VECREDUCE_OR:
15094 case ISD::VECREDUCE_XOR:
15095 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
15096 Op.getValueType(), dl, DAG);
15097 case ISD::VECREDUCE_ADD:
15098 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
15099 case ISD::VECREDUCE_SMAX:
15100 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
15101 case ISD::VECREDUCE_SMIN:
15102 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
15103 case ISD::VECREDUCE_UMAX:
15104 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
15105 case ISD::VECREDUCE_UMIN:
15106 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
15107 default:
15108 llvm_unreachable("Unhandled reduction");
15109 }
15110}
15111
15112SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
15113 SelectionDAG &DAG) const {
15114 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15115 // No point replacing if we don't have the relevant instruction/libcall anyway
15116 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
15117 return SDValue();
15118
15119 // LSE has an atomic load-clear instruction, but not a load-and.
15120 SDLoc dl(Op);
15121 MVT VT = Op.getSimpleValueType();
15122 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
15123 SDValue RHS = Op.getOperand(2);
15124 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
15125 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
15126 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
15127 Op.getOperand(0), Op.getOperand(1), RHS,
15128 AN->getMemOperand());
15129}
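// A hedged example of the rewrite above (memory ordering and registers are
// assumptions): atomicrmw and on an i32 with LSE available becomes an atomic
// load-clear of the inverted mask, roughly
//   mvn   w8, w1
//   ldclr w8, w9, [x0]
// so no compare-and-swap loop is needed.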
15130
15131SDValue
15132AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
15133 SelectionDAG &DAG) const {
15134
15135 SDLoc dl(Op);
15136 // Get the inputs.
15137 SDNode *Node = Op.getNode();
15138 SDValue Chain = Op.getOperand(0);
15139 SDValue Size = Op.getOperand(1);
15140  MaybeAlign Align =
15141      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
15142 EVT VT = Node->getValueType(0);
15143
15144  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
15145          "no-stack-arg-probe")) {
15146 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
15147 Chain = SP.getValue(1);
15148 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
15149 if (Align)
15150 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
15151 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
15152 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
15153 SDValue Ops[2] = {SP, Chain};
15154 return DAG.getMergeValues(Ops, dl);
15155 }
15156
15157 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
15158
15159 EVT PtrVT = getPointerTy(DAG.getDataLayout());
15160  SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
15161                                               PtrVT, 0);
15162
15163 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
15164 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
15165 if (Subtarget->hasCustomCallingConv())
15166 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
15167
15168 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
15169 DAG.getConstant(4, dl, MVT::i64));
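  // The Windows stack-probe helper expects the requested allocation size in
  // X15 expressed in units of 16 bytes, hence the right shift by 4 here and
  // the matching left shift by 4 after the call.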
15170 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
15171 Chain =
15172 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
15173 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
15174 DAG.getRegisterMask(Mask), Chain.getValue(1));
15175 // To match the actual intent better, we should read the output from X15 here
15176 // again (instead of potentially spilling it to the stack), but rereading Size
15177 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
15178 // here.
15179
15180 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
15181 DAG.getConstant(4, dl, MVT::i64));
15182
15183 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
15184 Chain = SP.getValue(1);
15185 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
15186 if (Align)
15187 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
15188 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
15189 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
15190
15191 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
15192
15193 SDValue Ops[2] = {SP, Chain};
15194 return DAG.getMergeValues(Ops, dl);
15195}
15196
15197SDValue
15198AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
15199 SelectionDAG &DAG) const {
15200 // Get the inputs.
15201 SDNode *Node = Op.getNode();
15202 SDValue Chain = Op.getOperand(0);
15203 SDValue Size = Op.getOperand(1);
15204
15205  MaybeAlign Align =
15206      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
15207 SDLoc dl(Op);
15208 EVT VT = Node->getValueType(0);
15209
15210 // Construct the new SP value in a GPR.
15211 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
15212 Chain = SP.getValue(1);
15213 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
15214 if (Align)
15215 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
15216 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
15217
15218 // Set the real SP to the new value with a probing loop.
15219 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
15220 SDValue Ops[2] = {SP, Chain};
15221 return DAG.getMergeValues(Ops, dl);
15222}
15223
15224SDValue
15225AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
15226 SelectionDAG &DAG) const {
15227  MachineFunction &MF = DAG.getMachineFunction();
15228
15229 if (Subtarget->isTargetWindows())
15230 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
15231 else if (hasInlineStackProbe(MF))
15232 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
15233 else
15234 return SDValue();
15235}
15236
15237SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
15238 unsigned NewOp) const {
15239 if (Subtarget->hasSVE2())
15240 return LowerToPredicatedOp(Op, DAG, NewOp);
15241
15242 // Default to expand.
15243 return SDValue();
15244}
15245
15246SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
15247 SelectionDAG &DAG) const {
15248 EVT VT = Op.getValueType();
15249 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
15250
15251 SDLoc DL(Op);
15252 APInt MulImm = Op.getConstantOperandAPInt(0);
15253 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
15254 VT);
15255}
15256
15257/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
15258template <unsigned NumVecs>
15259static bool
15260setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
15261              AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
15262  Info.opc = ISD::INTRINSIC_VOID;
15263  // Retrieve EC from first vector argument.
15264  const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
15265  ElementCount EC = VT.getVectorElementCount();
15266#ifndef NDEBUG
15267 // Check the assumption that all input vectors are the same type.
15268 for (unsigned I = 0; I < NumVecs; ++I)
15269 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
15270 "Invalid type.");
15271#endif
15272 // memVT is `NumVecs * VT`.
15273  Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
15274                                EC * NumVecs);
15275  Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
15276  Info.offset = 0;
15277  Info.align.reset();
15278  Info.flags = MachineMemOperand::MOStore;
15279  return true;
15280}
15281
15282/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
15283/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
15284/// specified in the intrinsic calls.
15285bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
15286                                               const CallInst &I,
15287 MachineFunction &MF,
15288 unsigned Intrinsic) const {
15289 auto &DL = I.getDataLayout();
15290 switch (Intrinsic) {
15291 case Intrinsic::aarch64_sve_st2:
15292 return setInfoSVEStN<2>(*this, DL, Info, I);
15293 case Intrinsic::aarch64_sve_st3:
15294 return setInfoSVEStN<3>(*this, DL, Info, I);
15295 case Intrinsic::aarch64_sve_st4:
15296 return setInfoSVEStN<4>(*this, DL, Info, I);
15297 case Intrinsic::aarch64_neon_ld2:
15298 case Intrinsic::aarch64_neon_ld3:
15299 case Intrinsic::aarch64_neon_ld4:
15300 case Intrinsic::aarch64_neon_ld1x2:
15301 case Intrinsic::aarch64_neon_ld1x3:
15302 case Intrinsic::aarch64_neon_ld1x4: {
15303    Info.opc = ISD::INTRINSIC_W_CHAIN;
15304    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
15305 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
15306 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
15307 Info.offset = 0;
15308 Info.align.reset();
15309 // volatile loads with NEON intrinsics not supported
15310    Info.flags = MachineMemOperand::MOLoad;
15311    return true;
15312 }
15313 case Intrinsic::aarch64_neon_ld2lane:
15314 case Intrinsic::aarch64_neon_ld3lane:
15315 case Intrinsic::aarch64_neon_ld4lane:
15316 case Intrinsic::aarch64_neon_ld2r:
15317 case Intrinsic::aarch64_neon_ld3r:
15318 case Intrinsic::aarch64_neon_ld4r: {
15319    Info.opc = ISD::INTRINSIC_W_CHAIN;
15320    // These intrinsics return a struct whose members all have the same vector type.
15321 Type *RetTy = I.getType();
15322 auto *StructTy = cast<StructType>(RetTy);
15323 unsigned NumElts = StructTy->getNumElements();
15324 Type *VecTy = StructTy->getElementType(0);
15325 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
15326 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
15327 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
15328 Info.offset = 0;
15329 Info.align.reset();
15330 // volatile loads with NEON intrinsics not supported
15331    Info.flags = MachineMemOperand::MOLoad;
15332    return true;
15333 }
15334 case Intrinsic::aarch64_neon_st2:
15335 case Intrinsic::aarch64_neon_st3:
15336 case Intrinsic::aarch64_neon_st4:
15337 case Intrinsic::aarch64_neon_st1x2:
15338 case Intrinsic::aarch64_neon_st1x3:
15339 case Intrinsic::aarch64_neon_st1x4: {
15340    Info.opc = ISD::INTRINSIC_VOID;
15341    unsigned NumElts = 0;
15342 for (const Value *Arg : I.args()) {
15343 Type *ArgTy = Arg->getType();
15344 if (!ArgTy->isVectorTy())
15345 break;
15346 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
15347 }
15348 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
15349 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
15350 Info.offset = 0;
15351 Info.align.reset();
15352 // volatile stores with NEON intrinsics not supported
15353    Info.flags = MachineMemOperand::MOStore;
15354    return true;
15355 }
15356 case Intrinsic::aarch64_neon_st2lane:
15357 case Intrinsic::aarch64_neon_st3lane:
15358 case Intrinsic::aarch64_neon_st4lane: {
15359    Info.opc = ISD::INTRINSIC_VOID;
15360    unsigned NumElts = 0;
15361    // All of the vector arguments have the same type.
15362 Type *VecTy = I.getArgOperand(0)->getType();
15363 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
15364
15365 for (const Value *Arg : I.args()) {
15366 Type *ArgTy = Arg->getType();
15367 if (!ArgTy->isVectorTy())
15368 break;
15369 NumElts += 1;
15370 }
15371
15372 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
15373 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
15374 Info.offset = 0;
15375 Info.align.reset();
15376 // volatile stores with NEON intrinsics not supported
15377    Info.flags = MachineMemOperand::MOStore;
15378    return true;
15379 }
15380 case Intrinsic::aarch64_ldaxr:
15381 case Intrinsic::aarch64_ldxr: {
15382 Type *ValTy = I.getParamElementType(0);
15383    Info.opc = ISD::INTRINSIC_W_CHAIN;
15384    Info.memVT = MVT::getVT(ValTy);
15385 Info.ptrVal = I.getArgOperand(0);
15386 Info.offset = 0;
15387 Info.align = DL.getABITypeAlign(ValTy);
15388    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
15389    return true;
15390 }
15391 case Intrinsic::aarch64_stlxr:
15392 case Intrinsic::aarch64_stxr: {
15393 Type *ValTy = I.getParamElementType(1);
15394    Info.opc = ISD::INTRINSIC_W_CHAIN;
15395    Info.memVT = MVT::getVT(ValTy);
15396 Info.ptrVal = I.getArgOperand(1);
15397 Info.offset = 0;
15398 Info.align = DL.getABITypeAlign(ValTy);
15399    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
15400    return true;
15401 }
15402 case Intrinsic::aarch64_ldaxp:
15403 case Intrinsic::aarch64_ldxp:
15404    Info.opc = ISD::INTRINSIC_W_CHAIN;
15405    Info.memVT = MVT::i128;
15406 Info.ptrVal = I.getArgOperand(0);
15407 Info.offset = 0;
15408 Info.align = Align(16);
15409    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
15410    return true;
15411 case Intrinsic::aarch64_stlxp:
15412 case Intrinsic::aarch64_stxp:
15413    Info.opc = ISD::INTRINSIC_W_CHAIN;
15414    Info.memVT = MVT::i128;
15415 Info.ptrVal = I.getArgOperand(2);
15416 Info.offset = 0;
15417 Info.align = Align(16);
15418    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
15419    return true;
15420 case Intrinsic::aarch64_sve_ldnt1: {
15421 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
15422    Info.opc = ISD::INTRINSIC_W_CHAIN;
15423    Info.memVT = MVT::getVT(I.getType());
15424 Info.ptrVal = I.getArgOperand(1);
15425 Info.offset = 0;
15426 Info.align = DL.getABITypeAlign(ElTy);
15427    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
15428    return true;
15429 }
15430 case Intrinsic::aarch64_sve_stnt1: {
15431 Type *ElTy =
15432 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
15433    Info.opc = ISD::INTRINSIC_W_CHAIN;
15434    Info.memVT = MVT::getVT(I.getOperand(0)->getType());
15435 Info.ptrVal = I.getArgOperand(2);
15436 Info.offset = 0;
15437 Info.align = DL.getABITypeAlign(ElTy);
15438    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
15439    return true;
15440 }
15441 case Intrinsic::aarch64_mops_memset_tag: {
15442 Value *Dst = I.getArgOperand(0);
15443 Value *Val = I.getArgOperand(1);
15444    Info.opc = ISD::INTRINSIC_W_CHAIN;
15445    Info.memVT = MVT::getVT(Val->getType());
15446 Info.ptrVal = Dst;
15447 Info.offset = 0;
15448 Info.align = I.getParamAlign(0).valueOrOne();
15449    Info.flags = MachineMemOperand::MOStore;
15450    // The size of the memory being operated on is unknown at this point
15451    Info.size = MemoryLocation::UnknownSize;
15452    return true;
15453 }
15454 default:
15455 break;
15456 }
15457
15458 return false;
15459}
15460
15461bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
15462                                                  ISD::LoadExtType ExtTy,
15463 EVT NewVT) const {
15464 // TODO: This may be worth removing. Check regression tests for diffs.
15465 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
15466 return false;
15467
15468 // If we're reducing the load width in order to avoid having to use an extra
15469 // instruction to do extension then it's probably a good idea.
15470 if (ExtTy != ISD::NON_EXTLOAD)
15471 return true;
15472 // Don't reduce load width if it would prevent us from combining a shift into
15473 // the offset.
15474 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
15475 assert(Mem);
15476 const SDValue &Base = Mem->getBasePtr();
15477 if (Base.getOpcode() == ISD::ADD &&
15478 Base.getOperand(1).getOpcode() == ISD::SHL &&
15479 Base.getOperand(1).hasOneUse() &&
15480 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
15481 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
15482 if (Mem->getMemoryVT().isScalableVector())
15483 return false;
15484 // The shift can be combined if it matches the size of the value being
15485 // loaded (and so reducing the width would make it not match).
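  // For example, an i64 load whose address is (add base, (shl idx, 3)) can be
  // selected as LDR Xt, [Xn, Xm, LSL #3]; narrowing it to an i32 load would
  // need LSL #2 and lose that fold.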
15486 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
15487 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
15488 if (ShiftAmount == Log2_32(LoadBytes))
15489 return false;
15490 }
15491 // We have no reason to disallow reducing the load width, so allow it.
15492 return true;
15493}
15494
15495// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
15496bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
15497  EVT VT = Extend.getValueType();
15498 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
15499 SDValue Extract = Extend.getOperand(0);
15500 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
15501 Extract = Extract.getOperand(0);
15502 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
15503 EVT VecVT = Extract.getOperand(0).getValueType();
15504 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
15505 return false;
15506 }
15507 }
15508 return true;
15509}
15510
15511// Truncations from 64-bit GPR to 32-bit GPR are free.
15512bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
15513  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15514 return false;
15515 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
15516 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
15517 return NumBits1 > NumBits2;
15518}
15519bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
15520  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15521 return false;
15522 uint64_t NumBits1 = VT1.getFixedSizeInBits();
15523 uint64_t NumBits2 = VT2.getFixedSizeInBits();
15524 return NumBits1 > NumBits2;
15525}
15526
15527/// Check if it is profitable to hoist instruction in then/else to if.
15528/// Not profitable if I and its user can form an FMA instruction
15529/// because we prefer FMSUB/FMADD.
15530bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
15531  if (I->getOpcode() != Instruction::FMul)
15532 return true;
15533
15534 if (!I->hasOneUse())
15535 return true;
15536
15537 Instruction *User = I->user_back();
15538
15539 if (!(User->getOpcode() == Instruction::FSub ||
15540 User->getOpcode() == Instruction::FAdd))
15541 return true;
15542
15543  const TargetOptions &Options = getTargetMachine().Options;
15544  const Function *F = I->getFunction();
15545 const DataLayout &DL = F->getDataLayout();
15546 Type *Ty = User->getOperand(0)->getType();
15547
15548 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
15549           isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
15550           (Options.AllowFPOpFusion == FPOpFusion::Fast ||
15551 Options.UnsafeFPMath));
15552}
15553
15554// All 32-bit GPR operations implicitly zero the high-half of the corresponding
15555// 64-bit GPR.
15556bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
15557  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15558 return false;
15559 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
15560 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
15561 return NumBits1 == 32 && NumBits2 == 64;
15562}
15563bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
15564  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15565 return false;
15566 unsigned NumBits1 = VT1.getSizeInBits();
15567 unsigned NumBits2 = VT2.getSizeInBits();
15568 return NumBits1 == 32 && NumBits2 == 64;
15569}
15570
15571bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
15572  EVT VT1 = Val.getValueType();
15573 if (isZExtFree(VT1, VT2)) {
15574 return true;
15575 }
15576
15577 if (Val.getOpcode() != ISD::LOAD)
15578 return false;
15579
15580 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
15581 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
15582 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
15583 VT1.getSizeInBits() <= 32);
15584}
15585
15586bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
15587 if (isa<FPExtInst>(Ext))
15588 return false;
15589
15590 // Vector types are not free.
15591 if (Ext->getType()->isVectorTy())
15592 return false;
15593
15594 for (const Use &U : Ext->uses()) {
15595 // The extension is free if we can fold it with a left shift in an
15596 // addressing mode or an arithmetic operation: add, sub, and cmp.
15597
15598 // Is there a shift?
15599 const Instruction *Instr = cast<Instruction>(U.getUser());
15600
15601 // Is this a constant shift?
15602 switch (Instr->getOpcode()) {
15603 case Instruction::Shl:
15604 if (!isa<ConstantInt>(Instr->getOperand(1)))
15605 return false;
15606 break;
15607 case Instruction::GetElementPtr: {
15608 gep_type_iterator GTI = gep_type_begin(Instr);
15609 auto &DL = Ext->getDataLayout();
15610 std::advance(GTI, U.getOperandNo()-1);
15611 Type *IdxTy = GTI.getIndexedType();
15612 // This extension will end up with a shift because of the scaling factor.
15613 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
15614 // Get the shift amount based on the scaling factor:
15615 // log2(sizeof(IdxTy)) - log2(8).
15616 if (IdxTy->isScalableTy())
15617 return false;
15618 uint64_t ShiftAmt =
15619 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
15620 3;
15621 // Is the constant foldable in the shift of the addressing mode?
15622 // I.e., shift amount is between 1 and 4 inclusive.
15623 if (ShiftAmt == 0 || ShiftAmt > 4)
15624 return false;
15625 break;
15626 }
15627 case Instruction::Trunc:
15628 // Check if this is a noop.
15629 // trunc(sext ty1 to ty2) to ty1.
15630 if (Instr->getType() == Ext->getOperand(0)->getType())
15631 continue;
15632 [[fallthrough]];
15633 default:
15634 return false;
15635 }
15636
15637 // At this point we can use the bfm family, so this extension is free
15638 // for that use.
15639 }
15640 return true;
15641}
15642
15643static bool isSplatShuffle(Value *V) {
15644 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
15645 return all_equal(Shuf->getShuffleMask());
15646 return false;
15647}
15648
15649/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
15650/// or upper half of the vector elements.
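/// For example, with <8 x i16> inputs,
///   shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// extracts the upper half of the elements.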
15651static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
15652 bool AllowSplat = false) {
15653 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
15654 auto *FullTy = FullV->getType();
15655 auto *HalfTy = HalfV->getType();
15656 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
15657 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
15658 };
15659
15660 auto extractHalf = [](Value *FullV, Value *HalfV) {
15661 auto *FullVT = cast<FixedVectorType>(FullV->getType());
15662 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
15663 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
15664 };
15665
15666 ArrayRef<int> M1, M2;
15667 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
15668 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
15669 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
15670 return false;
15671
15672  // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
15673 // it is not checked as an extract below.
15674 if (AllowSplat && isSplatShuffle(Op1))
15675 S1Op1 = nullptr;
15676 if (AllowSplat && isSplatShuffle(Op2))
15677 S2Op1 = nullptr;
15678
15679 // Check that the operands are half as wide as the result and we extract
15680 // half of the elements of the input vectors.
15681 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
15682 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
15683 return false;
15684
15685 // Check the mask extracts either the lower or upper half of vector
15686 // elements.
15687 int M1Start = 0;
15688 int M2Start = 0;
15689 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
15690 if ((S1Op1 &&
15691 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
15692 (S2Op1 &&
15693 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
15694 return false;
15695
15696 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
15697 (M2Start != 0 && M2Start != (NumElements / 2)))
15698 return false;
15699 if (S1Op1 && S2Op1 && M1Start != M2Start)
15700 return false;
15701
15702 return true;
15703}
15704
15705/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
15706/// of the vector elements.
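/// For example, a pair of "sext <4 x i16> %x to <4 x i32>" instructions
/// qualifies, since each doubles the element width.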
15707static bool areExtractExts(Value *Ext1, Value *Ext2) {
15708 auto areExtDoubled = [](Instruction *Ext) {
15709 return Ext->getType()->getScalarSizeInBits() ==
15710 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
15711 };
15712
15713 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
15714 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
15715 !areExtDoubled(cast<Instruction>(Ext1)) ||
15716 !areExtDoubled(cast<Instruction>(Ext2)))
15717 return false;
15718
15719 return true;
15720}
15721
15722/// Check if Op could be used with vmull_high_p64 intrinsic.
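/// That is, an extractelement of lane 1 from a <2 x i64> vector, which is the
/// high half consumed by PMULL2.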
15723static bool isOperandOfVmullHighP64(Value *Op) {
15724  Value *VectorOperand = nullptr;
15725 ConstantInt *ElementIndex = nullptr;
15726 return match(Op, m_ExtractElt(m_Value(VectorOperand),
15727 m_ConstantInt(ElementIndex))) &&
15728 ElementIndex->getValue() == 1 &&
15729 isa<FixedVectorType>(VectorOperand->getType()) &&
15730 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
15731}
15732
15733/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
15734static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
15735  return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
15736}
15737
15738static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
15739  // Restrict ourselves to the form CodeGenPrepare typically constructs.
15740 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
15741 if (!GEP || GEP->getNumOperands() != 2)
15742 return false;
15743
15744 Value *Base = GEP->getOperand(0);
15745 Value *Offsets = GEP->getOperand(1);
15746
15747 // We only care about scalar_base+vector_offsets.
15748 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
15749 return false;
15750
15751 // Sink extends that would allow us to use 32-bit offset vectors.
15752 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
15753 auto *OffsetsInst = cast<Instruction>(Offsets);
15754 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
15755 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
15756 Ops.push_back(&GEP->getOperandUse(1));
15757 }
15758
15759 // Sink the GEP.
15760 return true;
15761}
15762
15763/// We want to sink the following cases:
15764/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
15765/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
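/// For example (illustrative IR), in
///   %scaled = shl i64 %vscale, 4
///   %gep = getelementptr i8, ptr %A, i64 %scaled
/// sinking the vscale computation next to the gep lets ISel fold the scaled
/// vscale into the address arithmetic.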
15766static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
15767  if (match(Op, m_VScale()))
15768 return true;
15769 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
15770      match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
15771    Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
15772 return true;
15773 }
15774 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
15775      match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
15776    Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
15777 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
15778 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
15779 return true;
15780 }
15781 return false;
15782}
15783
15784/// Check if sinking \p I's operands to I's basic block is profitable, because
15785/// the operands can be folded into a target instruction, e.g.
15786/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
15787bool AArch64TargetLowering::shouldSinkOperands(
15788    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
15789 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
15790 switch (II->getIntrinsicID()) {
15791 case Intrinsic::aarch64_neon_smull:
15792 case Intrinsic::aarch64_neon_umull:
15793 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
15794 /*AllowSplat=*/true)) {
15795 Ops.push_back(&II->getOperandUse(0));
15796 Ops.push_back(&II->getOperandUse(1));
15797 return true;
15798 }
15799 [[fallthrough]];
15800
15801 case Intrinsic::fma:
15802 if (isa<VectorType>(I->getType()) &&
15803 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
15804 !Subtarget->hasFullFP16())
15805 return false;
15806 [[fallthrough]];
15807 case Intrinsic::aarch64_neon_sqdmull:
15808 case Intrinsic::aarch64_neon_sqdmulh:
15809 case Intrinsic::aarch64_neon_sqrdmulh:
15810 // Sink splats for index lane variants
15811 if (isSplatShuffle(II->getOperand(0)))
15812 Ops.push_back(&II->getOperandUse(0));
15813 if (isSplatShuffle(II->getOperand(1)))
15814 Ops.push_back(&II->getOperandUse(1));
15815 return !Ops.empty();
15816 case Intrinsic::aarch64_neon_fmlal:
15817 case Intrinsic::aarch64_neon_fmlal2:
15818 case Intrinsic::aarch64_neon_fmlsl:
15819 case Intrinsic::aarch64_neon_fmlsl2:
15820 // Sink splats for index lane variants
15821 if (isSplatShuffle(II->getOperand(1)))
15822 Ops.push_back(&II->getOperandUse(1));
15823 if (isSplatShuffle(II->getOperand(2)))
15824 Ops.push_back(&II->getOperandUse(2));
15825 return !Ops.empty();
15826 case Intrinsic::aarch64_sve_ptest_first:
15827 case Intrinsic::aarch64_sve_ptest_last:
15828 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
15829 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
15830 Ops.push_back(&II->getOperandUse(0));
15831 return !Ops.empty();
15832 case Intrinsic::aarch64_sme_write_horiz:
15833 case Intrinsic::aarch64_sme_write_vert:
15834 case Intrinsic::aarch64_sme_writeq_horiz:
15835 case Intrinsic::aarch64_sme_writeq_vert: {
15836 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
15837 if (!Idx || Idx->getOpcode() != Instruction::Add)
15838 return false;
15839 Ops.push_back(&II->getOperandUse(1));
15840 return true;
15841 }
15842 case Intrinsic::aarch64_sme_read_horiz:
15843 case Intrinsic::aarch64_sme_read_vert:
15844 case Intrinsic::aarch64_sme_readq_horiz:
15845 case Intrinsic::aarch64_sme_readq_vert:
15846 case Intrinsic::aarch64_sme_ld1b_vert:
15847 case Intrinsic::aarch64_sme_ld1h_vert:
15848 case Intrinsic::aarch64_sme_ld1w_vert:
15849 case Intrinsic::aarch64_sme_ld1d_vert:
15850 case Intrinsic::aarch64_sme_ld1q_vert:
15851 case Intrinsic::aarch64_sme_st1b_vert:
15852 case Intrinsic::aarch64_sme_st1h_vert:
15853 case Intrinsic::aarch64_sme_st1w_vert:
15854 case Intrinsic::aarch64_sme_st1d_vert:
15855 case Intrinsic::aarch64_sme_st1q_vert:
15856 case Intrinsic::aarch64_sme_ld1b_horiz:
15857 case Intrinsic::aarch64_sme_ld1h_horiz:
15858 case Intrinsic::aarch64_sme_ld1w_horiz:
15859 case Intrinsic::aarch64_sme_ld1d_horiz:
15860 case Intrinsic::aarch64_sme_ld1q_horiz:
15861 case Intrinsic::aarch64_sme_st1b_horiz:
15862 case Intrinsic::aarch64_sme_st1h_horiz:
15863 case Intrinsic::aarch64_sme_st1w_horiz:
15864 case Intrinsic::aarch64_sme_st1d_horiz:
15865 case Intrinsic::aarch64_sme_st1q_horiz: {
15866 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
15867 if (!Idx || Idx->getOpcode() != Instruction::Add)
15868 return false;
15869 Ops.push_back(&II->getOperandUse(3));
15870 return true;
15871 }
15872 case Intrinsic::aarch64_neon_pmull:
15873 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
15874 return false;
15875 Ops.push_back(&II->getOperandUse(0));
15876 Ops.push_back(&II->getOperandUse(1));
15877 return true;
15878 case Intrinsic::aarch64_neon_pmull64:
15879 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
15880 II->getArgOperand(1)))
15881 return false;
15882 Ops.push_back(&II->getArgOperandUse(0));
15883 Ops.push_back(&II->getArgOperandUse(1));
15884 return true;
15885 case Intrinsic::masked_gather:
15886 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
15887 return false;
15888 Ops.push_back(&II->getArgOperandUse(0));
15889 return true;
15890 case Intrinsic::masked_scatter:
15891 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
15892 return false;
15893 Ops.push_back(&II->getArgOperandUse(1));
15894 return true;
15895 default:
15896 return false;
15897 }
15898 }
15899
15900 // Sink vscales closer to uses for better isel
15901 switch (I->getOpcode()) {
15902 case Instruction::GetElementPtr:
15903 case Instruction::Add:
15904 case Instruction::Sub:
15905 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
15906 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
15907 Ops.push_back(&I->getOperandUse(Op));
15908 return true;
15909 }
15910 }
15911 break;
15912 default:
15913 break;
15914 }
15915
15916 if (!I->getType()->isVectorTy())
15917 return false;
15918
15919 switch (I->getOpcode()) {
15920 case Instruction::Sub:
15921 case Instruction::Add: {
15922 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
15923 return false;
15924
15925 // If the exts' operands extract either the lower or upper elements, we
15926 // can sink them too.
15927 auto Ext1 = cast<Instruction>(I->getOperand(0));
15928 auto Ext2 = cast<Instruction>(I->getOperand(1));
15929 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
15930 Ops.push_back(&Ext1->getOperandUse(0));
15931 Ops.push_back(&Ext2->getOperandUse(0));
15932 }
15933
15934 Ops.push_back(&I->getOperandUse(0));
15935 Ops.push_back(&I->getOperandUse(1));
15936
15937 return true;
15938 }
15939 case Instruction::Or: {
15940 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
15941 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
15942 if (Subtarget->hasNEON()) {
15943 Instruction *OtherAnd, *IA, *IB;
15944 Value *MaskValue;
15945 // MainAnd refers to And instruction that has 'Not' as one of its operands
15946 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
15947 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
15948 m_Instruction(IA)))))) {
15949 if (match(OtherAnd,
15950 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
15951 Instruction *MainAnd = I->getOperand(0) == OtherAnd
15952 ? cast<Instruction>(I->getOperand(1))
15953 : cast<Instruction>(I->getOperand(0));
15954
15955 // Both Ands should be in same basic block as Or
15956 if (I->getParent() != MainAnd->getParent() ||
15957 I->getParent() != OtherAnd->getParent())
15958 return false;
15959
15960 // Non-mask operands of both Ands should also be in same basic block
15961 if (I->getParent() != IA->getParent() ||
15962 I->getParent() != IB->getParent())
15963 return false;
15964
15965 Ops.push_back(&MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
15966 Ops.push_back(&I->getOperandUse(0));
15967 Ops.push_back(&I->getOperandUse(1));
15968
15969 return true;
15970 }
15971 }
15972 }
15973
15974 return false;
15975 }
15976 case Instruction::Mul: {
15977 int NumZExts = 0, NumSExts = 0;
15978 for (auto &Op : I->operands()) {
15979 // Make sure we are not already sinking this operand
15980 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
15981 continue;
15982
15983 if (match(&Op, m_SExt(m_Value()))) {
15984 NumSExts++;
15985 continue;
15986 } else if (match(&Op, m_ZExt(m_Value()))) {
15987 NumZExts++;
15988 continue;
15989 }
15990
15991 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
15992
15993 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
15994 // operand and the s/zext can help create indexed s/umull. This is
15995 // especially useful to prevent i64 mul being scalarized.
15996 if (Shuffle && isSplatShuffle(Shuffle) &&
15997 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
15998 Ops.push_back(&Shuffle->getOperandUse(0));
15999 Ops.push_back(&Op);
16000 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
16001 NumSExts++;
16002 else
16003 NumZExts++;
16004 continue;
16005 }
16006
16007 if (!Shuffle)
16008 continue;
16009
16010 Value *ShuffleOperand = Shuffle->getOperand(0);
16011 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
16012 if (!Insert)
16013 continue;
16014
16015 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
16016 if (!OperandInstr)
16017 continue;
16018
16019 ConstantInt *ElementConstant =
16020 dyn_cast<ConstantInt>(Insert->getOperand(2));
16021 // Check that the insertelement is inserting into element 0
16022 if (!ElementConstant || !ElementConstant->isZero())
16023 continue;
16024
16025 unsigned Opcode = OperandInstr->getOpcode();
16026 if (Opcode == Instruction::SExt)
16027 NumSExts++;
16028 else if (Opcode == Instruction::ZExt)
16029 NumZExts++;
16030 else {
16031 // If we find that the top bits are known 0, then we can sink and allow
16032 // the backend to generate a umull.
16033 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
16034 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
16035 const DataLayout &DL = I->getDataLayout();
16036 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
16037 continue;
16038 NumZExts++;
16039 }
16040
16041 Ops.push_back(&Shuffle->getOperandUse(0));
16042 Ops.push_back(&Op);
16043 }
16044
16045    // It is only profitable to sink if we found two extends of the same kind.
16046 return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
16047 }
16048 default:
16049 return false;
16050 }
16051 return false;
16052}
16053
16054static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
16055 unsigned NumElts, bool IsLittleEndian,
16056 SmallVectorImpl<int> &Mask) {
16057 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
16058 return false;
16059
16060 assert(DstWidth % SrcWidth == 0 &&
16061 "TBL lowering is not supported for a conversion instruction with this "
16062 "source and destination element type.");
16063
16064 unsigned Factor = DstWidth / SrcWidth;
16065 unsigned MaskLen = NumElts * Factor;
16066
16067 Mask.clear();
16068 Mask.resize(MaskLen, NumElts);
16069
16070 unsigned SrcIndex = 0;
16071 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
16072 Mask[I] = SrcIndex++;
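  // For example, with SrcWidth=8, DstWidth=32, NumElts=4 and little-endian,
  // the mask is {0,4,4,4, 1,4,4,4, 2,4,4,4, 3,4,4,4}. Index NumElts selects
  // the explicitly zeroed element of the second shuffle operand (as set up by
  // the caller below), so each source byte lands in the low byte of a
  // zero-filled 32-bit lane.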
16073
16074 return true;
16075}
16076
16077static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
16078                                      FixedVectorType *ZExtTy,
16079 FixedVectorType *DstTy,
16080 bool IsLittleEndian) {
16081 auto *SrcTy = cast<FixedVectorType>(Op->getType());
16082 unsigned NumElts = SrcTy->getNumElements();
16083 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16084 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16085
16086 SmallVector<int> Mask;
16087 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
16088 return nullptr;
16089
16090 auto *FirstEltZero = Builder.CreateInsertElement(
16091 PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
16092 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
16093 Result = Builder.CreateBitCast(Result, DstTy);
16094 if (DstTy != ZExtTy)
16095 Result = Builder.CreateZExt(Result, ZExtTy);
16096 return Result;
16097}
16098
16099static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
16100 IRBuilder<> Builder(TI);
16101  SmallVector<Value *> Parts;
16102  int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
16103 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
16104 auto *DstTy = cast<FixedVectorType>(TI->getType());
16105 assert(SrcTy->getElementType()->isIntegerTy() &&
16106 "Non-integer type source vector element is not supported");
16107 assert(DstTy->getElementType()->isIntegerTy(8) &&
16108 "Unsupported destination vector element type");
16109 unsigned SrcElemTySz =
16110 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16111 unsigned DstElemTySz =
16112 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16113 assert((SrcElemTySz % DstElemTySz == 0) &&
16114 "Cannot lower truncate to tbl instructions for a source element size "
16115 "that is not divisible by the destination element size");
16116 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
16117 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
16118 "Unsupported source vector element type size");
16119 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
16120
16121 // Create a mask to choose every nth byte from the source vector table of
16122 // bytes to create the truncated destination vector, where 'n' is the truncate
16123 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
16124 // 0,8,16,..Y*8th bytes for the little-endian format
16125  SmallVector<Constant *, 16> MaskConst;
16126  for (int Itr = 0; Itr < 16; Itr++) {
16127 if (Itr < NumElements)
16128 MaskConst.push_back(Builder.getInt8(
16129 IsLittleEndian ? Itr * TruncFactor
16130 : Itr * TruncFactor + (TruncFactor - 1)));
16131 else
16132 MaskConst.push_back(Builder.getInt8(255));
16133 }
16134
16135 int MaxTblSz = 128 * 4;
16136 int MaxSrcSz = SrcElemTySz * NumElements;
16137 int ElemsPerTbl =
16138 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
16139 assert(ElemsPerTbl <= 16 &&
16140 "Maximum elements selected using TBL instruction cannot exceed 16!");
16141
16142 int ShuffleCount = 128 / SrcElemTySz;
16143 SmallVector<int> ShuffleLanes;
16144 for (int i = 0; i < ShuffleCount; ++i)
16145 ShuffleLanes.push_back(i);
16146
16147 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
16148 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
16149 // call TBL & save the result in a vector of TBL results for combining later.
16150  SmallVector<Value *> Results;
16151  while (ShuffleLanes.back() < NumElements) {
16152 Parts.push_back(Builder.CreateBitCast(
16153 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
16154
16155 if (Parts.size() == 4) {
16156      auto *F = Intrinsic::getDeclaration(TI->getModule(),
16157                                          Intrinsic::aarch64_neon_tbl4, VecTy);
16158 Parts.push_back(ConstantVector::get(MaskConst));
16159 Results.push_back(Builder.CreateCall(F, Parts));
16160 Parts.clear();
16161 }
16162
16163 for (int i = 0; i < ShuffleCount; ++i)
16164 ShuffleLanes[i] += ShuffleCount;
16165 }
16166
16167 assert((Parts.empty() || Results.empty()) &&
16168 "Lowering trunc for vectors requiring different TBL instructions is "
16169 "not supported!");
16170 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
16171 // registers
16172 if (!Parts.empty()) {
16173 Intrinsic::ID TblID;
16174 switch (Parts.size()) {
16175 case 1:
16176 TblID = Intrinsic::aarch64_neon_tbl1;
16177 break;
16178 case 2:
16179 TblID = Intrinsic::aarch64_neon_tbl2;
16180 break;
16181 case 3:
16182 TblID = Intrinsic::aarch64_neon_tbl3;
16183 break;
16184 }
16185
16186 auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy);
16187 Parts.push_back(ConstantVector::get(MaskConst));
16188 Results.push_back(Builder.CreateCall(F, Parts));
16189 }
16190
16191 // Extract the destination vector from TBL result(s) after combining them
16192 // where applicable. Currently, at most two TBLs are supported.
16193 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
16194 "more than 2 tbl instructions!");
16195 Value *FinalResult = Results[0];
16196 if (Results.size() == 1) {
16197 if (ElemsPerTbl < 16) {
16198 SmallVector<int> FinalMask(ElemsPerTbl);
16199 std::iota(FinalMask.begin(), FinalMask.end(), 0);
16200 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
16201 }
16202 } else {
16203 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
16204 if (ElemsPerTbl < 16) {
16205 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
16206 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
16207 } else {
16208 std::iota(FinalMask.begin(), FinalMask.end(), 0);
16209 }
16210 FinalResult =
16211 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
16212 }
16213
16214 TI->replaceAllUsesWith(FinalResult);
16215 TI->eraseFromParent();
16216}
16217
16218bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
16219    Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
16220 // shuffle_vector instructions are serialized when targeting SVE,
16221 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
16222 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
16223 return false;
16224
16225 // Try to optimize conversions using tbl. This requires materializing constant
16226 // index vectors, which can increase code size and add loads. Skip the
16227 // transform unless the conversion is in a loop block guaranteed to execute
16228 // and we are not optimizing for size.
16229 Function *F = I->getParent()->getParent();
16230 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
16231 F->hasOptSize())
16232 return false;
16233
16234 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
16235 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
16236 if (!SrcTy || !DstTy)
16237 return false;
16238
16239 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
16240 // lowered to tbl instructions to insert the original i8 elements
16241 // into i8x lanes. This is enabled for cases where it is beneficial.
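  // For example, "zext <8 x i8> %x to <8 x i32>" becomes a byte shuffle of %x
  // with an explicitly zeroed vector, bitcast to <8 x i32>; the shuffle is
  // then selected as tbl instructions.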
16242 auto *ZExt = dyn_cast<ZExtInst>(I);
16243 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
16244 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
16245 if (DstWidth % 8 != 0)
16246 return false;
16247
16248 auto *TruncDstType =
16249 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
16250 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
16251 // the remaining ZExt folded into the user, don't use tbl lowering.
16252 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
16253 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
16254                             TargetTransformInfo::CastContextHint::None,
16255                             TargetTransformInfo::TCK_SizeAndLatency) == TargetTransformInfo::TCC_Basic)
16256      if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
16257 return false;
16258
16259 DstTy = TruncDstType;
16260 }
16261 IRBuilder<> Builder(ZExt);
16262    Value *Result = createTblShuffleForZExt(
16263        Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
16264 DstTy, Subtarget->isLittleEndian());
16265 if (!Result)
16266 return false;
16267 ZExt->replaceAllUsesWith(Result);
16268 ZExt->eraseFromParent();
16269 return true;
16270 }
16271
16272 auto *UIToFP = dyn_cast<UIToFPInst>(I);
16273 if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
16274 DstTy->getElementType()->isFloatTy()) {
16275 IRBuilder<> Builder(I);
16276    auto *ZExt = createTblShuffleForZExt(
16277        Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
16278 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
16279 if (!ZExt)
16280 return false;
16281 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
16282 I->replaceAllUsesWith(UI);
16283 I->eraseFromParent();
16284 return true;
16285 }
16286
16287 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
16288 // followed by a truncate lowered to using tbl.4.
16289 auto *FPToUI = dyn_cast<FPToUIInst>(I);
16290 if (FPToUI &&
16291 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
16292 SrcTy->getElementType()->isFloatTy() &&
16293 DstTy->getElementType()->isIntegerTy(8)) {
16294 IRBuilder<> Builder(I);
16295 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
16296 VectorType::getInteger(SrcTy));
16297 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
16298 I->replaceAllUsesWith(TruncI);
16299 I->eraseFromParent();
16300 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
16301 return true;
16302 }
16303
16304 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
16305 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
16306 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
16307 // registers
16308 auto *TI = dyn_cast<TruncInst>(I);
16309 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
16310 ((SrcTy->getElementType()->isIntegerTy(32) ||
16311 SrcTy->getElementType()->isIntegerTy(64)) &&
16312 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
16313 createTblForTrunc(TI, Subtarget->isLittleEndian());
16314 return true;
16315 }
16316
16317 return false;
16318}
16319
16320bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
16321                                          Align &RequiredAligment) const {
16322 if (!LoadedType.isSimple() ||
16323 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
16324 return false;
16325 // Cyclone supports unaligned accesses.
16326 RequiredAligment = Align(1);
16327 unsigned NumBits = LoadedType.getSizeInBits();
16328 return NumBits == 32 || NumBits == 64;
16329}
16330
16331/// A helper function for determining the number of interleaved accesses we
16332/// will generate when lowering accesses of the given type.
16333unsigned AArch64TargetLowering::getNumInterleavedAccesses(
16334    VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
16335 unsigned VecSize = 128;
16336 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
16337 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
16338 if (UseScalable && isa<FixedVectorType>(VecTy))
16339 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
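  // For example, a fixed-length <16 x i32> (512 bits) lowered with 128-bit
  // NEON vectors yields (16 * 32 + 127) / 128 = 4 interleaved accesses.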
16340 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
16341}
16342
16343MachineMemOperand::Flags
16344AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
16345  if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
16346 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
16347 return MOStridedAccess;
16348  return MachineMemOperand::MONone;
16349}
16350
16351bool AArch64TargetLowering::isLegalInterleavedAccessType(
16352    VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
16353 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
16354 auto EC = VecTy->getElementCount();
16355 unsigned MinElts = EC.getKnownMinValue();
16356
16357 UseScalable = false;
16358
16359 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
16360 (!Subtarget->useSVEForFixedLengthVectors() ||
16362 return false;
16363
16364 if (isa<ScalableVectorType>(VecTy) &&
16365 !Subtarget->isSVEorStreamingSVEAvailable())
16366 return false;
16367
16368 // Ensure the number of vector elements is greater than 1.
16369 if (MinElts < 2)
16370 return false;
16371
16372 // Ensure the element type is legal.
16373 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
16374 return false;
16375
16376 if (EC.isScalable()) {
16377 UseScalable = true;
16378 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
16379 }
16380
16381 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
16382 if (Subtarget->useSVEForFixedLengthVectors()) {
16383 unsigned MinSVEVectorSize =
16384 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
16385 if (VecSize % MinSVEVectorSize == 0 ||
16386 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
16387 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
16388 UseScalable = true;
16389 return true;
16390 }
16391 }
16392
16393 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
16394 // 128 will be split into multiple interleaved accesses.
16395 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
16396}
16397
16398static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
16399  if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
16400 return ScalableVectorType::get(VTy->getElementType(), 2);
16401
16402 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
16403 return ScalableVectorType::get(VTy->getElementType(), 4);
16404
16405 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
16406 return ScalableVectorType::get(VTy->getElementType(), 8);
16407
16408 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
16409 return ScalableVectorType::get(VTy->getElementType(), 8);
16410
16411 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
16412 return ScalableVectorType::get(VTy->getElementType(), 2);
16413
16414 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
16415 return ScalableVectorType::get(VTy->getElementType(), 4);
16416
16417 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
16418 return ScalableVectorType::get(VTy->getElementType(), 8);
16419
16420 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
16421 return ScalableVectorType::get(VTy->getElementType(), 16);
16422
16423 llvm_unreachable("Cannot handle input vector type");
16424}
16425
16426static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
16427 bool Scalable, Type *LDVTy,
16428 Type *PtrTy) {
16429 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16430 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
16431 Intrinsic::aarch64_sve_ld3_sret,
16432 Intrinsic::aarch64_sve_ld4_sret};
16433 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
16434 Intrinsic::aarch64_neon_ld3,
16435 Intrinsic::aarch64_neon_ld4};
16436 if (Scalable)
16437 return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy});
16438
16439 return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy});
16440}
16441
16442static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
16443 bool Scalable, Type *STVTy,
16444 Type *PtrTy) {
16445 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16446 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
16447 Intrinsic::aarch64_sve_st3,
16448 Intrinsic::aarch64_sve_st4};
16449 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
16450 Intrinsic::aarch64_neon_st3,
16451 Intrinsic::aarch64_neon_st4};
16452 if (Scalable)
16453 return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy});
16454
16455 return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy});
16456}
16457
16458/// Lower an interleaved load into a ldN intrinsic.
16459///
16460/// E.g. Lower an interleaved load (Factor = 2):
16461/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
16462/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
16463/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
16464///
16465/// Into:
16466/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
16467/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
16468/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
16469bool AArch64TargetLowering::lowerInterleavedLoad(
16470    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
16471    ArrayRef<unsigned> Indices, unsigned Factor) const {
16472 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16473 "Invalid interleave factor");
16474 assert(!Shuffles.empty() && "Empty shufflevector input");
16475 assert(Shuffles.size() == Indices.size() &&
16476 "Unmatched number of shufflevectors and indices");
16477
16478 const DataLayout &DL = LI->getDataLayout();
16479
16480 VectorType *VTy = Shuffles[0]->getType();
16481
16482 // Skip if we do not have NEON and skip illegal vector types. We can
16483 // "legalize" wide vector types into multiple interleaved accesses as long as
16484 // the vector types are divisible by 128.
16485 bool UseScalable;
16486 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16487 return false;
16488
16489 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16490
16491 auto *FVTy = cast<FixedVectorType>(VTy);
16492
16493 // A pointer vector can not be the return type of the ldN intrinsics. Need to
16494 // load integer vectors first and then convert to pointer vectors.
16495 Type *EltTy = FVTy->getElementType();
16496 if (EltTy->isPointerTy())
16497 FVTy =
16498 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
16499
16500 // If we're going to generate more than one load, reset the sub-vector type
16501 // to something legal.
16502 FVTy = FixedVectorType::get(FVTy->getElementType(),
16503 FVTy->getNumElements() / NumLoads);
16504
16505 auto *LDVTy =
16506 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
16507
16508 IRBuilder<> Builder(LI);
16509
16510 // The base address of the load.
16511 Value *BaseAddr = LI->getPointerOperand();
16512
16513 Type *PtrTy = LI->getPointerOperandType();
16514 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
16515 LDVTy->getElementCount());
16516
16517 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
16518 UseScalable, LDVTy, PtrTy);
16519
16520 // Holds sub-vectors extracted from the load intrinsic return values. The
16521 // sub-vectors are associated with the shufflevector instructions they will
16522 // replace.
16523  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
16524
16525 Value *PTrue = nullptr;
16526 if (UseScalable) {
16527 std::optional<unsigned> PgPattern =
16528 getSVEPredPatternFromNumElements(FVTy->getNumElements());
16529 if (Subtarget->getMinSVEVectorSizeInBits() ==
16530 Subtarget->getMaxSVEVectorSizeInBits() &&
16531 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
16532 PgPattern = AArch64SVEPredPattern::all;
16533
16534 auto *PTruePat =
16535 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
16536 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16537 {PTruePat});
16538 }
16539
16540 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
16541
16542 // If we're generating more than one load, compute the base address of
16543 // subsequent loads as an offset from the previous.
16544 if (LoadCount > 0)
16545 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
16546 FVTy->getNumElements() * Factor);
16547
16548 CallInst *LdN;
16549 if (UseScalable)
16550 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
16551 else
16552 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16553
16554 // Extract and store the sub-vectors returned by the load intrinsic.
16555 for (unsigned i = 0; i < Shuffles.size(); i++) {
16556 ShuffleVectorInst *SVI = Shuffles[i];
16557 unsigned Index = Indices[i];
16558
16559 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
16560
16561 if (UseScalable)
16562 SubVec = Builder.CreateExtractVector(
16563 FVTy, SubVec,
16564 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
16565
16566 // Convert the integer vector to pointer vector if the element is pointer.
16567 if (EltTy->isPointerTy())
16568 SubVec = Builder.CreateIntToPtr(
16569            SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
16570                                         FVTy->getNumElements()));
16571
16572 SubVecs[SVI].push_back(SubVec);
16573 }
16574 }
16575
16576 // Replace uses of the shufflevector instructions with the sub-vectors
16577 // returned by the load intrinsic. If a shufflevector instruction is
16578 // associated with more than one sub-vector, those sub-vectors will be
16579 // concatenated into a single wide vector.
16580 for (ShuffleVectorInst *SVI : Shuffles) {
16581 auto &SubVec = SubVecs[SVI];
16582 auto *WideVec =
16583 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
16584 SVI->replaceAllUsesWith(WideVec);
16585 }
16586
16587 return true;
16588}
16589
16590template <typename Iter>
16591bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
16592 int MaxLookupDist = 20;
16593 unsigned IdxWidth = DL.getIndexSizeInBits(0);
16594 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
16595 const Value *PtrA1 =
16596 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
16597
16598 while (++It != End) {
16599 if (It->isDebugOrPseudoInst())
16600 continue;
16601 if (MaxLookupDist-- == 0)
16602 break;
16603 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
16604 const Value *PtrB1 =
16605 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
16606 DL, OffsetB);
16607 if (PtrA1 == PtrB1 &&
16608 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
16609 .abs() == 16)
16610 return true;
16611 }
16612 }
16613
16614 return false;
16615}
16616
16617/// Lower an interleaved store into a stN intrinsic.
16618///
16619/// E.g. Lower an interleaved store (Factor = 3):
16620/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
16621/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
16622/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16623///
16624/// Into:
16625/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
16626/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
16627/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
16628/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16629///
16630/// Note that the new shufflevectors will be removed and we'll only generate one
16631/// st3 instruction in CodeGen.
16632///
16633/// Example for a more general valid mask (Factor 3). Lower:
16634/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
16635/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
16636/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16637///
16638/// Into:
16639/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
16640/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
16641/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
16642/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16643bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
16644                                                  ShuffleVectorInst *SVI,
16645 unsigned Factor) const {
16646
16647 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16648 "Invalid interleave factor");
16649
16650 auto *VecTy = cast<FixedVectorType>(SVI->getType());
16651 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
16652
16653 unsigned LaneLen = VecTy->getNumElements() / Factor;
16654 Type *EltTy = VecTy->getElementType();
16655 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
16656
16657 const DataLayout &DL = SI->getDataLayout();
16658 bool UseScalable;
16659
16660 // Skip if we do not have NEON and skip illegal vector types. We can
16661 // "legalize" wide vector types into multiple interleaved accesses as long as
16662 // the vector types are divisible by 128.
16663 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
16664 return false;
16665
16666 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
16667
16668 Value *Op0 = SVI->getOperand(0);
16669 Value *Op1 = SVI->getOperand(1);
16670 IRBuilder<> Builder(SI);
16671
16672 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
16673 // vectors to integer vectors.
16674 if (EltTy->isPointerTy()) {
16675 Type *IntTy = DL.getIntPtrType(EltTy);
16676 unsigned NumOpElts =
16677 cast<FixedVectorType>(Op0->getType())->getNumElements();
16678
16679 // Convert to the corresponding integer vector.
16680 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
16681 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
16682 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
16683
16684 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
16685 }
16686
16687 // If we're going to generate more than one store, reset the lane length
16688 // and sub-vector type to something legal.
16689 LaneLen /= NumStores;
16690 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
16691
16692 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
16693 : SubVecTy;
16694
16695 // The base address of the store.
16696 Value *BaseAddr = SI->getPointerOperand();
16697
16698 auto Mask = SVI->getShuffleMask();
16699
16700 // Bail out if none of the indices is in range.
16701 // If the mask is `poison`, `Mask` may be a vector of -1s.
16702 // If all of them are `poison`, an out-of-bounds read would happen later.
16703 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16704 return false;
16705 }
16706 // A 64bit st2 which does not start at element 0 will involve adding extra
16707 // ext elements making the st2 unprofitable, and if there is a nearby store
16708 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
16709 // zip;ldp pair which has higher throughput.
16710 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
16711 (Mask[0] != 0 ||
16712 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
16713 DL) ||
16714 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
16715 BaseAddr, DL)))
16716 return false;
16717
16718 Type *PtrTy = SI->getPointerOperandType();
16719 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
16720 STVTy->getElementCount());
16721
16722 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16723 UseScalable, STVTy, PtrTy);
16724
16725 Value *PTrue = nullptr;
16726 if (UseScalable) {
16727 std::optional<unsigned> PgPattern =
16728 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
16729 if (Subtarget->getMinSVEVectorSizeInBits() ==
16730 Subtarget->getMaxSVEVectorSizeInBits() &&
16731 Subtarget->getMinSVEVectorSizeInBits() ==
16732 DL.getTypeSizeInBits(SubVecTy))
16733 PgPattern = AArch64SVEPredPattern::all;
16734
16735 auto *PTruePat =
16736 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
16737 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16738 {PTruePat});
16739 }
16740
16741 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
16742
16743 SmallVector<Value *, 6> Ops;
16744
16745 // Split the shufflevector operands into sub vectors for the new stN call.
16746 for (unsigned i = 0; i < Factor; i++) {
16747 Value *Shuffle;
16748 unsigned IdxI = StoreCount * LaneLen * Factor + i;
16749 if (Mask[IdxI] >= 0) {
16750 Shuffle = Builder.CreateShuffleVector(
16751 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
16752 } else {
16753 unsigned StartMask = 0;
16754 for (unsigned j = 1; j < LaneLen; j++) {
16755 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
16756 if (Mask[IdxJ] >= 0) {
16757 StartMask = Mask[IdxJ] - j;
16758 break;
16759 }
16760 }
16761 // Note: Filling undef gaps with random elements is ok, since
16762 // those elements were being written anyway (with undefs).
16763 // In the case of all undefs we're defaulting to using elems from 0
16764 // Note: StartMask cannot be negative, it's checked in
16765 // isReInterleaveMask
16766 Shuffle = Builder.CreateShuffleVector(
16767 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
16768 }
16769
16770 if (UseScalable)
16771 Shuffle = Builder.CreateInsertVector(
16772 STVTy, UndefValue::get(STVTy), Shuffle,
16773 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
16774
16775 Ops.push_back(Shuffle);
16776 }
16777
16778 if (UseScalable)
16779 Ops.push_back(PTrue);
16780
16781 // If we are generating more than one store, compute the base address of
16782 // subsequent stores as an offset from the previous.
16783 if (StoreCount > 0)
16784 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
16785 BaseAddr, LaneLen * Factor);
16786
16787 Ops.push_back(BaseAddr);
16788 Builder.CreateCall(StNFunc, Ops);
16789 }
16790 return true;
16791}
16792
16793bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
16794 IntrinsicInst *DI, LoadInst *LI) const {
16795 // Only deinterleave2 supported at present.
16796 if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
16797 return false;
16798
16799 // Only a factor of 2 supported at present.
16800 const unsigned Factor = 2;
16801
16802 VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
16803 const DataLayout &DL = DI->getDataLayout();
16804 bool UseScalable;
16805 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16806 return false;
16807
16808 // TODO: Add support for using SVE instructions with fixed types later, using
16809 // the code from lowerInterleavedLoad to obtain the correct container type.
16810 if (UseScalable && !VTy->isScalableTy())
16811 return false;
16812
16813 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16814
16815 VectorType *LdTy =
16816 VectorType::get(VTy->getElementType(),
16817 VTy->getElementCount().divideCoefficientBy(NumLoads));
16818
16819 Type *PtrTy = LI->getPointerOperandType();
16820 Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor,
16821 UseScalable, LdTy, PtrTy);
16822
16823 IRBuilder<> Builder(LI);
16824
16825 Value *Pred = nullptr;
16826 if (UseScalable)
16827 Pred =
16828 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
16829
16830 Value *BaseAddr = LI->getPointerOperand();
16831 Value *Result;
16832 if (NumLoads > 1) {
16833 Value *Left = PoisonValue::get(VTy);
16834 Value *Right = PoisonValue::get(VTy);
16835
16836 for (unsigned I = 0; I < NumLoads; ++I) {
16837 Value *Offset = Builder.getInt64(I * Factor);
16838
16839 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
16840 Value *LdN = nullptr;
16841 if (UseScalable)
16842 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
16843 else
16844 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
16845
16846 Value *Idx =
16847 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
16848 Left = Builder.CreateInsertVector(
16849 VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
16850 Right = Builder.CreateInsertVector(
16851 VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
16852 }
16853
16854 Result = PoisonValue::get(DI->getType());
16855 Result = Builder.CreateInsertValue(Result, Left, 0);
16856 Result = Builder.CreateInsertValue(Result, Right, 1);
16857 } else {
16858 if (UseScalable)
16859 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
16860 else
16861 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16862 }
16863
16864 DI->replaceAllUsesWith(Result);
16865 return true;
16866}
16867
16868bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
16869 IntrinsicInst *II, StoreInst *SI) const {
16870 // Only interleave2 supported at present.
16871 if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
16872 return false;
16873
16874 // Only a factor of 2 supported at present.
16875 const unsigned Factor = 2;
16876
16877 VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
16878 const DataLayout &DL = II->getDataLayout();
16879 bool UseScalable;
16880 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16881 return false;
16882
16883 // TODO: Add support for using SVE instructions with fixed types later, using
16884 // the code from lowerInterleavedStore to obtain the correct container type.
16885 if (UseScalable && !VTy->isScalableTy())
16886 return false;
16887
16888 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
16889
16890 VectorType *StTy =
16891 VectorType::get(VTy->getElementType(),
16892 VTy->getElementCount().divideCoefficientBy(NumStores));
16893
16894 Type *PtrTy = SI->getPointerOperandType();
16895 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16896 UseScalable, StTy, PtrTy);
16897
16898 IRBuilder<> Builder(SI);
16899
16900 Value *BaseAddr = SI->getPointerOperand();
16901 Value *Pred = nullptr;
16902
16903 if (UseScalable)
16904 Pred =
16905 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
16906
16907 Value *L = II->getOperand(0);
16908 Value *R = II->getOperand(1);
16909
16910 for (unsigned I = 0; I < NumStores; ++I) {
16911 Value *Address = BaseAddr;
16912 if (NumStores > 1) {
16913 Value *Offset = Builder.getInt64(I * Factor);
16914 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
16915
16916 Value *Idx =
16917 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
16918 L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
16919 R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
16920 }
16921
16922 if (UseScalable)
16923 Builder.CreateCall(StNFunc, {L, R, Pred, Address});
16924 else
16925 Builder.CreateCall(StNFunc, {L, R, Address});
16926 }
16927
16928 return true;
16929}
16930
16931EVT AArch64TargetLowering::getOptimalMemOpType(
16932 const MemOp &Op, const AttributeList &FuncAttributes) const {
16933 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16934 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16935 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16936 // Only use AdvSIMD to implement memsets of 32 bytes and above. It would have
16937 // taken one instruction to materialize the v2i64 zero and one store (with
16938 // restrictive addressing mode). Just do i64 stores.
16939 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16940 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16941 if (Op.isAligned(AlignCheck))
16942 return true;
16943 unsigned Fast;
16944 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16945 MachineMemOperand::MONone, &Fast) &&
16946 Fast;
16947 };
16948
16949 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16950 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
16951 return MVT::v16i8;
16952 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16953 return MVT::f128;
16954 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16955 return MVT::i64;
16956 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16957 return MVT::i32;
16958 return MVT::Other;
16959}
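// Worked examples (illustrative, not part of the original source), assuming
// NEON/FP are available and the function is not marked noimplicitfloat:
//   memset of 64 bytes, 16-byte aligned        -> MVT::v16i8 (SIMD stores)
//   memset of 16 bytes, 8-byte aligned (< 32)  -> MVT::i64   (plain integer stores)
// The GlobalISel variant below applies the same policy in terms of LLTs.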
16960
16961LLT AArch64TargetLowering::getOptimalMemOpLLT(
16962 const MemOp &Op, const AttributeList &FuncAttributes) const {
16963 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16964 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16965 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16966 // Only use AdvSIMD to implement memsets of 32 bytes and above. It would have
16967 // taken one instruction to materialize the v2i64 zero and one store (with
16968 // restrictive addressing mode). Just do i64 stores.
16969 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16970 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16971 if (Op.isAligned(AlignCheck))
16972 return true;
16973 unsigned Fast;
16974 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16975 MachineMemOperand::MONone, &Fast) &&
16976 Fast;
16977 };
16978
16979 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16980 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
16981 return LLT::fixed_vector(2, 64);
16982 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16983 return LLT::scalar(128);
16984 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16985 return LLT::scalar(64);
16986 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16987 return LLT::scalar(32);
16988 return LLT();
16989}
16990
16991// 12-bit optionally shifted immediates are legal for adds.
16992bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
16993 if (Immed == std::numeric_limits<int64_t>::min()) {
16994 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
16995 << ": avoid UB for INT64_MIN\n");
16996 return false;
16997 }
16998 // Same encoding for add/sub, just flip the sign.
16999 Immed = std::abs(Immed);
17000 bool IsLegal = ((Immed >> 12) == 0 ||
17001 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
17002 LLVM_DEBUG(dbgs() << "Is " << Immed
17003 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
17004 return IsLegal;
17005}
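// Worked examples (illustrative, not part of the original source):
//   Immed = 0xfff   -> legal   (fits the unshifted 12-bit field)
//   Immed = 0x1000  -> legal   (0x1 in the LSL #12 form)
//   Immed = 0x1001  -> illegal (would need both a shifted and an unshifted part)
//   Immed = -0xfff  -> legal   (same encoding, flipped to a sub)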
17006
17007bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
17008 // We will only emit addvl/inc* instructions for SVE2
17009 if (!Subtarget->hasSVE2())
17010 return false;
17011
17012 // addvl's immediates are in terms of the number of bytes in a register.
17013 // Since there are 16 in the base supported size (128bits), we need to
17014 // divide the immediate by that much to give us a useful immediate to
17015 // multiply by vscale. We can't have a remainder as a result of this.
17016 if (Imm % 16 == 0)
17017 return isInt<6>(Imm / 16);
17018
17019 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
17020 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
17021 // of addvl as a result, so only take h|w|d into account.
17022 // Dec[h|w|d] will cover subtractions.
17023 // Immediates are in the range [1,16], so we can't do a 2's complement check.
17024 // FIXME: Can we make use of other patterns to cover other immediates?
17025
17026 // inch|dech
17027 if (Imm % 8 == 0)
17028 return std::abs(Imm / 8) <= 16;
17029 // incw|decw
17030 if (Imm % 4 == 0)
17031 return std::abs(Imm / 4) <= 16;
17032 // incd|decd
17033 if (Imm % 2 == 0)
17034 return std::abs(Imm / 2) <= 16;
17035
17036 return false;
17037}
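// Worked examples (illustrative, not part of the original source), given SVE2:
//   Imm = 32 -> legal: 32/16 = 2 fits addvl's signed 6-bit immediate.
//   Imm = 40 -> legal: 40 % 8 == 0 and |40/8| = 5 <= 16, covered by inch/dech.
//   Imm = 6  -> legal: 6 % 2 == 0 and |6/2| = 3 <= 16, covered by incd/decd.
//   Imm = 7  -> illegal: odd, so no addvl or inc/dec pattern applies.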
17038
17039// Return false to prevent folding
17040// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
17041// if the folding leads to worse code.
17042bool AArch64TargetLowering::isMulAddWithConstProfitable(
17043 SDValue AddNode, SDValue ConstNode) const {
17044 // Let the DAGCombiner decide for vector types and large types.
17045 const EVT VT = AddNode.getValueType();
17046 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
17047 return true;
17048
17049 // It is worse if c1 is legal add immediate, while c1*c2 is not
17050 // and has to be composed by at least two instructions.
17051 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
17052 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
17053 const int64_t C1 = C1Node->getSExtValue();
17054 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
17055 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
17056 return true;
17057 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
17058 // Adapt to the width of a register.
17059 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
17060 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
17061 if (Insn.size() > 1)
17062 return false;
17063
17064 // Default to true and let the DAGCombiner decide.
17065 return true;
17066}
17067
17068// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
17069// immediates is the same as for an add or a sub.
17070bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
17071 return isLegalAddImmediate(Immed);
17072}
17073
17074/// isLegalAddressingMode - Return true if the addressing mode represented
17075/// by AM is legal for this target, for a load/store of the specified type.
17076bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
17077 const AddrMode &AMode, Type *Ty,
17078 unsigned AS, Instruction *I) const {
17079 // AArch64 has five basic addressing modes:
17080 // reg
17081 // reg + 9-bit signed offset
17082 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
17083 // reg1 + reg2
17084 // reg + SIZE_IN_BYTES * reg
17085
17086 // No global is ever allowed as a base.
17087 if (AMode.BaseGV)
17088 return false;
17089
17090 // No reg+reg+imm addressing.
17091 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
17092 return false;
17093
17094 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
17095 // `2*ScaledReg` into `BaseReg + ScaledReg`
17096 AddrMode AM = AMode;
17097 if (AM.Scale && !AM.HasBaseReg) {
17098 if (AM.Scale == 1) {
17099 AM.HasBaseReg = true;
17100 AM.Scale = 0;
17101 } else if (AM.Scale == 2) {
17102 AM.HasBaseReg = true;
17103 AM.Scale = 1;
17104 } else {
17105 return false;
17106 }
17107 }
17108
17109 // A base register is required in all addressing modes.
17110 if (!AM.HasBaseReg)
17111 return false;
17112
17113 if (Ty->isScalableTy()) {
17114 if (isa<ScalableVectorType>(Ty)) {
17115 // See if we have a foldable vscale-based offset, for vector types which
17116 // are either legal or smaller than the minimum; more work will be
17117 // required if we need to consider addressing for types which need
17118 // legalization by splitting.
17119 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
17120 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
17121 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
17122 isPowerOf2_64(VecNumBytes))
17123 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
17124
17125 uint64_t VecElemNumBytes =
17126 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
17127 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
17128 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
17129 }
17130
17131 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
17132 }
17133
17134 // No scalable offsets allowed for non-scalable types.
17135 if (AM.ScalableOffset)
17136 return false;
17137
17138 // check reg + imm case:
17139 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
17140 uint64_t NumBytes = 0;
17141 if (Ty->isSized()) {
17142 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
17143 NumBytes = NumBits / 8;
17144 if (!isPowerOf2_64(NumBits))
17145 NumBytes = 0;
17146 }
17147
17148 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
17149 AM.Scale);
17150}
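// Illustrative examples (not part of the original source) for an i64 access:
//   [x0]             -> legal (base register only)
//   [x0, #-256]      -> legal (9-bit signed offset)
//   [x0, #32760]     -> legal (12-bit unsigned offset scaled by 8)
//   [x0, x1, lsl #3] -> legal (base + scaled register)
//   base + register + immediate -> rejected by the explicit reg+reg+imm check.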
17151
17152// Check whether the 2 offsets belong to the same imm24 range and their high
17153// 12 bits are the same; if so, the common high part can be folded into the immediate of a single add.
17154int64_t
17155AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
17156 int64_t MaxOffset) const {
17157 int64_t HighPart = MinOffset & ~0xfffULL;
17158 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
17159 // Rebase the value to an integer multiple of imm12.
17160 return HighPart;
17161 }
17162
17163 return 0;
17164}
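// Worked example (illustrative, not part of the original source):
//   MinOffset = 0x1234 and MaxOffset = 0x1ff8 share the high part 0x1000
//   (both >> 12 give 1), and 0x1000 is a legal add immediate, so 0x1000 is
//   returned; the remaining low 12 bits stay in the individual accesses.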
17165
17166bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
17167 // Consider splitting large offset of struct or array.
17168 return true;
17169}
17170
17171bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
17172 const MachineFunction &MF, EVT VT) const {
17173 VT = VT.getScalarType();
17174
17175 if (!VT.isSimple())
17176 return false;
17177
17178 switch (VT.getSimpleVT().SimpleTy) {
17179 case MVT::f16:
17180 return Subtarget->hasFullFP16();
17181 case MVT::f32:
17182 case MVT::f64:
17183 return true;
17184 default:
17185 break;
17186 }
17187
17188 return false;
17189}
17190
17191bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
17192 Type *Ty) const {
17193 switch (Ty->getScalarType()->getTypeID()) {
17194 case Type::FloatTyID:
17195 case Type::DoubleTyID:
17196 return true;
17197 default:
17198 return false;
17199 }
17200}
17201
17202bool AArch64TargetLowering::generateFMAsInMachineCombiner(
17203 EVT VT, CodeGenOptLevel OptLevel) const {
17204 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
17206}
17207
17208const MCPhysReg *
17209AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
17210 // LR is a callee-save register, but we must treat it as clobbered by any call
17211 // site. Hence we include LR in the scratch registers, which are in turn added
17212 // as implicit-defs for stackmaps and patchpoints.
17213 static const MCPhysReg ScratchRegs[] = {
17214 AArch64::X16, AArch64::X17, AArch64::LR, 0
17215 };
17216 return ScratchRegs;
17217}
17218
17219ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
17220 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
17221 return RCRegs;
17222}
17223
17224bool
17225AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
17226 CombineLevel Level) const {
17227 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
17228 N->getOpcode() == ISD::SRL) &&
17229 "Expected shift op");
17230
17231 SDValue ShiftLHS = N->getOperand(0);
17232 EVT VT = N->getValueType(0);
17233
17234 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
17235 // combine it with shift 'N' to let it be lowered to UBFX except:
17236 // ((x >> C) & mask) << C.
17237 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
17238 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
17239 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
17240 if (isMask_64(TruncMask)) {
17241 SDValue AndLHS = ShiftLHS.getOperand(0);
17242 if (AndLHS.getOpcode() == ISD::SRL) {
17243 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
17244 if (N->getOpcode() == ISD::SHL)
17245 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
17246 return SRLC->getZExtValue() == SHLC->getZExtValue();
17247 return false;
17248 }
17249 }
17250 }
17251 }
17252 return true;
17253}
17254
17255bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
17256 const SDNode *N) const {
17257 assert(N->getOpcode() == ISD::XOR &&
17258 (N->getOperand(0).getOpcode() == ISD::SHL ||
17259 N->getOperand(0).getOpcode() == ISD::SRL) &&
17260 "Expected XOR(SHIFT) pattern");
17261
17262 // Only commute if the entire NOT mask is a hidden shifted mask.
17263 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
17264 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
17265 if (XorC && ShiftC) {
17266 unsigned MaskIdx, MaskLen;
17267 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
17268 unsigned ShiftAmt = ShiftC->getZExtValue();
17269 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17270 if (N->getOperand(0).getOpcode() == ISD::SHL)
17271 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
17272 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
17273 }
17274 }
17275
17276 return false;
17277}
17278
17279bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
17280 const SDNode *N, CombineLevel Level) const {
17281 assert(((N->getOpcode() == ISD::SHL &&
17282 N->getOperand(0).getOpcode() == ISD::SRL) ||
17283 (N->getOpcode() == ISD::SRL &&
17284 N->getOperand(0).getOpcode() == ISD::SHL)) &&
17285 "Expected shift-shift mask");
17286 // Don't allow multiuse shift folding with the same shift amount.
17287 if (!N->getOperand(0)->hasOneUse())
17288 return false;
17289
17290 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
17291 EVT VT = N->getValueType(0);
17292 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
17293 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
17294 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
17295 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
17296 }
17297
17298 return true;
17299}
17300
17301bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
17302 unsigned BinOpcode, EVT VT) const {
17303 return VT.isScalableVector() && isTypeLegal(VT);
17304}
17305
17306bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
17307 Type *Ty) const {
17308 assert(Ty->isIntegerTy());
17309
17310 unsigned BitSize = Ty->getPrimitiveSizeInBits();
17311 if (BitSize == 0)
17312 return false;
17313
17314 int64_t Val = Imm.getSExtValue();
17315 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
17316 return true;
17317
17318 if ((int64_t)Val < 0)
17319 Val = ~Val;
17320 if (BitSize == 32)
17321 Val &= (1LL << 32) - 1;
17322
17323 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
17324 // MOVZ is free so return true for one or fewer MOVK.
17325 return Shift < 3;
17326}
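// Worked examples (illustrative, not part of the original source):
//   Imm = 0xABCDEF         -> true: highest set bit is 23, so a MOVZ plus a
//                             single MOVK materializes it.
//   Imm = 0x12345678ABCDEF -> false: highest set bit is 52, which would need a
//                             MOVZ plus three MOVKs, so keep the constant load.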
17327
17328bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
17329 unsigned Index) const {
17330 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
17331 return false;
17332
17333 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
17334}
17335
17336/// Turn vector tests of the signbit in the form of:
17337/// xor (sra X, elt_size(X)-1), -1
17338/// into:
17339/// cmge X, X, #0
17340static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
17341 const AArch64Subtarget *Subtarget) {
17342 EVT VT = N->getValueType(0);
17343 if (!Subtarget->hasNEON() || !VT.isVector())
17344 return SDValue();
17345
17346 // There must be a shift right algebraic before the xor, and the xor must be a
17347 // 'not' operation.
17348 SDValue Shift = N->getOperand(0);
17349 SDValue Ones = N->getOperand(1);
17350 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
17351 !ISD::isConstantSplatVectorAllOnes(Ones.getNode()))
17352 return SDValue();
17353
17354 // The shift should be smearing the sign bit across each vector element.
17355 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
17356 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
17357 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
17358 return SDValue();
17359
17360 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
17361}
17362
17363// Given a vecreduce_add node, detect the below pattern and convert it to the
17364// node sequence with UABDL, [S|U]ABD and UADDLP.
17365//
17366// i32 vecreduce_add(
17367// v16i32 abs(
17368// v16i32 sub(
17369// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
17370// =================>
17371// i32 vecreduce_add(
17372// v4i32 UADDLP(
17373// v8i16 add(
17374// v8i16 zext(
17375// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
17376// v8i16 zext(
17377// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
17378static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
17379 SelectionDAG &DAG) {
17380 // Assumed i32 vecreduce_add
17381 if (N->getValueType(0) != MVT::i32)
17382 return SDValue();
17383
17384 SDValue VecReduceOp0 = N->getOperand(0);
17385 unsigned Opcode = VecReduceOp0.getOpcode();
17386 // Assumed v16i32 abs
17387 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
17388 return SDValue();
17389
17390 SDValue ABS = VecReduceOp0;
17391 // Assumed v16i32 sub
17392 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
17393 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
17394 return SDValue();
17395
17396 SDValue SUB = ABS->getOperand(0);
17397 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
17398 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
17399 // Assumed v16i32 type
17400 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
17401 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
17402 return SDValue();
17403
17404 // Assumed zext or sext
17405 bool IsZExt = false;
17406 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
17407 IsZExt = true;
17408 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
17409 IsZExt = false;
17410 } else
17411 return SDValue();
17412
17413 SDValue EXT0 = SUB->getOperand(0);
17414 SDValue EXT1 = SUB->getOperand(1);
17415 // Assumed zext's operand has v16i8 type
17416 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
17417 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
17418 return SDValue();
17419
17420 // Pattern is detected. Let's convert it to a sequence of nodes.
17421 SDLoc DL(N);
17422
17423 // First, create the node pattern of UABD/SABD.
17424 SDValue UABDHigh8Op0 =
17425 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
17426 DAG.getConstant(8, DL, MVT::i64));
17427 SDValue UABDHigh8Op1 =
17428 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
17429 DAG.getConstant(8, DL, MVT::i64));
17430 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
17431 UABDHigh8Op0, UABDHigh8Op1);
17432 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
17433
17434 // Second, create the node pattern of UABAL.
17435 SDValue UABDLo8Op0 =
17436 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
17437 DAG.getConstant(0, DL, MVT::i64));
17438 SDValue UABDLo8Op1 =
17439 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
17440 DAG.getConstant(0, DL, MVT::i64));
17441 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
17442 UABDLo8Op0, UABDLo8Op1);
17443 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
17444 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
17445
17446 // Third, create the node of UADDLP.
17447 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
17448
17449 // Fourth, create the node of VECREDUCE_ADD.
17450 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
17451}
17452
17453// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce:
17454//   vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
17455//   vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
17456// If we have vectors larger than v16i8 we extract v16i8 vectors, follow the
17457// same steps above to get DOT instructions, concatenate them and generate
17458// vecreduce.add(concat_vector(DOT, DOT2, ..)).
17459static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
17460 const AArch64Subtarget *ST) {
17461 if (!ST->hasDotProd())
17462 return performVecReduceAddCombineWithUADDLP(N, DAG);
17463
17464 SDValue Op0 = N->getOperand(0);
17465 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
17466 Op0.getValueType().getVectorElementType() != MVT::i32)
17467 return SDValue();
17468
17469 unsigned ExtOpcode = Op0.getOpcode();
17470 SDValue A = Op0;
17471 SDValue B;
17472 if (ExtOpcode == ISD::MUL) {
17473 A = Op0.getOperand(0);
17474 B = Op0.getOperand(1);
17475 if (A.getOpcode() != B.getOpcode() ||
17476 A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
17477 return SDValue();
17478 ExtOpcode = A.getOpcode();
17479 }
17480 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
17481 return SDValue();
17482
17483 EVT Op0VT = A.getOperand(0).getValueType();
17484 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
17485 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
17486 if (!IsValidElementCount || !IsValidSize)
17487 return SDValue();
17488
17489 SDLoc DL(Op0);
17490 // For non-mla reductions B can be set to 1. For MLA we take the operand of
17491 // the extend B.
17492 if (!B)
17493 B = DAG.getConstant(1, DL, Op0VT);
17494 else
17495 B = B.getOperand(0);
17496
17497 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
17498 unsigned NumOfVecReduce;
17499 EVT TargetType;
17500 if (IsMultipleOf16) {
17501 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
17502 TargetType = MVT::v4i32;
17503 } else {
17504 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
17505 TargetType = MVT::v2i32;
17506 }
17507 auto DotOpcode =
17508 (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
17509 // Handle the case where we need to generate only one Dot operation.
17510 if (NumOfVecReduce == 1) {
17511 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
17512 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
17513 A.getOperand(0), B);
17514 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17515 }
17516 // Generate Dot instructions that are multiple of 16.
17517 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
17518 SmallVector<SDValue, 4> SDotVec16;
17519 unsigned I = 0;
17520 for (; I < VecReduce16Num; I += 1) {
17521 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
17522 SDValue Op0 =
17523 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
17524 DAG.getConstant(I * 16, DL, MVT::i64));
17525 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
17526 DAG.getConstant(I * 16, DL, MVT::i64));
17527 SDValue Dot =
17528 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
17529 SDotVec16.push_back(Dot);
17530 }
17531 // Concatenate dot operations.
17532 EVT SDot16EVT =
17533 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
17534 SDValue ConcatSDot16 =
17535 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
17536 SDValue VecReduceAdd16 =
17537 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
17538 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
17539 if (VecReduce8Num == 0)
17540 return VecReduceAdd16;
17541
17542 // Generate the remainder Dot operation that is multiple of 8.
17543 SmallVector<SDValue, 4> SDotVec8;
17544 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
17545 SDValue Vec8Op0 =
17546 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
17547 DAG.getConstant(I * 16, DL, MVT::i64));
17548 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
17549 DAG.getConstant(I * 16, DL, MVT::i64));
17550 SDValue Dot =
17551 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
17552 SDValue VecReduceAdd8 =
17553 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17554 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
17555 VecReduceAdd8);
17556}
17557
17558// Given an (integer) vecreduce, we know the order of the inputs does not
17559// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
17560// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
17561// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
17562static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
17563 auto DetectAddExtract = [&](SDValue A) {
17564 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
17565 // UADDLP(x) if found.
17566 assert(A.getOpcode() == ISD::ADD);
17567 EVT VT = A.getValueType();
17568 SDValue Op0 = A.getOperand(0);
17569 SDValue Op1 = A.getOperand(1);
17570 if (Op0.getOpcode() != Op1.getOpcode() ||
17571 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
17572 Op0.getOpcode() != ISD::SIGN_EXTEND))
17573 return SDValue();
17574 SDValue Ext0 = Op0.getOperand(0);
17575 SDValue Ext1 = Op1.getOperand(0);
17576 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17577 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17578 Ext0.getOperand(0) != Ext1.getOperand(0))
17579 return SDValue();
17580 // Check that the type is twice the add types, and the extracts are from
17581 // upper/lower parts of the same source.
17582 if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
17583 VT.getVectorNumElements() * 2)
17584 return SDValue();
17585 if ((Ext0.getConstantOperandVal(1) != 0 ||
17586 Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
17587 (Ext1.getConstantOperandVal(1) != 0 ||
17588 Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
17589 return SDValue();
17590 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
17591 : AArch64ISD::SADDLP;
17592 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
17593 };
17594
17595 if (SDValue R = DetectAddExtract(A))
17596 return R;
17597
17598 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
17599 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
17600 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17601 A.getOperand(1));
17602 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
17603 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
17604 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17605 A.getOperand(0));
17606 return SDValue();
17607}
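// Illustrative example (not part of the original source), with x : v16i8:
//   UADDV(add(zext(extract_subvector(x, 0) : v8i8) to v8i16,
//             zext(extract_subvector(x, 8) : v8i8) to v8i16))
//   --> UADDV(UADDLP(x) : v8i16)
// which removes the two extends and the add.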
17608
17609// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
17610// UADDLV(concat), where the concat represents the 64-bit zext sources.
17611static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
17612 // Look for add(zext(64-bit source), zext(64-bit source)), returning
17613 // UADDLV(concat(zext, zext)) if found.
17614 assert(A.getOpcode() == ISD::ADD);
17615 EVT VT = A.getValueType();
17616 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17617 return SDValue();
17618 SDValue Op0 = A.getOperand(0);
17619 SDValue Op1 = A.getOperand(1);
17620 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
17621 return SDValue();
17622 SDValue Ext0 = Op0.getOperand(0);
17623 SDValue Ext1 = Op1.getOperand(0);
17624 EVT ExtVT0 = Ext0.getValueType();
17625 EVT ExtVT1 = Ext1.getValueType();
17626 // Check zext VTs are the same and 64-bit length.
17627 if (ExtVT0 != ExtVT1 ||
17628 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
17629 return SDValue();
17630 // Get VT for concat of zext sources.
17631 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
17632 SDValue Concat =
17633 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
17634
17635 switch (VT.getSimpleVT().SimpleTy) {
17636 case MVT::v2i64:
17637 case MVT::v4i32:
17638 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
17639 case MVT::v8i16: {
17640 SDValue Uaddlv =
17641 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
17642 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
17643 }
17644 default:
17645 llvm_unreachable("Unhandled vector type");
17646 }
17647}
17648
17649static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
17650 SDValue A = N->getOperand(0);
17651 if (A.getOpcode() == ISD::ADD) {
17652 if (SDValue R = performUADDVAddCombine(A, DAG))
17653 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
17654 else if (SDValue R = performUADDVZextCombine(A, DAG))
17655 return R;
17656 }
17657 return SDValue();
17658}
17659
17660static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
17661 TargetLowering::DAGCombinerInfo &DCI,
17662 const AArch64Subtarget *Subtarget) {
17663 if (DCI.isBeforeLegalizeOps())
17664 return SDValue();
17665
17666 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
17667}
17668
17669SDValue
17670AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
17671 SelectionDAG &DAG,
17672 SmallVectorImpl<SDNode *> &Created) const {
17673 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17674 if (isIntDivCheap(N->getValueType(0), Attr))
17675 return SDValue(N,0); // Lower SDIV as SDIV
17676
17677 EVT VT = N->getValueType(0);
17678
17679 // For scalable and fixed types, mark them as cheap so we can handle it much
17680 // later. This allows us to handle larger than legal types.
17681 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17682 return SDValue(N, 0);
17683
17684 // fold (sdiv X, pow2)
17685 if ((VT != MVT::i32 && VT != MVT::i64) ||
17686 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17687 return SDValue();
17688
17689 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
17690}
17691
17692SDValue
17693AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
17694 SelectionDAG &DAG,
17695 SmallVectorImpl<SDNode *> &Created) const {
17696 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17697 if (isIntDivCheap(N->getValueType(0), Attr))
17698 return SDValue(N, 0); // Lower SREM as SREM
17699
17700 EVT VT = N->getValueType(0);
17701
17702 // For scalable and fixed types, mark them as cheap so we can handle it much
17703 // later. This allows us to handle larger than legal types.
17704 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17705 return SDValue(N, 0);
17706
17707 // fold (srem X, pow2)
17708 if ((VT != MVT::i32 && VT != MVT::i64) ||
17709 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17710 return SDValue();
17711
17712 unsigned Lg2 = Divisor.countr_zero();
17713 if (Lg2 == 0)
17714 return SDValue();
17715
17716 SDLoc DL(N);
17717 SDValue N0 = N->getOperand(0);
17718 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
17719 SDValue Zero = DAG.getConstant(0, DL, VT);
17720 SDValue CCVal, CSNeg;
17721 if (Lg2 == 1) {
17722 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
17723 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17724 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
17725
17726 Created.push_back(Cmp.getNode());
17727 Created.push_back(And.getNode());
17728 } else {
17729 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
17730 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17731
17732 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
17733 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17734 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
17735 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
17736 Negs.getValue(1));
17737
17738 Created.push_back(Negs.getNode());
17739 Created.push_back(AndPos.getNode());
17740 Created.push_back(AndNeg.getNode());
17741 }
17742
17743 return CSNeg;
17744}
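// Worked example (illustrative, not part of the original source) for
// "X srem 8" (Lg2 = 3, so the general branch above is taken):
//   Negs   = SUBS(0, X)        ; also produces the condition flags
//   AndPos = AND(X, 7)
//   AndNeg = AND(Negs, 7)
//   result = CSNEG(AndPos, AndNeg, MI, flags of Negs)
// i.e. the remainder is computed on |X| and negated again when X is negative.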
17745
17746static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
17747 switch(getIntrinsicID(S.getNode())) {
17748 default:
17749 break;
17750 case Intrinsic::aarch64_sve_cntb:
17751 return 8;
17752 case Intrinsic::aarch64_sve_cnth:
17753 return 16;
17754 case Intrinsic::aarch64_sve_cntw:
17755 return 32;
17756 case Intrinsic::aarch64_sve_cntd:
17757 return 64;
17758 }
17759 return {};
17760}
17761
17762/// Calculates what the pre-extend type is, based on the extension
17763/// operation node provided by \p Extend.
17764///
17765/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
17766/// pre-extend type is pulled directly from the operand, while other extend
17767/// operations need a bit more inspection to get this information.
17768///
17769/// \param Extend The SDNode from the DAG that represents the extend operation
17770///
17771/// \returns The type representing the \p Extend source type, or \p MVT::Other
17772/// if no valid type can be determined
17773static EVT calculatePreExtendType(SDValue Extend) {
17774 switch (Extend.getOpcode()) {
17775 case ISD::SIGN_EXTEND:
17776 case ISD::ZERO_EXTEND:
17777 return Extend.getOperand(0).getValueType();
17778 case ISD::AssertSext:
17779 case ISD::AssertZext:
17780 case ISD::SIGN_EXTEND_INREG: {
17781 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
17782 if (!TypeNode)
17783 return MVT::Other;
17784 return TypeNode->getVT();
17785 }
17786 case ISD::AND: {
17787 ConstantSDNode *Constant =
17788 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
17789 if (!Constant)
17790 return MVT::Other;
17791
17792 uint32_t Mask = Constant->getZExtValue();
17793
17794 if (Mask == UCHAR_MAX)
17795 return MVT::i8;
17796 else if (Mask == USHRT_MAX)
17797 return MVT::i16;
17798 else if (Mask == UINT_MAX)
17799 return MVT::i32;
17800
17801 return MVT::Other;
17802 }
17803 default:
17804 return MVT::Other;
17805 }
17806}
17807
17808/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
17809/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
17810/// SExt/ZExt rather than the scalar SExt/ZExt
17811static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
17812 EVT VT = BV.getValueType();
17813 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
17814 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
17815 return SDValue();
17816
17817 // Use the first item in the buildvector/shuffle to get the size of the
17818 // extend, and make sure it looks valid.
17819 SDValue Extend = BV->getOperand(0);
17820 unsigned ExtendOpcode = Extend.getOpcode();
17821 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
17822 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
17823 ExtendOpcode == ISD::AssertSext;
17824 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
17825 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
17826 return SDValue();
17827 // Shuffle inputs are vector, limit to SIGN_EXTEND and ZERO_EXTEND to ensure
17828 // calculatePreExtendType will work without issue.
17829 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
17830 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
17831 return SDValue();
17832
17833 // Restrict valid pre-extend data type
17834 EVT PreExtendType = calculatePreExtendType(Extend);
17835 if (PreExtendType == MVT::Other ||
17836 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
17837 return SDValue();
17838
17839 // Make sure all other operands are equally extended
17840 for (SDValue Op : drop_begin(BV->ops())) {
17841 if (Op.isUndef())
17842 continue;
17843 unsigned Opc = Op.getOpcode();
17844 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
17845 Opc == ISD::AssertSext;
17846 if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
17847 return SDValue();
17848 }
17849
17850 SDValue NBV;
17851 SDLoc DL(BV);
17852 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
17853 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
17854 EVT PreExtendLegalType =
17855 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
17856 SmallVector<SDValue, 8> NewOps;
17857 for (SDValue Op : BV->ops())
17858 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
17859 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
17860 PreExtendLegalType));
17861 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
17862 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
17863 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
17864 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
17865 BV.getOperand(1).isUndef()
17866 ? DAG.getUNDEF(PreExtendVT)
17867 : BV.getOperand(1).getOperand(0),
17868 cast<ShuffleVectorSDNode>(BV)->getMask());
17869 }
17870 return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
17871}
17872
17873/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
17874/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
17876 // If the value type isn't a vector, none of the operands are going to be dups
17877 EVT VT = Mul->getValueType(0);
17878 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17879 return SDValue();
17880
17881 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
17882 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
17883
17884 // Neither operands have been changed, don't make any further changes
17885 if (!Op0 && !Op1)
17886 return SDValue();
17887
17888 SDLoc DL(Mul);
17889 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
17890 Op1 ? Op1 : Mul->getOperand(1));
17891}
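// Illustrative example (not part of the original source):
//   mul(v4i32 build_vector(sext i16 %a to i32, sext i16 %a to i32, ...),
//       v4i32 %b)
// is rewritten to
//   mul(sext(v4i16 build_vector(%a, %a, ...)) to v4i32, %b)
// so the extend happens at the vector level and later combines (e.g. the
// smull/umull patterns) can pick it up.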
17892
17893// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
17894// Same for other types with equivalent constants.
17895static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
17896 EVT VT = N->getValueType(0);
17897 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
17898 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
17899 return SDValue();
17900 if (N->getOperand(0).getOpcode() != ISD::AND ||
17901 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
17902 return SDValue();
17903
17904 SDValue And = N->getOperand(0);
17905 SDValue Srl = And.getOperand(0);
17906
17907 APInt V1, V2, V3;
17908 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
17909 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
17910 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
17911 return SDValue();
17912
17913 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
17914 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
17915 V3 != (HalfSize - 1))
17916 return SDValue();
17917
17918 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
17919 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
17920 VT.getVectorElementCount() * 2);
17921
17922 SDLoc DL(N);
17923 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
17924 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
17925 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
17926}
17927
17928// Transform vector add(zext i8 to i32, zext i8 to i32)
17929// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
17930// This allows extra uses of saddl/uaddl at the lower vector widths, and less
17931// extends.
17932static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
17933 EVT VT = N->getValueType(0);
17934 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
17935 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
17936 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
17937 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
17938 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
17939 N->getOperand(0).getOperand(0).getValueType() !=
17940 N->getOperand(1).getOperand(0).getValueType())
17941 return SDValue();
17942
17943 if (N->getOpcode() == ISD::MUL &&
17944 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
17945 return SDValue();
17946
17947 SDValue N0 = N->getOperand(0).getOperand(0);
17948 SDValue N1 = N->getOperand(1).getOperand(0);
17949 EVT InVT = N0.getValueType();
17950
17951 EVT S1 = InVT.getScalarType();
17952 EVT S2 = VT.getScalarType();
17953 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
17954 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
17955 SDLoc DL(N);
17956 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
17959 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
17960 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
17961 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
17962 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
17963 : (unsigned)ISD::SIGN_EXTEND,
17964 DL, VT, NewOp);
17965 }
17966 return SDValue();
17967}
17968
17969static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
17970 TargetLowering::DAGCombinerInfo &DCI,
17971 const AArch64Subtarget *Subtarget) {
17972
17973 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
17974 return Ext;
17975 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
17976 return Ext;
17977 if (SDValue Ext = performVectorExtCombine(N, DAG))
17978 return Ext;
17979
17980 if (DCI.isBeforeLegalizeOps())
17981 return SDValue();
17982
17983 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
17984 // and in MachineCombiner pass, add+mul will be combined into madd.
17985 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
17986 SDLoc DL(N);
17987 EVT VT = N->getValueType(0);
17988 SDValue N0 = N->getOperand(0);
17989 SDValue N1 = N->getOperand(1);
17990 SDValue MulOper;
17991 unsigned AddSubOpc;
17992
17993 auto IsAddSubWith1 = [&](SDValue V) -> bool {
17994 AddSubOpc = V->getOpcode();
17995 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
17996 SDValue Opnd = V->getOperand(1);
17997 MulOper = V->getOperand(0);
17998 if (AddSubOpc == ISD::SUB)
17999 std::swap(Opnd, MulOper);
18000 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
18001 return C->isOne();
18002 }
18003 return false;
18004 };
18005
18006 if (IsAddSubWith1(N0)) {
18007 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
18008 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
18009 }
18010
18011 if (IsAddSubWith1(N1)) {
18012 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
18013 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
18014 }
18015
18016 // The below optimizations require a constant RHS.
18017 if (!isa<ConstantSDNode>(N1))
18018 return SDValue();
18019
18020 ConstantSDNode *C = cast<ConstantSDNode>(N1);
18021 const APInt &ConstValue = C->getAPIntValue();
18022
18023 // Allow the scaling to be folded into the `cnt` instruction by preventing
18024 // the scaling to be obscured here. This makes it easier to pattern match.
18025 if (IsSVECntIntrinsic(N0) ||
18026 (N0->getOpcode() == ISD::TRUNCATE &&
18027 (IsSVECntIntrinsic(N0->getOperand(0)))))
18028 if (ConstValue.sge(1) && ConstValue.sle(16))
18029 return SDValue();
18030
18031 // Multiplication of a power of two plus/minus one can be done more
18032 // cheaply as shift+add/sub. For now, this is true unilaterally. If
18033 // future CPUs have a cheaper MADD instruction, this may need to be
18034 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
18035 // 64-bit is 5 cycles, so this is always a win.
18036 // More aggressively, some multiplications N0 * C can be lowered to
18037 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
18038 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
18039 // TODO: lower more cases.
18040
18041 // TrailingZeroes is used to test if the mul can be lowered to
18042 // shift+add+shift.
18043 unsigned TrailingZeroes = ConstValue.countr_zero();
18044 if (TrailingZeroes) {
18045 // Conservatively do not lower to shift+add+shift if the mul might be
18046 // folded into smul or umul.
18047 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
18048 isZeroExtended(N0, DAG)))
18049 return SDValue();
18050 // Conservatively do not lower to shift+add+shift if the mul might be
18051 // folded into madd or msub.
18052 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
18053 N->use_begin()->getOpcode() == ISD::SUB))
18054 return SDValue();
18055 }
18056 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
18057 // and shift+add+shift.
18058 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
18059 unsigned ShiftAmt;
18060
18061 auto Shl = [&](SDValue N0, unsigned N1) {
18062 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
18063 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
18064 };
18065 auto Add = [&](SDValue N0, SDValue N1) {
18066 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
18067 };
18068 auto Sub = [&](SDValue N0, SDValue N1) {
18069 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
18070 };
18071 auto Negate = [&](SDValue N) {
18072 SDValue Zero = DAG.getConstant(0, DL, VT);
18073 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
18074 };
18075
18076 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
18077 // C = 45 is equal to (1+4)*(1+8), we don't decompose it into (1+2)*(16-1) as
18078 // the (2^N - 1) can't be executed via a single instruction.
18079 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
18080 unsigned BitWidth = C.getBitWidth();
18081 for (unsigned i = 1; i < BitWidth / 2; i++) {
18082 APInt Rem;
18083 APInt X(BitWidth, (1 << i) + 1);
18084 APInt::sdivrem(C, X, N, Rem);
18085 APInt NVMinus1 = N - 1;
18086 if (Rem == 0 && NVMinus1.isPowerOf2()) {
18087 M = X;
18088 return true;
18089 }
18090 }
18091 return false;
18092 };
18093
18094 // Can the const C be decomposed into ((2^M + 1) * 2^N + 1), eg:
18095 // C = 11 is equal to (1+4)*2+1, we don't decompose it into (1+2)*4-1 as
18096 // the (2^N - 1) can't be executed via a single instruction.
18097 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
18098 APInt CVMinus1 = C - 1;
18099 if (CVMinus1.isNegative())
18100 return false;
18101 unsigned TrailingZeroes = CVMinus1.countr_zero();
18102 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
18103 if (SCVMinus1.isPowerOf2()) {
18104 unsigned BitWidth = SCVMinus1.getBitWidth();
18105 M = APInt(BitWidth, SCVMinus1.logBase2());
18106 N = APInt(BitWidth, TrailingZeroes);
18107 return true;
18108 }
18109 return false;
18110 };
18111
18112 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
18113 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
18114 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
18115 APInt CVMinus1 = C - 1;
18116 if (CVMinus1.isNegative())
18117 return false;
18118 unsigned TrailingZeroes = CVMinus1.countr_zero();
18119 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
18120 if (CVPlus1.isPowerOf2()) {
18121 unsigned BitWidth = CVPlus1.getBitWidth();
18122 M = APInt(BitWidth, CVPlus1.logBase2());
18123 N = APInt(BitWidth, TrailingZeroes);
18124 return true;
18125 }
18126 return false;
18127 };
18128
18129 if (ConstValue.isNonNegative()) {
18130 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
18131 // (mul x, 2^N - 1) => (sub (shl x, N), x)
18132 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
18133 // (mul x, (2^M + 1) * (2^N + 1))
18134 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
18135 // (mul x, (2^M + 1) * 2^N + 1)
18136 // => MV = (add (shl x, M), x); (add (shl MV, N), x)
18137 // (mul x, 1 - (1 - 2^M) * 2^N)
18138 // => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
18139 APInt SCVMinus1 = ShiftedConstValue - 1;
18140 APInt SCVPlus1 = ShiftedConstValue + 1;
18141 APInt CVPlus1 = ConstValue + 1;
18142 APInt CVM, CVN;
18143 if (SCVMinus1.isPowerOf2()) {
18144 ShiftAmt = SCVMinus1.logBase2();
18145 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
18146 } else if (CVPlus1.isPowerOf2()) {
18147 ShiftAmt = CVPlus1.logBase2();
18148 return Sub(Shl(N0, ShiftAmt), N0);
18149 } else if (SCVPlus1.isPowerOf2()) {
18150 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
18151 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
18152 }
18153 if (Subtarget->hasALULSLFast() &&
18154 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
18155 APInt CVMMinus1 = CVM - 1;
18156 APInt CVNMinus1 = CVN - 1;
18157 unsigned ShiftM1 = CVMMinus1.logBase2();
18158 unsigned ShiftN1 = CVNMinus1.logBase2();
18159 // ALULSLFast implies that shifts of up to 4 places are fast
18160 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
18161 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
18162 return Add(Shl(MVal, ShiftN1), MVal);
18163 }
18164 }
18165 if (Subtarget->hasALULSLFast() &&
18166 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
18167 unsigned ShiftM = CVM.getZExtValue();
18168 unsigned ShiftN = CVN.getZExtValue();
18169 // ALULSLFast implies that shifts of up to 4 places are fast
18170 if (ShiftM <= 4 && ShiftN <= 4) {
18171 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
18172 return Add(Shl(MVal, CVN.getZExtValue()), N0);
18173 }
18174 }
18175
18176 if (Subtarget->hasALULSLFast() &&
18177 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
18178 unsigned ShiftM = CVM.getZExtValue();
18179 unsigned ShiftN = CVN.getZExtValue();
18180 // ALULSLFast implies that shifts of up to 4 places are fast
18181 if (ShiftM <= 4 && ShiftN <= 4) {
18182 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
18183 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
18184 }
18185 }
18186 } else {
18187 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
18188 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
18189 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
18190 APInt SCVPlus1 = -ShiftedConstValue + 1;
18191 APInt CVNegPlus1 = -ConstValue + 1;
18192 APInt CVNegMinus1 = -ConstValue - 1;
18193 if (CVNegPlus1.isPowerOf2()) {
18194 ShiftAmt = CVNegPlus1.logBase2();
18195 return Sub(N0, Shl(N0, ShiftAmt));
18196 } else if (CVNegMinus1.isPowerOf2()) {
18197 ShiftAmt = CVNegMinus1.logBase2();
18198 return Negate(Add(Shl(N0, ShiftAmt), N0));
18199 } else if (SCVPlus1.isPowerOf2()) {
18200 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
18201 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
18202 }
18203 }
18204
18205 return SDValue();
18206}
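// Worked examples (illustrative, not part of the original source):
//   mul x, 6  : 6 = (2^1 + 1) * 2^1        -> shl(add(shl(x, 1), x), 1)
//   mul x, 7  : 7 = 2^3 - 1                -> sub(shl(x, 3), x)
//   mul x, -7 : -(2^3 - 1)                 -> sub(x, shl(x, 3))
//   mul x, 45 : 45 = (2^2 + 1) * (2^3 + 1) -> MV = add(shl(x, 2), x);
//                                             add(shl(MV, 3), MV)
//               (this last form is only used when ALULSLFast is set)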
18207
18208static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
18209 SelectionDAG &DAG) {
18210 // Take advantage of vector comparisons producing 0 or -1 in each lane to
18211 // optimize away operation when it's from a constant.
18212 //
18213 // The general transformation is:
18214 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
18215 // AND(VECTOR_CMP(x,y), constant2)
18216 // constant2 = UNARYOP(constant)
18217
18218 // Early exit if this isn't a vector operation, the operand of the
18219 // unary operation isn't a bitwise AND, or if the sizes of the operations
18220 // aren't the same.
18221 EVT VT = N->getValueType(0);
18222 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
18223 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
18224 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
18225 return SDValue();
18226
18227 // Now check that the other operand of the AND is a constant. We could
18228 // make the transformation for non-constant splats as well, but it's unclear
18229 // that would be a benefit as it would not eliminate any operations, just
18230 // perform one more step in scalar code before moving to the vector unit.
18231 if (BuildVectorSDNode *BV =
18232 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
18233 // Bail out if the vector isn't a constant.
18234 if (!BV->isConstant())
18235 return SDValue();
18236
18237 // Everything checks out. Build up the new and improved node.
18238 SDLoc DL(N);
18239 EVT IntVT = BV->getValueType(0);
18240 // Create a new constant of the appropriate type for the transformed
18241 // DAG.
18242 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
18243 // The AND node needs bitcasts to/from an integer vector type around it.
18244 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
18245 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
18246 N->getOperand(0)->getOperand(0), MaskConst);
18247 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
18248 return Res;
18249 }
18250
18251 return SDValue();
18252}
18253
18254static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
18255                                     const AArch64Subtarget *Subtarget) {
18256 // First try to optimize away the conversion when it's conditionally from
18257 // a constant. Vectors only.
18258  if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
18259    return Res;
18260
18261 EVT VT = N->getValueType(0);
18262 if (VT != MVT::f32 && VT != MVT::f64)
18263 return SDValue();
18264
18265 // Only optimize when the source and destination types have the same width.
18266 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
18267 return SDValue();
18268
18269 // If the result of an integer load is only used by an integer-to-float
18270  // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
18271 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
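  // Illustrative example (not from the source): for
  //   float f(const int *p) { return (float)*p; }
  // this combine usually allows "ldr s0, [x0]; scvtf s0, s0" instead of
  // "ldr w8, [x0]; scvtf s0, w8", keeping the value in the FP/SIMD register
  // file throughout.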
18272 SDValue N0 = N->getOperand(0);
18273 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
18274 N0.hasOneUse() &&
18275 // Do not change the width of a volatile load.
18276 !cast<LoadSDNode>(N0)->isVolatile()) {
18277 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
18278 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
18279 LN0->getPointerInfo(), LN0->getAlign(),
18280 LN0->getMemOperand()->getFlags());
18281
18282 // Make sure successors of the original load stay after it by updating them
18283 // to use the new Chain.
18284 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
18285
18286    unsigned Opcode =
18287        (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
18288    return DAG.getNode(Opcode, SDLoc(N), VT, Load);
18289 }
18290
18291 return SDValue();
18292}
18293
18294/// Fold a floating-point multiply by power of two into floating-point to
18295/// fixed-point conversion.
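/// Illustrative example (not from the source): (fptosi (fmul v, splat(4.0)))
/// on a v4f32 input can typically be selected as "fcvtzs v0.4s, v0.4s, #2",
/// i.e. a single convert with 2 fractional bits instead of an fmul followed
/// by a plain fcvtzs.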
18296static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
18297                                     TargetLowering::DAGCombinerInfo &DCI,
18298                                     const AArch64Subtarget *Subtarget) {
18299 if (!Subtarget->isNeonAvailable())
18300 return SDValue();
18301
18302 if (!N->getValueType(0).isSimple())
18303 return SDValue();
18304
18305 SDValue Op = N->getOperand(0);
18306 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
18307 return SDValue();
18308
18309 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
18310 return SDValue();
18311
18312 SDValue ConstVec = Op->getOperand(1);
18313 if (!isa<BuildVectorSDNode>(ConstVec))
18314 return SDValue();
18315
18316 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
18317 uint32_t FloatBits = FloatTy.getSizeInBits();
18318 if (FloatBits != 32 && FloatBits != 64 &&
18319 (FloatBits != 16 || !Subtarget->hasFullFP16()))
18320 return SDValue();
18321
18322 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
18323 uint32_t IntBits = IntTy.getSizeInBits();
18324 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
18325 return SDValue();
18326
18327 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
18328 if (IntBits > FloatBits)
18329 return SDValue();
18330
18331 BitVector UndefElements;
18332 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
18333 int32_t Bits = IntBits == 64 ? 64 : 32;
18334 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
18335 if (C == -1 || C == 0 || C > Bits)
18336 return SDValue();
18337
18338 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
18339 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
18340 return SDValue();
18341
18342 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
18343 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
18344 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
18345 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
18346 return SDValue();
18347 }
18348
18349 SDLoc DL(N);
18350 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
18351 N->getOpcode() == ISD::FP_TO_SINT_SAT);
18352 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
18353 : Intrinsic::aarch64_neon_vcvtfp2fxu;
18354 SDValue FixConv =
18355      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
18356                  DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
18357 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
18358 // We can handle smaller integers by generating an extra trunc.
18359 if (IntBits < FloatBits)
18360 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
18361
18362 return FixConv;
18363}
18364
18365static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18366                               const AArch64TargetLowering &TLI) {
18367 EVT VT = N->getValueType(0);
18368 SelectionDAG &DAG = DCI.DAG;
18369 SDLoc DL(N);
18370 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
18371
18372 if (!VT.isVector())
18373 return SDValue();
18374
18375 if (VT.isScalableVector() && !Subtarget.hasSVE2())
18376 return SDValue();
18377
18378 if (VT.isFixedLengthVector() &&
18379 (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT)))
18380 return SDValue();
18381
18382 SDValue N0 = N->getOperand(0);
18383 if (N0.getOpcode() != ISD::AND)
18384 return SDValue();
18385
18386 SDValue N1 = N->getOperand(1);
18387 if (N1.getOpcode() != ISD::AND)
18388 return SDValue();
18389
18390 // InstCombine does (not (neg a)) => (add a -1).
18391 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
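  // Illustrative sketch (not from the source): with m = (neg a), the operand
  // (add a, -1) is exactly ~m, so the pattern is or(and(m, b), and(~m, c)),
  // a bitwise select of b and c under mask m, which maps onto BSL/BSP.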
18392 // Loop over all combinations of AND operands.
18393 for (int i = 1; i >= 0; --i) {
18394 for (int j = 1; j >= 0; --j) {
18395 SDValue O0 = N0->getOperand(i);
18396 SDValue O1 = N1->getOperand(j);
18397 SDValue Sub, Add, SubSibling, AddSibling;
18398
18399 // Find a SUB and an ADD operand, one from each AND.
18400 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
18401 Sub = O0;
18402 Add = O1;
18403 SubSibling = N0->getOperand(1 - i);
18404 AddSibling = N1->getOperand(1 - j);
18405 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
18406 Add = O0;
18407 Sub = O1;
18408 AddSibling = N0->getOperand(1 - i);
18409 SubSibling = N1->getOperand(1 - j);
18410 } else
18411 continue;
18412
18413      if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
18414        continue;
18415
18416      // The all-ones constant is always the right-hand operand of the Add.
18417 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
18418 continue;
18419
18420 if (Sub.getOperand(1) != Add.getOperand(0))
18421 continue;
18422
18423 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
18424 }
18425 }
18426
18427 // (or (and a b) (and (not a) c)) => (bsl a b c)
18428 // We only have to look for constant vectors here since the general, variable
18429 // case can be handled in TableGen.
18430 unsigned Bits = VT.getScalarSizeInBits();
18431 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
18432 for (int i = 1; i >= 0; --i)
18433 for (int j = 1; j >= 0; --j) {
18434 APInt Val1, Val2;
18435
18436 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
18437          ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
18438          (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
18439 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
18440 N0->getOperand(1 - i), N1->getOperand(1 - j));
18441 }
18442 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
18443 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
18444 if (!BVN0 || !BVN1)
18445 continue;
18446
18447 bool FoundMatch = true;
18448 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
18449 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
18450 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
18451 if (!CN0 || !CN1 ||
18452 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
18453 FoundMatch = false;
18454 break;
18455 }
18456 }
18457 if (FoundMatch)
18458 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
18459 N0->getOperand(1 - i), N1->getOperand(1 - j));
18460 }
18461
18462 return SDValue();
18463}
18464
18465// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
18466// convert to csel(ccmp(.., cc0)), depending on cc1:
18467
18468// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18469// =>
18470// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
18471//
18472// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18473// =>
18474// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
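// Illustrative example (not from the source): for
//   int f(int a, int b) { return a == 0 && b > 5; }
// this combine typically produces
//   cmp  w0, #0
//   ccmp w1, #5, #4, eq   ; if a != 0, force flags so that "gt" is false
//   cset w0, gt
// instead of materializing two i1 values and AND-ing them.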
18475static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
18476  EVT VT = N->getValueType(0);
18477 SDValue CSel0 = N->getOperand(0);
18478 SDValue CSel1 = N->getOperand(1);
18479
18480 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
18481 CSel1.getOpcode() != AArch64ISD::CSEL)
18482 return SDValue();
18483
18484 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
18485 return SDValue();
18486
18487 if (!isNullConstant(CSel0.getOperand(0)) ||
18488 !isOneConstant(CSel0.getOperand(1)) ||
18489 !isNullConstant(CSel1.getOperand(0)) ||
18490 !isOneConstant(CSel1.getOperand(1)))
18491 return SDValue();
18492
18493 SDValue Cmp0 = CSel0.getOperand(3);
18494 SDValue Cmp1 = CSel1.getOperand(3);
18495  AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
18496  AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
18497  if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
18498 return SDValue();
18499 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
18500 Cmp0.getOpcode() == AArch64ISD::SUBS) {
18501 std::swap(Cmp0, Cmp1);
18502 std::swap(CC0, CC1);
18503 }
18504
18505 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
18506 return SDValue();
18507
18508 SDLoc DL(N);
18509 SDValue CCmp, Condition;
18510 unsigned NZCV;
18511
18512  if (N->getOpcode() == ISD::AND) {
18513    AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
18514    Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
18515    NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
18516  } else {
18517    AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
18518    Condition = DAG.getConstant(CC0, DL, MVT_CC);
18519    NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
18520  }
18521
18522 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
18523
18524 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
18525 if (Op1 && Op1->getAPIntValue().isNegative() &&
18526 Op1->getAPIntValue().sgt(-32)) {
18527    // CCMP accepts a constant in the range [0, 31]. If Op1 is a constant
18528    // in the range [-31, -1], we can select CCMN instead to avoid the
18529    // extra mov.
18530 SDValue AbsOp1 =
18531 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
18532 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
18533 NZCVOp, Condition, Cmp0);
18534 } else {
18535 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
18536 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
18537 }
18538 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
18539 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
18540 CCmp);
18541}
18542
18543static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18544                                const AArch64Subtarget *Subtarget,
18545 const AArch64TargetLowering &TLI) {
18546 SelectionDAG &DAG = DCI.DAG;
18547 EVT VT = N->getValueType(0);
18548
18549 if (SDValue R = performANDORCSELCombine(N, DAG))
18550 return R;
18551
18552 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18553 return SDValue();
18554
18555 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
18556 return Res;
18557
18558 return SDValue();
18559}
18560
18561static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
18562  if (!MemVT.getVectorElementType().isSimple())
18563 return false;
18564
18565 uint64_t MaskForTy = 0ull;
18566 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
18567 case MVT::i8:
18568 MaskForTy = 0xffull;
18569 break;
18570 case MVT::i16:
18571 MaskForTy = 0xffffull;
18572 break;
18573 case MVT::i32:
18574 MaskForTy = 0xffffffffull;
18575 break;
18576 default:
18577 return false;
18578 break;
18579 }
18580
18581 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
18582 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
18583 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
18584
18585 return false;
18586}
18587
18588static SDValue performReinterpretCastCombine(SDNode *N) {
18589  SDValue LeafOp = SDValue(N, 0);
18590 SDValue Op = N->getOperand(0);
18591 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
18592 LeafOp.getValueType() != Op.getValueType())
18593 Op = Op->getOperand(0);
18594 if (LeafOp.getValueType() == Op.getValueType())
18595 return Op;
18596 return SDValue();
18597}
18598
18599static SDValue performSVEAndCombine(SDNode *N,
18600                                    TargetLowering::DAGCombinerInfo &DCI) {
18601  SelectionDAG &DAG = DCI.DAG;
18602 SDValue Src = N->getOperand(0);
18603 unsigned Opc = Src->getOpcode();
18604
18605 // Zero/any extend of an unsigned unpack
18606 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
18607 SDValue UnpkOp = Src->getOperand(0);
18608 SDValue Dup = N->getOperand(1);
18609
18610 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
18611 return SDValue();
18612
18613 SDLoc DL(N);
18614 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
18615 if (!C)
18616 return SDValue();
18617
18618 uint64_t ExtVal = C->getZExtValue();
18619
18620 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
18621 return ((ExtVal == 0xFF && VT == MVT::i8) ||
18622 (ExtVal == 0xFFFF && VT == MVT::i16) ||
18623 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
18624 };
18625
18626 // If the mask is fully covered by the unpack, we don't need to push
18627 // a new AND onto the operand
18628 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
18629 if (MaskAndTypeMatch(EltTy))
18630 return Src;
18631
18632 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
18633 // to see if the mask is all-ones of size MemTy.
18634 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
18635 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
18636 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
18637 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
18638 if (MaskAndTypeMatch(EltTy))
18639 return Src;
18640 }
18641
18642    // Truncate to prevent a DUP with an overly wide constant
18643 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
18644
18645 // Otherwise, make sure we propagate the AND to the operand
18646 // of the unpack
18647 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
18648 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
18649
18650 SDValue And = DAG.getNode(ISD::AND, DL,
18651 UnpkOp->getValueType(0), UnpkOp, Dup);
18652
18653 return DAG.getNode(Opc, DL, N->getValueType(0), And);
18654 }
18655
18656 if (DCI.isBeforeLegalizeOps())
18657 return SDValue();
18658
18659 // If both sides of AND operations are i1 splat_vectors then
18660 // we can produce just i1 splat_vector as the result.
18661 if (isAllActivePredicate(DAG, N->getOperand(0)))
18662 return N->getOperand(1);
18663 if (isAllActivePredicate(DAG, N->getOperand(1)))
18664 return N->getOperand(0);
18665
18666  if (!EnableCombineMGatherIntrinsics)
18667    return SDValue();
18668
18669 SDValue Mask = N->getOperand(1);
18670
18671 if (!Src.hasOneUse())
18672 return SDValue();
18673
18674 EVT MemVT;
18675
18676 // SVE load instructions perform an implicit zero-extend, which makes them
18677 // perfect candidates for combining.
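  // Illustrative example (not from the source): an extending
  // "ld1b { z0.s }, p0/z, [x0]" already zero-extends each byte to a 32-bit
  // element, so a following "and z0.s, z0.s, #0xff" is redundant and the
  // mask can be dropped here.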
18678 switch (Opc) {
18682 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
18683 break;
18699 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
18700 break;
18701 default:
18702 return SDValue();
18703 }
18704
18705 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
18706 return Src;
18707
18708 return SDValue();
18709}
18710
18711// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
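// Illustrative example (not from the source): for two float compares combined
// with &&, emitConjunction typically yields a chain of the form
//   fcmp  s0, s1
//   fccmp s2, s3, #<nzcv>, <cond>
//   cset  w0, <cond'>
// (placeholders shown schematically), so only one flag-to-register
// materialization is needed instead of two csets plus an AND.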
18712static SDValue performANDSETCCCombine(SDNode *N,
18713                                      TargetLowering::DAGCombinerInfo &DCI) {
18714
18715 // This function performs an optimization on a specific pattern involving
18716 // an AND operation and SETCC (Set Condition Code) node.
18717
18718 SDValue SetCC = N->getOperand(0);
18719 EVT VT = N->getValueType(0);
18720 SelectionDAG &DAG = DCI.DAG;
18721
18722  // If the current node (N) is used by any SELECT instruction, return an
18723  // empty SDValue and skip the optimization, which could otherwise produce
18724  // incorrect results.
18725 for (auto U : N->uses())
18726 if (U->getOpcode() == ISD::SELECT)
18727 return SDValue();
18728
18729 // Check if the operand is a SETCC node with floating-point comparison
18730 if (SetCC.getOpcode() == ISD::SETCC &&
18731 SetCC.getOperand(0).getValueType() == MVT::f32) {
18732
18733 SDValue Cmp;
18734    AArch64CC::CondCode CC;
18735
18736 // Check if the DAG is after legalization and if we can emit the conjunction
18737 if (!DCI.isBeforeLegalize() &&
18738 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
18739
18740      AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(CC);
18741
18742 SDLoc DL(N);
18743 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
18744 DAG.getConstant(0, DL, VT),
18745 DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
18746 }
18747 }
18748 return SDValue();
18749}
18750
18751static SDValue performANDCombine(SDNode *N,
18752                                 TargetLowering::DAGCombinerInfo &DCI) {
18753  SelectionDAG &DAG = DCI.DAG;
18754 SDValue LHS = N->getOperand(0);
18755 SDValue RHS = N->getOperand(1);
18756 EVT VT = N->getValueType(0);
18757
18758 if (SDValue R = performANDORCSELCombine(N, DAG))
18759 return R;
18760
18761 if (SDValue R = performANDSETCCCombine(N,DCI))
18762 return R;
18763
18764 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18765 return SDValue();
18766
18767 if (VT.isScalableVector())
18768 return performSVEAndCombine(N, DCI);
18769
18770 // The combining code below works only for NEON vectors. In particular, it
18771 // does not work for SVE when dealing with vectors wider than 128 bits.
18772 if (!VT.is64BitVector() && !VT.is128BitVector())
18773 return SDValue();
18774
18775 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
18776 if (!BVN)
18777 return SDValue();
18778
18779 // AND does not accept an immediate, so check if we can use a BIC immediate
18780 // instruction instead. We do this here instead of using a (and x, (mvni imm))
18781 // pattern in isel, because some immediates may be lowered to the preferred
18782 // (and x, (movi imm)) form, even though an mvni representation also exists.
18783 APInt DefBits(VT.getSizeInBits(), 0);
18784 APInt UndefBits(VT.getSizeInBits(), 0);
18785 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
18786 SDValue NewOp;
18787
18788 // Any bits known to already be 0 need not be cleared again, which can help
18789 // reduce the size of the immediate to one supported by the instruction.
18790 KnownBits Known = DAG.computeKnownBits(LHS);
18791 APInt ZeroSplat(VT.getSizeInBits(), 0);
18792 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
18793 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
18794 << (Known.Zero.getBitWidth() * I);
18795
18796 DefBits = ~(DefBits | ZeroSplat);
18797 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18798 DefBits, &LHS)) ||
18799 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18800 DefBits, &LHS)))
18801 return NewOp;
18802
18803 UndefBits = ~(UndefBits | ZeroSplat);
18804 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18805 UndefBits, &LHS)) ||
18806 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18807 UndefBits, &LHS)))
18808 return NewOp;
18809 }
18810
18811 return SDValue();
18812}
18813
18814static SDValue performFADDCombine(SDNode *N,
18815                                  TargetLowering::DAGCombinerInfo &DCI) {
18816  SelectionDAG &DAG = DCI.DAG;
18817 SDValue LHS = N->getOperand(0);
18818 SDValue RHS = N->getOperand(1);
18819 EVT VT = N->getValueType(0);
18820 SDLoc DL(N);
18821
18822 if (!N->getFlags().hasAllowReassociation())
18823 return SDValue();
18824
18825  // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
18826 auto ReassocComplex = [&](SDValue A, SDValue B) {
18827 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
18828 return SDValue();
18829 unsigned Opc = A.getConstantOperandVal(0);
18830 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
18831 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
18832 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
18833 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
18834 return SDValue();
18835 SDValue VCMLA = DAG.getNode(
18836 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
18837 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
18838 A.getOperand(2), A.getOperand(3));
18839 VCMLA->setFlags(A->getFlags());
18840 return VCMLA;
18841 };
18842 if (SDValue R = ReassocComplex(LHS, RHS))
18843 return R;
18844 if (SDValue R = ReassocComplex(RHS, LHS))
18845 return R;
18846
18847 return SDValue();
18848}
18849
18850static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
18851 switch (Opcode) {
18852 case ISD::STRICT_FADD:
18853 case ISD::FADD:
18854 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
18855 case ISD::ADD:
18856 return VT == MVT::i64;
18857 default:
18858 return false;
18859 }
18860}
18861
18862static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
18863                        AArch64CC::CondCode Cond);
18864
18865static bool isPredicateCCSettingOp(SDValue N) {
18866  if ((N.getOpcode() == ISD::SETCC) ||
18867 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
18868 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
18869 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
18870 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
18871 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
18872 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
18873 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
18874 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
18875 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
18876 // get_active_lane_mask is lowered to a whilelo instruction.
18877 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
18878 return true;
18879
18880 return false;
18881}
18882
18883// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
18884// ... into: "ptrue p, all" + PTEST
18885static SDValue
18886performFirstTrueTestVectorCombine(SDNode *N,
18887                                  TargetLowering::DAGCombinerInfo &DCI,
18888                                  const AArch64Subtarget *Subtarget) {
18889 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18890 // Make sure PTEST can be legalised with illegal types.
18891 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18892 return SDValue();
18893
18894 SDValue N0 = N->getOperand(0);
18895 EVT VT = N0.getValueType();
18896
18897 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
18898 !isNullConstant(N->getOperand(1)))
18899 return SDValue();
18900
18901  // Restrict the DAG combine to cases where we're extracting from a
18902  // flag-setting operation.
18903 if (!isPredicateCCSettingOp(N0))
18904 return SDValue();
18905
18906 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
18907 SelectionDAG &DAG = DCI.DAG;
18908 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
18909 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
18910}
18911
18912// Materialize : Idx = (add (mul vscale, NumEls), -1)
18913// i1 = extract_vector_elt t37, Constant:i64<Idx>
18914// ... into: "ptrue p, all" + PTEST
18915static SDValue
18916performLastTrueTestVectorCombine(SDNode *N,
18917                                 TargetLowering::DAGCombinerInfo &DCI,
18918                                 const AArch64Subtarget *Subtarget) {
18919 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18920  // Make sure PTEST can be legalised with illegal types.
18921 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18922 return SDValue();
18923
18924 SDValue N0 = N->getOperand(0);
18925 EVT OpVT = N0.getValueType();
18926
18927 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
18928 return SDValue();
18929
18930 // Idx == (add (mul vscale, NumEls), -1)
18931 SDValue Idx = N->getOperand(1);
18932 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
18933 return SDValue();
18934
18935 SDValue VS = Idx.getOperand(0);
18936 if (VS.getOpcode() != ISD::VSCALE)
18937 return SDValue();
18938
18939 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
18940 if (VS.getConstantOperandVal(0) != NumEls)
18941 return SDValue();
18942
18943 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
18944 SelectionDAG &DAG = DCI.DAG;
18945 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
18946 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
18947}
18948
18949static SDValue
18950performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18951                               const AArch64Subtarget *Subtarget) {
18952 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18953 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
18954 return Res;
18955 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
18956 return Res;
18957
18958 SelectionDAG &DAG = DCI.DAG;
18959 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18960
18961 EVT VT = N->getValueType(0);
18962 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
18963 bool IsStrict = N0->isStrictFPOpcode();
18964
18965 // extract(dup x) -> x
18966 if (N0.getOpcode() == AArch64ISD::DUP)
18967 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
18968 : N0.getOperand(0);
18969
18970 // Rewrite for pairwise fadd pattern
18971 // (f32 (extract_vector_elt
18972 // (fadd (vXf32 Other)
18973 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
18974 // ->
18975 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
18976 // (extract_vector_elt (vXf32 Other) 1))
18977 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
18978 // we can only do this when it's used only by the extract_vector_elt.
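  // Illustrative example (not from the source): for a v2f32 input this shape
  // commonly comes from horizontal-add style code, and after the rewrite the
  // two lane extracts plus the fadd are usually selected as a single
  // "faddp s0, v0.2s".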
18979 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
18980 (!IsStrict || N0.hasOneUse())) {
18981 SDLoc DL(N0);
18982 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
18983 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
18984
18985 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
18986 SDValue Other = N00;
18987
18988 // And handle the commutative case.
18989 if (!Shuffle) {
18990 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
18991 Other = N01;
18992 }
18993
18994 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
18995 Other == Shuffle->getOperand(0)) {
18996 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18997 DAG.getConstant(0, DL, MVT::i64));
18998 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18999 DAG.getConstant(1, DL, MVT::i64));
19000 if (!IsStrict)
19001 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
19002
19003 // For strict_fadd we need uses of the final extract_vector to be replaced
19004 // with the strict_fadd, but we also need uses of the chain output of the
19005 // original strict_fadd to use the chain output of the new strict_fadd as
19006 // otherwise it may not be deleted.
19007 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
19008 {VT, MVT::Other},
19009 {N0->getOperand(0), Extract1, Extract2});
19010 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
19011 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
19012 return SDValue(N, 0);
19013 }
19014 }
19015
19016 return SDValue();
19017}
19018
19019static SDValue performConcatVectorsCombine(SDNode *N,
19020                                           TargetLowering::DAGCombinerInfo &DCI,
19021                                           SelectionDAG &DAG) {
19022 SDLoc dl(N);
19023 EVT VT = N->getValueType(0);
19024 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
19025 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
19026
19027 if (VT.isScalableVector())
19028 return SDValue();
19029
19030 // Optimize concat_vectors of truncated vectors, where the intermediate
19031 // type is illegal, to avoid said illegality, e.g.,
19032 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
19033 // (v2i16 (truncate (v2i64)))))
19034 // ->
19035 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
19036 // (v4i32 (bitcast (v2i64))),
19037 // <0, 2, 4, 6>)))
19038 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
19039 // on both input and result type, so we might generate worse code.
19040 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
19041 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
19042 N1Opc == ISD::TRUNCATE) {
19043 SDValue N00 = N0->getOperand(0);
19044 SDValue N10 = N1->getOperand(0);
19045 EVT N00VT = N00.getValueType();
19046
19047 if (N00VT == N10.getValueType() &&
19048 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
19049 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
19050 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
19051      SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
19052      for (size_t i = 0; i < Mask.size(); ++i)
19053 Mask[i] = i * 2;
19054 return DAG.getNode(ISD::TRUNCATE, dl, VT,
19055 DAG.getVectorShuffle(
19056 MidVT, dl,
19057 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
19058 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
19059 }
19060 }
19061
19062 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
19063 N->getOperand(0).getValueType() == MVT::v2i16 ||
19064 N->getOperand(0).getValueType() == MVT::v2i8) {
19065 EVT SrcVT = N->getOperand(0).getValueType();
19066 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
19067 // loads to prevent having to go through the v4i8 load legalization that
19068 // needs to extend each element into a larger type.
19069 if (N->getNumOperands() % 2 == 0 &&
19070 all_of(N->op_values(), [SrcVT](SDValue V) {
19071 if (V.getValueType() != SrcVT)
19072 return false;
19073 if (V.isUndef())
19074 return true;
19075 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
19076 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
19077 LD->getExtensionType() == ISD::NON_EXTLOAD;
19078 })) {
19079 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
19080 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
19081      SmallVector<SDValue> Ops;
19082
19083 for (unsigned i = 0; i < N->getNumOperands(); i++) {
19084 SDValue V = N->getOperand(i);
19085 if (V.isUndef())
19086 Ops.push_back(DAG.getUNDEF(FVT));
19087 else {
19088 LoadSDNode *LD = cast<LoadSDNode>(V);
19089 SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
19090 LD->getBasePtr(), LD->getMemOperand());
19091 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
19092 Ops.push_back(NewLoad);
19093 }
19094 }
19095 return DAG.getBitcast(N->getValueType(0),
19096 DAG.getBuildVector(NVT, dl, Ops));
19097 }
19098 }
19099
19100 // Canonicalise concat_vectors to replace concatenations of truncated nots
19101 // with nots of concatenated truncates. This in some cases allows for multiple
19102 // redundant negations to be eliminated.
19103 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
19104 // (v4i16 (truncate (not (v4i32)))))
19105 // ->
19106 // (not (concat_vectors (v4i16 (truncate (v4i32))),
19107 // (v4i16 (truncate (v4i32)))))
19108 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
19109 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
19110 N->isOnlyUserOf(N1.getNode())) {
19111 auto isBitwiseVectorNegate = [](SDValue V) {
19112 return V->getOpcode() == ISD::XOR &&
19113 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
19114 };
19115 SDValue N00 = N0->getOperand(0);
19116 SDValue N10 = N1->getOperand(0);
19117 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
19118 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
19119 return DAG.getNOT(
19120 dl,
19121 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
19122 DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
19123 N00->getOperand(0)),
19124 DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
19125 N10->getOperand(0))),
19126 VT);
19127 }
19128 }
19129
19130 // Wait till after everything is legalized to try this. That way we have
19131 // legal vector types and such.
19132 if (DCI.isBeforeLegalizeOps())
19133 return SDValue();
19134
19135 // Optimise concat_vectors of two identical binops with a 128-bit destination
19136  // size, combine into a binop of two concats of the source vectors. E.g.:
19137 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
19138 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
19139 DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() &&
19140 N1->hasOneUse()) {
19141 SDValue N00 = N0->getOperand(0);
19142 SDValue N01 = N0->getOperand(1);
19143 SDValue N10 = N1->getOperand(0);
19144 SDValue N11 = N1->getOperand(1);
19145
19146 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
19147 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N00, N10);
19148 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N01, N11);
19149 return DAG.getNode(N0Opc, dl, VT, Concat0, Concat1);
19150 }
19151 }
19152
19153 auto IsRSHRN = [](SDValue Shr) {
19154 if (Shr.getOpcode() != AArch64ISD::VLSHR)
19155 return false;
19156 SDValue Op = Shr.getOperand(0);
19157 EVT VT = Op.getValueType();
19158 unsigned ShtAmt = Shr.getConstantOperandVal(1);
19159 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
19160 return false;
19161
19162 APInt Imm;
19163 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
19164 Imm = APInt(VT.getScalarSizeInBits(),
19165 Op.getOperand(1).getConstantOperandVal(0)
19166 << Op.getOperand(1).getConstantOperandVal(1));
19167 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
19168 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
19169 Imm = APInt(VT.getScalarSizeInBits(),
19170 Op.getOperand(1).getConstantOperandVal(0));
19171 else
19172 return false;
19173
19174 if (Imm != 1ULL << (ShtAmt - 1))
19175 return false;
19176 return true;
19177 };
19178
19179 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
19180 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
19181 ((IsRSHRN(N1) &&
19182        N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
19183       N1.isUndef())) {
19184 SDValue X = N0.getOperand(0).getOperand(0);
19185 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
19186 : N1.getOperand(0).getOperand(0);
19187 EVT BVT =
19188 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
19189 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y);
19190 SDValue Add = DAG.getNode(
19191 ISD::ADD, dl, BVT, CC,
19192 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT));
19193 SDValue Shr =
19194 DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1));
19195 return Shr;
19196 }
19197
19198 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
19199 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
19200 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
19201 N0.getOperand(1) == N1.getOperand(1)) {
19202 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0),
19203 DAG.getUNDEF(N0.getValueType()));
19204 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1),
19205 DAG.getUNDEF(N0.getValueType()));
19206 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1);
19207 }
19208
19209 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
19210 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
19211 // canonicalise to that.
19212 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
19213 assert(VT.getScalarSizeInBits() == 64);
19214 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
19215 DAG.getConstant(0, dl, MVT::i64));
19216 }
19217
19218 // Canonicalise concat_vectors so that the right-hand vector has as few
19219 // bit-casts as possible before its real operation. The primary matching
19220 // destination for these operations will be the narrowing "2" instructions,
19221 // which depend on the operation being performed on this right-hand vector.
19222 // For example,
19223 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
19224 // becomes
19225 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
19226
19227 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
19228 return SDValue();
19229 SDValue RHS = N1->getOperand(0);
19230 MVT RHSTy = RHS.getValueType().getSimpleVT();
19231 // If the RHS is not a vector, this is not the pattern we're looking for.
19232 if (!RHSTy.isVector())
19233 return SDValue();
19234
19235 LLVM_DEBUG(
19236 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
19237
19238 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
19239 RHSTy.getVectorNumElements() * 2);
19240 return DAG.getNode(ISD::BITCAST, dl, VT,
19241 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
19242 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
19243 RHS));
19244}
19245
19246static SDValue
19247performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19248                               SelectionDAG &DAG) {
19249 if (DCI.isBeforeLegalizeOps())
19250 return SDValue();
19251
19252 EVT VT = N->getValueType(0);
19253 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
19254 return SDValue();
19255
19256 SDValue V = N->getOperand(0);
19257
19258 // NOTE: This combine exists in DAGCombiner, but that version's legality check
19259 // blocks this combine because the non-const case requires custom lowering.
19260 //
19261 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
19262 if (V.getOpcode() == ISD::SPLAT_VECTOR)
19263 if (isa<ConstantSDNode>(V.getOperand(0)))
19264 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
19265
19266 return SDValue();
19267}
19268
19269static SDValue
19270performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19271                              SelectionDAG &DAG) {
19272 SDLoc DL(N);
19273 SDValue Vec = N->getOperand(0);
19274 SDValue SubVec = N->getOperand(1);
19275 uint64_t IdxVal = N->getConstantOperandVal(2);
19276 EVT VecVT = Vec.getValueType();
19277 EVT SubVT = SubVec.getValueType();
19278
19279 // Only do this for legal fixed vector types.
19280 if (!VecVT.isFixedLengthVector() ||
19281 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
19282 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
19283 return SDValue();
19284
19285 // Ignore widening patterns.
19286 if (IdxVal == 0 && Vec.isUndef())
19287 return SDValue();
19288
19289 // Subvector must be half the width and an "aligned" insertion.
19290 unsigned NumSubElts = SubVT.getVectorNumElements();
19291 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
19292 (IdxVal != 0 && IdxVal != NumSubElts))
19293 return SDValue();
19294
19295 // Fold insert_subvector -> concat_vectors
19296 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
19297 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
19298 SDValue Lo, Hi;
19299 if (IdxVal == 0) {
19300 Lo = SubVec;
19301 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
19302 DAG.getVectorIdxConstant(NumSubElts, DL));
19303 } else {
19304 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
19305 DAG.getVectorIdxConstant(0, DL));
19306 Hi = SubVec;
19307 }
19308 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
19309}
19310
19311static SDValue tryCombineFixedPointConvert(SDNode *N,
19312                                           TargetLowering::DAGCombinerInfo &DCI,
19313                                           SelectionDAG &DAG) {
19314 // Wait until after everything is legalized to try this. That way we have
19315 // legal vector types and such.
19316 if (DCI.isBeforeLegalizeOps())
19317 return SDValue();
19318 // Transform a scalar conversion of a value from a lane extract into a
19319 // lane extract of a vector conversion. E.g., from foo1 to foo2:
19320 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
19321 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
19322 //
19323 // The second form interacts better with instruction selection and the
19324 // register allocator to avoid cross-class register copies that aren't
19325 // coalescable due to a lane reference.
19326
19327 // Check the operand and see if it originates from a lane extract.
19328 SDValue Op1 = N->getOperand(1);
19329  if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
19330    return SDValue();
19331
19332 // Yep, no additional predication needed. Perform the transform.
19333 SDValue IID = N->getOperand(0);
19334 SDValue Shift = N->getOperand(2);
19335 SDValue Vec = Op1.getOperand(0);
19336 SDValue Lane = Op1.getOperand(1);
19337 EVT ResTy = N->getValueType(0);
19338 EVT VecResTy;
19339 SDLoc DL(N);
19340
19341 // The vector width should be 128 bits by the time we get here, even
19342 // if it started as 64 bits (the extract_vector handling will have
19343 // done so). Bail if it is not.
19344 if (Vec.getValueSizeInBits() != 128)
19345 return SDValue();
19346
19347 if (Vec.getValueType() == MVT::v4i32)
19348 VecResTy = MVT::v4f32;
19349 else if (Vec.getValueType() == MVT::v2i64)
19350 VecResTy = MVT::v2f64;
19351 else
19352 return SDValue();
19353
19354 SDValue Convert =
19355 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
19356 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
19357}
19358
19359// AArch64 high-vector "long" operations are formed by performing the non-high
19360// version on an extract_subvector of each operand which gets the high half:
19361//
19362// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
19363//
19364// However, there are cases which don't have an extract_high explicitly, but
19365// have another operation that can be made compatible with one for free. For
19366// example:
19367//
19368// (dupv64 scalar) --> (extract_high (dup128 scalar))
19369//
19370// This routine does the actual conversion of such DUPs, once outer routines
19371// have determined that everything else is in order.
19372// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
19373// similarly here.
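// Illustrative example (not from the source):
//   (dup v4i16 w0)  -->  (extract_subvector (dup v8i16 w0), 4)
// so that a surrounding widening operation such as uaddl2/smull2 can treat
// the value as the high half of a 128-bit vector.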
19374static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
19375  MVT VT = N.getSimpleValueType();
19376 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
19377 N.getConstantOperandVal(1) == 0)
19378 N = N.getOperand(0);
19379
19380 switch (N.getOpcode()) {
19381  case AArch64ISD::DUP:
19382  case AArch64ISD::DUPLANE8:
19383  case AArch64ISD::DUPLANE16:
19384  case AArch64ISD::DUPLANE32:
19385  case AArch64ISD::DUPLANE64:
19386  case AArch64ISD::MOVI:
19387  case AArch64ISD::MOVIshift:
19388  case AArch64ISD::MOVIedit:
19389  case AArch64ISD::MOVImsl:
19390  case AArch64ISD::MVNIshift:
19391  case AArch64ISD::MVNImsl:
19392    break;
19393 default:
19394 // FMOV could be supported, but isn't very useful, as it would only occur
19395 // if you passed a bitcast' floating point immediate to an eligible long
19396 // integer op (addl, smull, ...).
19397 return SDValue();
19398 }
19399
19400 if (!VT.is64BitVector())
19401 return SDValue();
19402
19403 SDLoc DL(N);
19404 unsigned NumElems = VT.getVectorNumElements();
19405 if (N.getValueType().is64BitVector()) {
19406 MVT ElementTy = VT.getVectorElementType();
19407 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
19408 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
19409 }
19410
19411 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
19412 DAG.getConstant(NumElems, DL, MVT::i64));
19413}
19414
19415static bool isEssentiallyExtractHighSubvector(SDValue N) {
19416  if (N.getOpcode() == ISD::BITCAST)
19417 N = N.getOperand(0);
19418 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19419 return false;
19420 if (N.getOperand(0).getValueType().isScalableVector())
19421 return false;
19422 return N.getConstantOperandAPInt(1) ==
19423 N.getOperand(0).getValueType().getVectorNumElements() / 2;
19424}
19425
19426/// Helper structure to keep track of ISD::SET_CC operands.
19427struct GenericSetCCInfo {
19428  const SDValue *Opnd0;
19429  const SDValue *Opnd1;
19430  ISD::CondCode CC;
19431};
19432
19433/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
19434struct AArch64SetCCInfo {
19435  const SDValue *Cmp;
19436  AArch64CC::CondCode CC;
19437};
19438
19439/// Helper structure to keep track of SetCC information.
19440union SetCCInfo {
19441  GenericSetCCInfo Generic;
19442  AArch64SetCCInfo AArch64;
19443};
19444
19445/// Helper structure to be able to read SetCC information. If the IsAArch64
19446/// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
19447/// GenericSetCCInfo.
19448struct SetCCInfoAndKind {
19449  SetCCInfo Info;
19450  bool IsAArch64;
19451};
19452
19453/// Check whether or not \p Op is a SET_CC operation, either a generic or
19454/// an
19455/// AArch64 lowered one.
19456/// \p SetCCInfo is filled accordingly.
19457/// \post SetCCInfo is meaningful only when this function returns true.
19458/// \return True when Op is a kind of SET_CC operation.
19459static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
19460  // If this is a setcc, this is straightforward.
19461 if (Op.getOpcode() == ISD::SETCC) {
19462 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
19463 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
19464 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
19465 SetCCInfo.IsAArch64 = false;
19466 return true;
19467 }
19468 // Otherwise, check if this is a matching csel instruction.
19469 // In other words:
19470 // - csel 1, 0, cc
19471 // - csel 0, 1, !cc
19472 if (Op.getOpcode() != AArch64ISD::CSEL)
19473 return false;
19474 // Set the information about the operands.
19475 // TODO: we want the operands of the Cmp not the csel
19476 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
19477 SetCCInfo.IsAArch64 = true;
19478 SetCCInfo.Info.AArch64.CC =
19479 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19480
19481  // Check that the operands match the constraints:
19482 // (1) Both operands must be constants.
19483 // (2) One must be 1 and the other must be 0.
19484 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
19485 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
19486
19487 // Check (1).
19488 if (!TValue || !FValue)
19489 return false;
19490
19491 // Check (2).
19492 if (!TValue->isOne()) {
19493 // Update the comparison when we are interested in !cc.
19494 std::swap(TValue, FValue);
19495 SetCCInfo.Info.AArch64.CC =
19497 }
19498 return TValue->isOne() && FValue->isZero();
19499}
19500
19501// Returns true if Op is setcc or zext of setcc.
19502static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
19503 if (isSetCC(Op, Info))
19504 return true;
19505 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
19506 isSetCC(Op->getOperand(0), Info));
19507}
19508
19509// The folding we want to perform is:
19510// (add x, [zext] (setcc cc ...) )
19511// -->
19512// (csel x, (add x, 1), !cc ...)
19513//
19514// The latter will get matched to a CSINC instruction.
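// Illustrative example (not from the source): for
//   int f(int x, int a, int b) { return x + (a != b); }
// this typically selects to
//   cmp   w1, w2
//   csinc w0, w0, w0, eq   ; eq ? x : x + 1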
19515static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
19516  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
19517 SDValue LHS = Op->getOperand(0);
19518 SDValue RHS = Op->getOperand(1);
19519 SetCCInfoAndKind InfoAndKind;
19520
19521 // If both operands are a SET_CC, then we don't want to perform this
19522 // folding and create another csel as this results in more instructions
19523 // (and higher register usage).
19524 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
19525 isSetCCOrZExtSetCC(RHS, InfoAndKind))
19526 return SDValue();
19527
19528 // If neither operand is a SET_CC, give up.
19529 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
19530 std::swap(LHS, RHS);
19531 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
19532 return SDValue();
19533 }
19534
19535  // FIXME: This could be generalized to work for FP comparisons.
19536 EVT CmpVT = InfoAndKind.IsAArch64
19537 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
19538 : InfoAndKind.Info.Generic.Opnd0->getValueType();
19539 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
19540 return SDValue();
19541
19542 SDValue CCVal;
19543 SDValue Cmp;
19544 SDLoc dl(Op);
19545 if (InfoAndKind.IsAArch64) {
19546 CCVal = DAG.getConstant(
19547        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
19548        MVT::i32);
19549 Cmp = *InfoAndKind.Info.AArch64.Cmp;
19550 } else
19551 Cmp = getAArch64Cmp(
19552 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
19553 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
19554 dl);
19555
19556 EVT VT = Op->getValueType(0);
19557 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
19558 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
19559}
19560
19561// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
19562static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
19563  EVT VT = N->getValueType(0);
19564 // Only scalar integer and vector types.
19565 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
19566 return SDValue();
19567
19568 SDValue LHS = N->getOperand(0);
19569 SDValue RHS = N->getOperand(1);
19570 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19571 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
19572 return SDValue();
19573
19574 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
19575 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
19576 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
19577 return SDValue();
19578
19579 SDValue Op1 = LHS->getOperand(0);
19580 SDValue Op2 = RHS->getOperand(0);
19581 EVT OpVT1 = Op1.getValueType();
19582 EVT OpVT2 = Op2.getValueType();
19583 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
19584 Op2.getOpcode() != AArch64ISD::UADDV ||
19585 OpVT1.getVectorElementType() != VT)
19586 return SDValue();
19587
19588 SDValue Val1 = Op1.getOperand(0);
19589 SDValue Val2 = Op2.getOperand(0);
19590 EVT ValVT = Val1->getValueType(0);
19591 SDLoc DL(N);
19592 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
19593 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
19594 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
19595 DAG.getConstant(0, DL, MVT::i64));
19596}
19597
19598/// Perform the scalar expression combine in the form of:
19599/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
19600/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
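/// Illustrative check (not from the source): CSEL(c, 1, cc) + b evaluates to
/// (b + c) when cc holds and (b + 1) otherwise, which is exactly
/// CSINC(b + c, b, cc); the CSNEG(c, -1, cc) form works the same way because
/// -(-1) == 1.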
19601static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
19602  EVT VT = N->getValueType(0);
19603 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
19604 return SDValue();
19605
19606 SDValue LHS = N->getOperand(0);
19607 SDValue RHS = N->getOperand(1);
19608
19609  // Handle commutativity.
19610 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19611 LHS.getOpcode() != AArch64ISD::CSNEG) {
19612 std::swap(LHS, RHS);
19613 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19614 LHS.getOpcode() != AArch64ISD::CSNEG) {
19615 return SDValue();
19616 }
19617 }
19618
19619 if (!LHS.hasOneUse())
19620 return SDValue();
19621
19622 AArch64CC::CondCode AArch64CC =
19623 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
19624
19625  // The CSEL should include a constant one operand, and the CSNEG should
19626  // include a one or negative-one operand.
19627 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
19628 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
19629 if (!CTVal || !CFVal)
19630 return SDValue();
19631
19632 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
19633 (CTVal->isOne() || CFVal->isOne())) &&
19634 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
19635 (CTVal->isOne() || CFVal->isAllOnes())))
19636 return SDValue();
19637
19638 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
19639 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
19640 !CFVal->isOne()) {
19641 std::swap(CTVal, CFVal);
19642 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19643 }
19644
19645 SDLoc DL(N);
19646 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
19647 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
19648 !CFVal->isAllOnes()) {
19649 APInt C = -1 * CFVal->getAPIntValue();
19650 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
19651 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
19652 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19653 }
19654
19655  // It might be neutral for larger constants, as the immediate needs to be
19656  // materialized in a register.
19657 APInt ADDC = CTVal->getAPIntValue();
19658 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19659 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
19660 return SDValue();
19661
19662 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
19663 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
19664 "Unexpected constant value");
19665
19666 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
19667 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
19668 SDValue Cmp = LHS.getOperand(3);
19669
19670 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
19671}
19672
19673// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
19674static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
19675  EVT VT = N->getValueType(0);
19676 if (N->getOpcode() != ISD::ADD)
19677 return SDValue();
19678
19679 SDValue Dot = N->getOperand(0);
19680 SDValue A = N->getOperand(1);
19681  // Handle commutativity
19682 auto isZeroDot = [](SDValue Dot) {
19683 return (Dot.getOpcode() == AArch64ISD::UDOT ||
19684 Dot.getOpcode() == AArch64ISD::SDOT) &&
19685           isZerosVector(Dot.getOperand(0).getNode());
19686  };
19687 if (!isZeroDot(Dot))
19688 std::swap(Dot, A);
19689 if (!isZeroDot(Dot))
19690 return SDValue();
19691
19692 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
19693 Dot.getOperand(2));
19694}
19695
19696static bool isNegatedInteger(SDValue Op) {
19697  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
19698}
19699
19700static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
19701  SDLoc DL(Op);
19702 EVT VT = Op.getValueType();
19703 SDValue Zero = DAG.getConstant(0, DL, VT);
19704 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
19705}
19706
19707// Try to fold
19708//
19709// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
19710//
19711// The folding helps csel to be matched with csneg without generating
19712// redundant neg instruction, which includes negation of the csel expansion
19713// of abs node lowered by lowerABS.
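// Illustrative example (not from the source): negating an abs that was
// lowered to csel(x, neg x, ...) yields csel(neg x, x, ...), which the
// existing patterns can then match as a single CSNEG instead of a CSEL
// followed by an extra NEG.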
19714static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
19715  if (!isNegatedInteger(SDValue(N, 0)))
19716 return SDValue();
19717
19718 SDValue CSel = N->getOperand(1);
19719 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
19720 return SDValue();
19721
19722 SDValue N0 = CSel.getOperand(0);
19723 SDValue N1 = CSel.getOperand(1);
19724
19725  // If neither of them is a negation, the fold isn't worth it: it would
19726  // introduce two additional negations while removing only one.
19727 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
19728 return SDValue();
19729
19730 SDValue N0N = getNegatedInteger(N0, DAG);
19731 SDValue N1N = getNegatedInteger(N1, DAG);
19732
19733 SDLoc DL(N);
19734 EVT VT = CSel.getValueType();
19735 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
19736 CSel.getOperand(3));
19737}
19738
19739// The basic add/sub long vector instructions have variants with "2" on the end
19740// which act on the high-half of their inputs. They are normally matched by
19741// patterns like:
19742//
19743// (add (zeroext (extract_high LHS)),
19744// (zeroext (extract_high RHS)))
19745// -> uaddl2 vD, vN, vM
19746//
19747// However, if one of the extracts is something like a duplicate, this
19748// instruction can still be used profitably. This function puts the DAG into a
19749// more appropriate form for those patterns to trigger.
19750static SDValue performAddSubLongCombine(SDNode *N,
19751                                        TargetLowering::DAGCombinerInfo &DCI) {
19752  SelectionDAG &DAG = DCI.DAG;
19753 if (DCI.isBeforeLegalizeOps())
19754 return SDValue();
19755
19756 MVT VT = N->getSimpleValueType(0);
19757 if (!VT.is128BitVector()) {
19758 if (N->getOpcode() == ISD::ADD)
19759 return performSetccAddFolding(N, DAG);
19760 return SDValue();
19761 }
19762
19763 // Make sure both branches are extended in the same way.
19764 SDValue LHS = N->getOperand(0);
19765 SDValue RHS = N->getOperand(1);
19766 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
19767 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
19768 LHS.getOpcode() != RHS.getOpcode())
19769 return SDValue();
19770
19771 unsigned ExtType = LHS.getOpcode();
19772
19773 // It's not worth doing if at least one of the inputs isn't already an
19774 // extract, but we don't know which it'll be so we have to try both.
19775 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
19776 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
19777 if (!RHS.getNode())
19778 return SDValue();
19779
19780 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
19781 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
19782 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
19783 if (!LHS.getNode())
19784 return SDValue();
19785
19786 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
19787 }
19788
19789 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
19790}
19791
19792static bool isCMP(SDValue Op) {
19793 return Op.getOpcode() == AArch64ISD::SUBS &&
19794 !Op.getNode()->hasAnyUseOfValue(0);
19795}
19796
19797// (CSEL 1 0 CC Cond) => CC
19798// (CSEL 0 1 CC Cond) => !CC
19799static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
19800 if (Op.getOpcode() != AArch64ISD::CSEL)
19801 return std::nullopt;
19802 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19803 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
19804 return std::nullopt;
19805 SDValue OpLHS = Op.getOperand(0);
19806 SDValue OpRHS = Op.getOperand(1);
19807 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
19808 return CC;
19809 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
19810 return getInvertedCondCode(CC);
19811
19812 return std::nullopt;
19813}
19814
19815// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
19816// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
19817static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
19818 SDValue CmpOp = Op->getOperand(2);
19819 if (!isCMP(CmpOp))
19820 return SDValue();
19821
19822 if (IsAdd) {
19823 if (!isOneConstant(CmpOp.getOperand(1)))
19824 return SDValue();
19825 } else {
19826 if (!isNullConstant(CmpOp.getOperand(0)))
19827 return SDValue();
19828 }
19829
19830 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
19831 auto CC = getCSETCondCode(CsetOp);
19832 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
19833 return SDValue();
19834
19835 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
19836 Op->getOperand(0), Op->getOperand(1),
19837 CsetOp.getOperand(3));
19838}
19839
19840// (ADC x 0 cond) => (CINC x HS cond)
19841static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
19842  SDValue LHS = N->getOperand(0);
19843 SDValue RHS = N->getOperand(1);
19844 SDValue Cond = N->getOperand(2);
19845
19846 if (!isNullConstant(RHS))
19847 return SDValue();
19848
19849 EVT VT = N->getValueType(0);
19850 SDLoc DL(N);
19851
19852 // (CINC x cc cond) <=> (CSINC x x !cc cond)
19853 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
19854 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
19855}
19856
19857static SDValue performBuildVectorCombine(SDNode *N,
19858 TargetLowering::DAGCombinerInfo &DCI,
19859 SelectionDAG &DAG) {
19860 SDLoc DL(N);
19861 EVT VT = N->getValueType(0);
19862
19863 if (DCI.isAfterLegalizeDAG() &&
19864 (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
19865 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
19866 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
19867 if (Elt0->getOpcode() == ISD::FP_ROUND &&
19868 Elt1->getOpcode() == ISD::FP_ROUND &&
19869 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19870 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19871 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
19872 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19873 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19874 // Constant index.
19875 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19876 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19877 Elt0->getOperand(0)->getOperand(0) ==
19878 Elt1->getOperand(0)->getOperand(0) &&
19879 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
19880 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
19881 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
19882 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
19883 SDValue HighLanes;
19884 if (Elt2->getOpcode() == ISD::UNDEF &&
19885 Elt3->getOpcode() == ISD::UNDEF) {
19886 HighLanes = DAG.getUNDEF(MVT::v2f32);
19887 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
19888 Elt3->getOpcode() == ISD::FP_ROUND &&
19889 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
19890 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
19891 Elt2->getConstantOperandVal(1) ==
19892 Elt3->getConstantOperandVal(1) &&
19893 Elt2->getOperand(0)->getOpcode() ==
19894 ISD::EXTRACT_VECTOR_ELT &&
19895 Elt3->getOperand(0)->getOpcode() ==
19896 ISD::EXTRACT_VECTOR_ELT &&
19897 // Constant index.
19898 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
19899 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
19900 Elt2->getOperand(0)->getOperand(0) ==
19901 Elt3->getOperand(0)->getOperand(0) &&
19902 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
19903 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
19904 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
19905 HighLanes =
19906 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
19907 }
19908 if (HighLanes) {
19909 SDValue DoubleToSingleSticky =
19910 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
19911 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
19912 DoubleToSingleSticky, HighLanes);
19913 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
19914 Elt0->getOperand(1));
19915 }
19916 }
19917 }
19918 }
19919
19920 if (VT == MVT::v2f64) {
19921 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19922 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
19923 Elt1->getOpcode() == ISD::FP_EXTEND &&
19924 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19925 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19926 Elt0->getOperand(0)->getOperand(0) ==
19927 Elt1->getOperand(0)->getOperand(0) &&
19928 // Constant index.
19929 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19930 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19931 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
19932 Elt1->getOperand(0)->getConstantOperandVal(1) &&
19933 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19934 // ResultType's known minimum vector length.
19935 Elt0->getOperand(0)->getConstantOperandVal(1) %
19936 VT.getVectorMinNumElements() ==
19937 0) {
19938 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
19939 if (SrcVec.getValueType() == MVT::v4f16 ||
19940 SrcVec.getValueType() == MVT::v4bf16) {
19941 SDValue HalfToSingle =
19942 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
19943 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
19944 SDValue Extract = DAG.getNode(
19945 ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f32,
19946 HalfToSingle, SubvectorIdx);
19947 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
19948 }
19949 }
19950 }
19951
19952 // A build vector of two extracted elements is equivalent to an
19953 // extract subvector where the inner vector is any-extended to the
19954 // extract_vector_elt VT.
19955 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
19956 // (extract_elt_iXX_to_i32 vec Idx+1))
19957 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
19958
19959 // For now, only consider the v2i32 case, which arises as a result of
19960 // legalization.
19961 if (VT != MVT::v2i32)
19962 return SDValue();
19963
19964 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19965 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
19966 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19967 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19968 // Constant index.
19969 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19970 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19971 // Both EXTRACT_VECTOR_ELT from same vector...
19972 Elt0->getOperand(0) == Elt1->getOperand(0) &&
19973 // ... and contiguous. First element's index +1 == second element's index.
19974 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
19975 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19976 // ResultType's known minimum vector length.
19977 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
19978 SDValue VecToExtend = Elt0->getOperand(0);
19979 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
19980 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
19981 return SDValue();
19982
19983 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
19984
19985 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
19986 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
19987 SubvectorIdx);
19988 }
19989
19990 return SDValue();
19991}
19992
19993static SDValue performTruncateCombine(SDNode *N,
19994 SelectionDAG &DAG) {
19995 EVT VT = N->getValueType(0);
19996 SDValue N0 = N->getOperand(0);
19997 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
19998 N0.getOpcode() == AArch64ISD::DUP) {
19999 SDValue Op = N0.getOperand(0);
20000 if (VT.getScalarType() == MVT::i32 &&
20001 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
20002 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
20003 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
20004 }
20005
20006 return SDValue();
20007}
20008
20009// Check whether a node is an extend or shift operand.
20010static bool isExtendOrShiftOperand(SDValue N) {
20011 unsigned Opcode = N.getOpcode();
20012 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
20013 EVT SrcVT;
20014 if (Opcode == ISD::SIGN_EXTEND_INREG)
20015 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
20016 else
20017 SrcVT = N.getOperand(0).getValueType();
20018
20019 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
20020 } else if (Opcode == ISD::AND) {
20021 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
20022 if (!CSD)
20023 return false;
20024 uint64_t AndMask = CSD->getZExtValue();
20025 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
20026 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
20027 return isa<ConstantSDNode>(N.getOperand(1));
20028 }
20029
20030 return false;
20031}
20032
20033// (N - Y) + Z --> (Z - Y) + N
20034// when N is an extend or shift operand
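// For example (operands illustrative):
//   add(sub(sext(a), y), z) --> add(sub(z, y), sext(a))
// so that the extend (or shift) ends up as the operand of the final ADD, which
// supports extended/shifted register forms.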
20035static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
20036 SelectionDAG &DAG) {
20037 auto IsOneUseExtend = [](SDValue N) {
20038 return N.hasOneUse() && isExtendOrShiftOperand(N);
20039 };
20040
20041 // DAGCombiner will revert the combination when Z is constant, causing an
20042 // infinite loop, so don't enable the combination when Z is constant.
20043 // Likewise, if Z is a one-use shift of a constant we can't do the
20044 // optimization either, as it would also fall into an infinite loop.
20045 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
20046 return SDValue();
20047
20048 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
20049 return SDValue();
20050
20051 SDValue Shift = SUB.getOperand(0);
20052 if (!IsOneUseExtend(Shift))
20053 return SDValue();
20054
20055 SDLoc DL(N);
20056 EVT VT = N->getValueType(0);
20057
20058 SDValue Y = SUB.getOperand(1);
20059 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
20060 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
20061}
20062
20063static SDValue performAddCombineForShiftedOperands(SDNode *N,
20064 SelectionDAG &DAG) {
20065 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
20066 // commutative.
20067 if (N->getOpcode() != ISD::ADD)
20068 return SDValue();
20069
20070 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
20071 // shifted register is only available for i32 and i64.
20072 EVT VT = N->getValueType(0);
20073 if (VT != MVT::i32 && VT != MVT::i64)
20074 return SDValue();
20075
20076 SDLoc DL(N);
20077 SDValue LHS = N->getOperand(0);
20078 SDValue RHS = N->getOperand(1);
20079
20080 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
20081 return Val;
20082 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
20083 return Val;
20084
20085 uint64_t LHSImm = 0, RHSImm = 0;
20086 // If both operands are shifted by an immediate and the shift amount is not
20087 // greater than 4 for one operand, swap LHS and RHS to put the operand with
20088 // the smaller shift amount on the RHS.
20089 //
20090 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc.), ADD with
20091 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
20092 // with LSL (shift > 4). For other processors, this is a no-op for
20093 // performance or correctness.
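// For example (shift amounts illustrative):
//   add(shl(a, 2), shl(b, 7)) --> add(shl(b, 7), shl(a, 2))
// so that the cheap shift (<= 4) becomes the shifted-register operand folded
// into the ADD.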
20094 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
20095 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
20096 RHSImm > 4 && LHS.hasOneUse())
20097 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
20098
20099 return SDValue();
20100}
20101
20102// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
20103// This reassociates it back to allow the creation of more mls instructions.
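// For example: sub(x, add(mul(a, b), mul(c, d)))
//          --> sub(sub(x, mul(a, b)), mul(c, d))
// so that each multiply can be folded into an mls (multiply-subtract).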
20104static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
20105 if (N->getOpcode() != ISD::SUB)
20106 return SDValue();
20107
20108 SDValue Add = N->getOperand(1);
20109 SDValue X = N->getOperand(0);
20110 if (Add.getOpcode() != ISD::ADD)
20111 return SDValue();
20112
20113 if (!Add.hasOneUse())
20114 return SDValue();
20115 if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(X)))
20116 return SDValue();
20117
20118 SDValue M1 = Add.getOperand(0);
20119 SDValue M2 = Add.getOperand(1);
20120 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
20121 M1.getOpcode() != AArch64ISD::UMULL)
20122 return SDValue();
20123 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
20124 M2.getOpcode() != AArch64ISD::UMULL)
20125 return SDValue();
20126
20127 EVT VT = N->getValueType(0);
20128 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
20129 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
20130}
20131
20132// Combine into mla/mls.
20133// This works on the patterns of:
20134// add v1, (mul v2, v3)
20135// sub v1, (mul v2, v3)
20136// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
20137// It will transform the add/sub to a scalable version, so that we can
20138// make use of SVE's MLA/MLS that will be generated for that pattern
20139static SDValue
20140performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
20141 SelectionDAG &DAG = DCI.DAG;
20142 // Make sure that the types are legal
20143 if (!DCI.isAfterLegalizeDAG())
20144 return SDValue();
20145 // Before using SVE's features, check first if it's available.
20146 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
20147 return SDValue();
20148
20149 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
20150 return SDValue();
20151
20152 if (!N->getValueType(0).isFixedLengthVector())
20153 return SDValue();
20154
20155 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
20156 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
20157 return SDValue();
20158
20159 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
20160 return SDValue();
20161
20162 SDValue MulValue = Op1->getOperand(0);
20163 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
20164 return SDValue();
20165
20166 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
20167 return SDValue();
20168
20169 EVT ScalableVT = MulValue.getValueType();
20170 if (!ScalableVT.isScalableVector())
20171 return SDValue();
20172
20173 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
20174 SDValue NewValue =
20175 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
20176 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
20177 };
20178
20179 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
20180 return res;
20181 else if (N->getOpcode() == ISD::ADD)
20182 return performOpt(N->getOperand(1), N->getOperand(0));
20183
20184 return SDValue();
20185}
20186
20187// Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
20188// help, for example, to produce ssra from sshr+add.
20189static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
20190 EVT VT = N->getValueType(0);
20191 if (VT != MVT::i64 ||
20192 DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64))
20193 return SDValue();
20194 SDValue Op0 = N->getOperand(0);
20195 SDValue Op1 = N->getOperand(1);
20196
20197 // At least one of the operands should be an extract, and the other should be
20198 // something that is easy to convert to v1i64 type (in this case a load).
20199 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
20200 Op0.getOpcode() != ISD::LOAD)
20201 return SDValue();
20202 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
20203 Op1.getOpcode() != ISD::LOAD)
20204 return SDValue();
20205
20206 SDLoc DL(N);
20207 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20208 Op0.getOperand(0).getValueType() == MVT::v1i64) {
20209 Op0 = Op0.getOperand(0);
20210 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
20211 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20212 Op1.getOperand(0).getValueType() == MVT::v1i64) {
20213 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
20214 Op1 = Op1.getOperand(0);
20215 } else
20216 return SDValue();
20217
20218 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
20219 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
20220 DAG.getConstant(0, DL, MVT::i64));
20221}
20222
20223static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
20224 SDValue BV = peekThroughOneUseBitcasts(B);
20225 if (!BV->hasOneUse())
20226 return false;
20227 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
20228 if (!Ld || !Ld->isSimple())
20229 return false;
20230 Loads.push_back(Ld);
20231 return true;
20232 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
20233 BV.getOpcode() == ISD::CONCAT_VECTORS) {
20234 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
20235 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
20236 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
20237 return false;
20238 Loads.push_back(Ld);
20239 }
20240 return true;
20241 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
20242 // Try to find a tree of shuffles and concats from how IR shuffles of loads
20243 // are lowered. Note that this only comes up because we do not always visit
20244 // operands before uses. Once that is fixed this can be removed; in the
20245 // meantime this is fairly specific to the lowering we expect from IR.
20246 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
20247 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
20248 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
20249 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
20250 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
20251 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
20252 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
20253 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
20254 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
20255 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
20256 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
20257 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
20258 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
20259 B.getOperand(1).getNumOperands() != 4)
20260 return false;
20261 auto SV1 = cast<ShuffleVectorSDNode>(B);
20262 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
20263 int NumElts = B.getValueType().getVectorNumElements();
20264 int NumSubElts = NumElts / 4;
20265 for (int I = 0; I < NumSubElts; I++) {
20266 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
20267 if (SV1->getMaskElt(I) != I ||
20268 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
20269 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
20270 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
20271 return false;
20272 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
20273 if (SV2->getMaskElt(I) != I ||
20274 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
20275 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
20276 return false;
20277 }
20278 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
20279 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
20280 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
20281 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
20282 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
20283 !Ld2->isSimple() || !Ld3->isSimple())
20284 return false;
20285 Loads.push_back(Ld0);
20286 Loads.push_back(Ld1);
20287 Loads.push_back(Ld2);
20288 Loads.push_back(Ld3);
20289 return true;
20290 }
20291 return false;
20292}
20293
20294static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
20295 SelectionDAG &DAG,
20296 unsigned &NumSubLoads) {
20297 if (!Op0.hasOneUse() || !Op1.hasOneUse())
20298 return false;
20299
20300 SmallVector<LoadSDNode *> Loads0, Loads1;
20301 if (isLoadOrMultipleLoads(Op0, Loads0) &&
20302 isLoadOrMultipleLoads(Op1, Loads1)) {
20303 if (NumSubLoads && Loads0.size() != NumSubLoads)
20304 return false;
20305 NumSubLoads = Loads0.size();
20306 return Loads0.size() == Loads1.size() &&
20307 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
20308 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
20309 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
20310 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
20311 Size / 8, 1);
20312 });
20313 }
20314
20315 if (Op0.getOpcode() != Op1.getOpcode())
20316 return false;
20317
20318 switch (Op0.getOpcode()) {
20319 case ISD::ADD:
20320 case ISD::SUB:
20321 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
20322 DAG, NumSubLoads) &&
20323 areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
20324 DAG, NumSubLoads);
20325 case ISD::SIGN_EXTEND:
20326 case ISD::ANY_EXTEND:
20327 case ISD::ZERO_EXTEND:
20328 EVT XVT = Op0.getOperand(0).getValueType();
20329 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
20330 XVT.getScalarSizeInBits() != 32)
20331 return false;
20332 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
20333 DAG, NumSubLoads);
20334 }
20335 return false;
20336}
20337
20338// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
20339// into a single load of twice the size, from which we extract the bottom and
20340// top parts so that the shl can use a shll2 instruction. The two loads in that
20341// example can also be larger trees of instructions, which are identical except
20342// for the leaves, which are all loads offset from the LHS, including
20343// buildvectors of multiple loads. For example the RHS tree could be
20344// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))
20345// Whilst it can be common for the larger loads to replace LDP instructions
20346// (which doesn't gain anything on its own), the larger loads can help create
20347// more efficient code, and in buildvectors prevent the need for ld1 lane
20348// inserts, which can be slower than normal loads.
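// A minimal illustration (pointer offsets hypothetical):
//   add(zext(load p), shl(zext(load p+4), C))
// becomes one double-width load of p, whose low half feeds the plain extend
// and whose high half feeds the shifted extend, e.g. an ushll/ushll2 pair.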
20349static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
20350 EVT VT = N->getValueType(0);
20351 if (!VT.isFixedLengthVector() ||
20352 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
20353 VT.getScalarSizeInBits() != 64))
20354 return SDValue();
20355
20356 SDValue Other = N->getOperand(0);
20357 SDValue Shift = N->getOperand(1);
20358 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
20359 std::swap(Shift, Other);
20360 APInt ShiftAmt;
20361 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
20362 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
20363 return SDValue();
20364
20365 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
20366 !ISD::isExtOpcode(Other.getOpcode()) ||
20367 Shift.getOperand(0).getOperand(0).getValueType() !=
20368 Other.getOperand(0).getValueType() ||
20369 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
20370 return SDValue();
20371
20372 SDValue Op0 = Other.getOperand(0);
20373 SDValue Op1 = Shift.getOperand(0).getOperand(0);
20374
20375 unsigned NumSubLoads = 0;
20376 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
20377 return SDValue();
20378
20379 // Attempt to rule out some unprofitable cases using heuristics (some working
20380 // around suboptimal code generation), notably if the extend would not be able
20381 // to use ushll2 instructions as the types are not large enough. Otherwise zips
20382 // will need to be created, which can increase the instruction count.
20383 unsigned NumElts = Op0.getValueType().getVectorNumElements();
20384 unsigned NumSubElts = NumElts / NumSubLoads;
20385 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
20386 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
20387 Op0.getValueType().getSizeInBits() < 128 &&
20389 return SDValue();
20390
20391 // Recreate the tree with the new combined loads.
20392 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
20393 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
20394 EVT DVT =
20395 Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());
20396
20397 SmallVector<LoadSDNode *> Loads0, Loads1;
20398 if (isLoadOrMultipleLoads(Op0, Loads0) &&
20399 isLoadOrMultipleLoads(Op1, Loads1)) {
20400 EVT LoadVT = EVT::getVectorVT(
20401 *DAG.getContext(), Op0.getValueType().getScalarType(),
20402 Op0.getValueType().getVectorNumElements() / Loads0.size());
20403 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
20404
20405 SmallVector<SDValue> NewLoads;
20406 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
20407 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
20408 L0->getBasePtr(), L0->getPointerInfo(),
20409 L0->getOriginalAlign());
20410 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
20411 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
20412 NewLoads.push_back(Load);
20413 }
20414 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
20415 }
20416
20417 SmallVector<SDValue> Ops;
20418 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
20419 Ops.push_back(GenCombinedTree(O0, O1, DAG));
20420 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
20421 };
20422 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
20423
20424 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
20425 int Hi = NumSubElts, Lo = 0;
20426 for (unsigned i = 0; i < NumSubLoads; i++) {
20427 for (unsigned j = 0; j < NumSubElts; j++) {
20428 LowMask[i * NumSubElts + j] = Lo++;
20429 HighMask[i * NumSubElts + j] = Hi++;
20430 }
20431 Lo += NumSubElts;
20432 Hi += NumSubElts;
20433 }
20434 SDLoc DL(N);
20435 SDValue Ext0, Ext1;
20436 // Extract the top and bottom lanes, then extend the result. Or possibly
20437 // extend the result first and then extract the lanes, if the two operands
20438 // match, as that produces slightly smaller code.
20439 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
20440 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
20441 NewOp, DAG.getConstant(0, DL, MVT::i64));
20442 SDValue SubH =
20443 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
20444 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20445 SDValue Extr0 =
20446 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
20447 SDValue Extr1 =
20448 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
20449 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
20450 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
20451 } else {
20452 EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
20453 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
20454 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20455 DAG.getConstant(0, DL, MVT::i64));
20456 SDValue SubH =
20457 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20458 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20459 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
20460 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
20461 }
20462 SDValue NShift =
20463 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
20464 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
20465}
20466
20467static SDValue performAddSubCombine(SDNode *N,
20468 TargetLowering::DAGCombinerInfo &DCI) {
20469 // Try to change sum of two reductions.
20470 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
20471 return Val;
20472 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
20473 return Val;
20474 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
20475 return Val;
20476 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
20477 return Val;
20478 if (SDValue Val = performVectorExtCombine(N, DCI.DAG))
20479 return Val;
20480 if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
20481 return Val;
20482 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
20483 return Val;
20484 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
20485 return Val;
20486 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
20487 return Val;
20488
20489 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
20490 return Val;
20491
20492 return performAddSubLongCombine(N, DCI);
20493}
20494
20495// Massage DAGs which we can use the high-half "long" operations on into
20496// something isel will recognize better. E.g.
20497//
20498// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
20499// (aarch64_neon_umull (extract_high (v2i64 vec)))
20500// (extract_high (v2i64 (dup128 scalar)))))
20501//
20502static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
20503 TargetLowering::DAGCombinerInfo &DCI,
20504 SelectionDAG &DAG) {
20505 if (DCI.isBeforeLegalizeOps())
20506 return SDValue();
20507
20508 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
20509 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
20510 assert(LHS.getValueType().is64BitVector() &&
20511 RHS.getValueType().is64BitVector() &&
20512 "unexpected shape for long operation");
20513
20514 // Either node could be a DUP, but it's not worth doing both of them (you'd
20515 // just as well use the non-high version) so look for a corresponding extract
20516 // operation on the other "wing".
20517 if (isEssentiallyExtractHighSubvector(LHS)) {
20518 RHS = tryExtendDUPToExtractHigh(RHS, DAG);
20519 if (!RHS.getNode())
20520 return SDValue();
20521 } else if (isEssentiallyExtractHighSubvector(RHS)) {
20522 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
20523 if (!LHS.getNode())
20524 return SDValue();
20525 } else
20526 return SDValue();
20527
20528 if (IID == Intrinsic::not_intrinsic)
20529 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
20530
20531 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
20532 N->getOperand(0), LHS, RHS);
20533}
20534
20535static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
20536 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
20537 unsigned ElemBits = ElemTy.getSizeInBits();
20538
20539 int64_t ShiftAmount;
20540 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
20541 APInt SplatValue, SplatUndef;
20542 unsigned SplatBitSize;
20543 bool HasAnyUndefs;
20544 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
20545 HasAnyUndefs, ElemBits) ||
20546 SplatBitSize != ElemBits)
20547 return SDValue();
20548
20549 ShiftAmount = SplatValue.getSExtValue();
20550 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
20551 ShiftAmount = CVN->getSExtValue();
20552 } else
20553 return SDValue();
20554
20555 // If the shift amount is zero, remove the shift intrinsic.
20556 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
20557 return N->getOperand(1);
20558
20559 unsigned Opcode;
20560 bool IsRightShift;
20561 switch (IID) {
20562 default:
20563 llvm_unreachable("Unknown shift intrinsic");
20564 case Intrinsic::aarch64_neon_sqshl:
20565 Opcode = AArch64ISD::SQSHL_I;
20566 IsRightShift = false;
20567 break;
20568 case Intrinsic::aarch64_neon_uqshl:
20569 Opcode = AArch64ISD::UQSHL_I;
20570 IsRightShift = false;
20571 break;
20572 case Intrinsic::aarch64_neon_srshl:
20573 Opcode = AArch64ISD::SRSHR_I;
20574 IsRightShift = true;
20575 break;
20576 case Intrinsic::aarch64_neon_urshl:
20577 Opcode = AArch64ISD::URSHR_I;
20578 IsRightShift = true;
20579 break;
20580 case Intrinsic::aarch64_neon_sqshlu:
20581 Opcode = AArch64ISD::SQSHLU_I;
20582 IsRightShift = false;
20583 break;
20584 case Intrinsic::aarch64_neon_sshl:
20585 case Intrinsic::aarch64_neon_ushl:
20586 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
20587 // left shift for positive shift amounts. For negative shifts we can use a
20588 // VASHR/VLSHR as appropriate.
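// E.g. (ushl v, splat(2)) becomes (VSHL v, 2), while (ushl v, splat(-3))
// becomes (VLSHR v, 3) (shift amounts illustrative).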
20589 if (ShiftAmount < 0) {
20590 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
20591 : AArch64ISD::VLSHR;
20592 ShiftAmount = -ShiftAmount;
20593 } else
20594 Opcode = AArch64ISD::VSHL;
20595 IsRightShift = false;
20596 break;
20597 }
20598
20599 EVT VT = N->getValueType(0);
20600 SDValue Op = N->getOperand(1);
20601 SDLoc dl(N);
20602 if (VT == MVT::i64) {
20603 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
20604 VT = MVT::v1i64;
20605 }
20606
20607 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
20608 Op = DAG.getNode(Opcode, dl, VT, Op,
20609 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
20610 if (N->getValueType(0) == MVT::i64)
20611 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20612 DAG.getConstant(0, dl, MVT::i64));
20613 return Op;
20614 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
20615 Op = DAG.getNode(Opcode, dl, VT, Op,
20616 DAG.getConstant(ShiftAmount, dl, MVT::i32));
20617 if (N->getValueType(0) == MVT::i64)
20618 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20619 DAG.getConstant(0, dl, MVT::i64));
20620 return Op;
20621 }
20622
20623 return SDValue();
20624}
20625
20626// The CRC32[BH] instructions ignore the high bits of their data operand. Since
20627// the intrinsics must be legal and take an i32, this means there's almost
20628// certainly going to be a zext in the DAG which we can eliminate.
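// For example (registers illustrative):
//   crc32b(w0, and(w1, 0xff)) --> crc32b(w0, w1)
// because only the low byte of the data operand is consumed anyway.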
20629static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
20630 SDValue AndN = N->getOperand(2);
20631 if (AndN.getOpcode() != ISD::AND)
20632 return SDValue();
20633
20634 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
20635 if (!CMask || CMask->getZExtValue() != Mask)
20636 return SDValue();
20637
20638 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
20639 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
20640}
20641
20642static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
20643 SelectionDAG &DAG) {
20644 SDLoc dl(N);
20645 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
20646 DAG.getNode(Opc, dl,
20647 N->getOperand(1).getSimpleValueType(),
20648 N->getOperand(1)),
20649 DAG.getConstant(0, dl, MVT::i64));
20650}
20651
20652static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
20653 SDLoc DL(N);
20654 SDValue Op1 = N->getOperand(1);
20655 SDValue Op2 = N->getOperand(2);
20656 EVT ScalarTy = Op2.getValueType();
20657 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20658 ScalarTy = MVT::i32;
20659
20660 // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
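// E.g. index_vector(10, 3) becomes add(mul(step_vector(1), splat(3)), splat(10)),
// i.e. the sequence <10, 13, 16, ...> (values illustrative).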
20661 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
20662 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
20663 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
20664 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
20665 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
20666}
20667
20668static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
20669 SDLoc dl(N);
20670 SDValue Scalar = N->getOperand(3);
20671 EVT ScalarTy = Scalar.getValueType();
20672
20673 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20674 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
20675
20676 SDValue Passthru = N->getOperand(1);
20677 SDValue Pred = N->getOperand(2);
20678 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
20679 Pred, Scalar, Passthru);
20680}
20681
20682static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
20683 SDLoc dl(N);
20684 LLVMContext &Ctx = *DAG.getContext();
20685 EVT VT = N->getValueType(0);
20686
20687 assert(VT.isScalableVector() && "Expected a scalable vector.");
20688
20689 // Current lowering only supports the SVE-ACLE types.
20690 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
20691 return SDValue();
20692
20693 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
20694 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
20695 EVT ByteVT =
20696 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
20697
20698 // Convert everything to the domain of EXT (i.e. bytes).
20699 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
20700 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
20701 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
20702 DAG.getConstant(ElemSize, dl, MVT::i32));
20703
20704 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
20705 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
20706}
20707
20708static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
20709 TargetLowering::DAGCombinerInfo &DCI,
20710 SelectionDAG &DAG) {
20711 if (DCI.isBeforeLegalize())
20712 return SDValue();
20713
20714 SDValue Comparator = N->getOperand(3);
20715 if (Comparator.getOpcode() == AArch64ISD::DUP ||
20716 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
20717 unsigned IID = getIntrinsicID(N);
20718 EVT VT = N->getValueType(0);
20719 EVT CmpVT = N->getOperand(2).getValueType();
20720 SDValue Pred = N->getOperand(1);
20721 SDValue Imm;
20722 SDLoc DL(N);
20723
20724 switch (IID) {
20725 default:
20726 llvm_unreachable("Called with wrong intrinsic!");
20727 break;
20728
20729 // Signed comparisons
20730 case Intrinsic::aarch64_sve_cmpeq_wide:
20731 case Intrinsic::aarch64_sve_cmpne_wide:
20732 case Intrinsic::aarch64_sve_cmpge_wide:
20733 case Intrinsic::aarch64_sve_cmpgt_wide:
20734 case Intrinsic::aarch64_sve_cmplt_wide:
20735 case Intrinsic::aarch64_sve_cmple_wide: {
20736 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20737 int64_t ImmVal = CN->getSExtValue();
20738 if (ImmVal >= -16 && ImmVal <= 15)
20739 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20740 else
20741 return SDValue();
20742 }
20743 break;
20744 }
20745 // Unsigned comparisons
20746 case Intrinsic::aarch64_sve_cmphs_wide:
20747 case Intrinsic::aarch64_sve_cmphi_wide:
20748 case Intrinsic::aarch64_sve_cmplo_wide:
20749 case Intrinsic::aarch64_sve_cmpls_wide: {
20750 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20751 uint64_t ImmVal = CN->getZExtValue();
20752 if (ImmVal <= 127)
20753 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20754 else
20755 return SDValue();
20756 }
20757 break;
20758 }
20759 }
20760
20761 if (!Imm)
20762 return SDValue();
20763
20764 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
20765 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
20766 N->getOperand(2), Splat, DAG.getCondCode(CC));
20767 }
20768
20769 return SDValue();
20770}
20771
20772static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20773 AArch64CC::CondCode Cond) {
20774 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20775
20776 SDLoc DL(Op);
20777 assert(Op.getValueType().isScalableVector() &&
20778 TLI.isTypeLegal(Op.getValueType()) &&
20779 "Expected legal scalable vector type!");
20780 assert(Op.getValueType() == Pg.getValueType() &&
20781 "Expected same type for PTEST operands");
20782
20783 // Ensure target specific opcodes are using legal type.
20784 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
20785 SDValue TVal = DAG.getConstant(1, DL, OutVT);
20786 SDValue FVal = DAG.getConstant(0, DL, OutVT);
20787
20788 // Ensure operands have type nxv16i1.
20789 if (Op.getValueType() != MVT::nxv16i1) {
20792 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
20793 else
20794 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
20795 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
20796 }
20797
20798 // Set condition code (CC) flags.
20799 SDValue Test = DAG.getNode(
20800 Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
20801 DL, MVT::Other, Pg, Op);
20802
20803 // Convert CC to integer based on requested condition.
20804 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
20805 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
20806 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
20807 return DAG.getZExtOrTrunc(Res, DL, VT);
20808}
20809
20810static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
20811 SelectionDAG &DAG) {
20812 SDLoc DL(N);
20813
20814 SDValue Pred = N->getOperand(1);
20815 SDValue VecToReduce = N->getOperand(2);
20816
20817 // NOTE: The integer reduction's result type is not always linked to the
20818 // operand's element type so we construct it from the intrinsic's result type.
20819 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
20820 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20821
20822 // SVE reductions set the whole vector register with the first element
20823 // containing the reduction result, which we'll now extract.
20824 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20825 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20826 Zero);
20827}
20828
20829static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
20830 SelectionDAG &DAG) {
20831 SDLoc DL(N);
20832
20833 SDValue Pred = N->getOperand(1);
20834 SDValue VecToReduce = N->getOperand(2);
20835
20836 EVT ReduceVT = VecToReduce.getValueType();
20837 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20838
20839 // SVE reductions set the whole vector register with the first element
20840 // containing the reduction result, which we'll now extract.
20841 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20842 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20843 Zero);
20844}
20845
20846static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
20847 SelectionDAG &DAG) {
20848 SDLoc DL(N);
20849
20850 SDValue Pred = N->getOperand(1);
20851 SDValue InitVal = N->getOperand(2);
20852 SDValue VecToReduce = N->getOperand(3);
20853 EVT ReduceVT = VecToReduce.getValueType();
20854
20855 // Ordered reductions use the first lane of the result vector as the
20856 // reduction's initial value.
20857 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20858 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
20859 DAG.getUNDEF(ReduceVT), InitVal, Zero);
20860
20861 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
20862
20863 // SVE reductions set the whole vector register with the first element
20864 // containing the reduction result, which we'll now extract.
20865 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20866 Zero);
20867}
20868
20869// If a merged operation has no inactive lanes we can relax it to a predicated
20870// or unpredicated operation, which potentially allows better isel (perhaps
20871// using immediate forms) or relaxed register reuse requirements.
20872static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
20873 SelectionDAG &DAG, bool UnpredOp = false,
20874 bool SwapOperands = false) {
20875 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
20876 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
20877 SDValue Pg = N->getOperand(1);
20878 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
20879 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
20880
20881 // ISD way to specify an all active predicate.
20882 if (isAllActivePredicate(DAG, Pg)) {
20883 if (UnpredOp)
20884 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
20885
20886 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
20887 }
20888
20889 // FUTURE: SplatVector(true)
20890 return SDValue();
20891}
20892
20893static SDValue tryCombineWhileLo(SDNode *N,
20894 TargetLowering::DAGCombinerInfo &DCI,
20895 const AArch64Subtarget *Subtarget) {
20896 if (DCI.isBeforeLegalize())
20897 return SDValue();
20898
20899 if (!Subtarget->hasSVE2p1())
20900 return SDValue();
20901
20902 if (!N->hasNUsesOfValue(2, 0))
20903 return SDValue();
20904
20905 const uint64_t HalfSize = N->getValueType(0).getVectorMinNumElements() / 2;
20906 if (HalfSize < 2)
20907 return SDValue();
20908
20909 auto It = N->use_begin();
20910 SDNode *Lo = *It++;
20911 SDNode *Hi = *It;
20912
20913 if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
20914 Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR)
20915 return SDValue();
20916
20917 uint64_t OffLo = Lo->getConstantOperandVal(1);
20918 uint64_t OffHi = Hi->getConstantOperandVal(1);
20919
20920 if (OffLo > OffHi) {
20921 std::swap(Lo, Hi);
20922 std::swap(OffLo, OffHi);
20923 }
20924
20925 if (OffLo != 0 || OffHi != HalfSize)
20926 return SDValue();
20927
20928 EVT HalfVec = Lo->getValueType(0);
20929 if (HalfVec != Hi->getValueType(0) ||
20930 HalfVec.getVectorElementCount() != ElementCount::getScalable(HalfSize))
20931 return SDValue();
20932
20933 SelectionDAG &DAG = DCI.DAG;
20934 SDLoc DL(N);
20935 SDValue ID =
20936 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
20937 SDValue Idx = N->getOperand(1);
20938 SDValue TC = N->getOperand(2);
20939 if (Idx.getValueType() != MVT::i64) {
20940 Idx = DAG.getZExtOrTrunc(Idx, DL, MVT::i64);
20941 TC = DAG.getZExtOrTrunc(TC, DL, MVT::i64);
20942 }
20943 auto R =
20944 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
20945 {Lo->getValueType(0), Hi->getValueType(0)}, {ID, Idx, TC});
20946
20947 DCI.CombineTo(Lo, R.getValue(0));
20948 DCI.CombineTo(Hi, R.getValue(1));
20949
20950 return SDValue(N, 0);
20951}
20952
20953static SDValue performIntrinsicCombine(SDNode *N,
20954 TargetLowering::DAGCombinerInfo &DCI,
20955 const AArch64Subtarget *Subtarget) {
20956 SelectionDAG &DAG = DCI.DAG;
20957 unsigned IID = getIntrinsicID(N);
20958 switch (IID) {
20959 default:
20960 break;
20961 case Intrinsic::aarch64_neon_vcvtfxs2fp:
20962 case Intrinsic::aarch64_neon_vcvtfxu2fp:
20963 return tryCombineFixedPointConvert(N, DCI, DAG);
20964 case Intrinsic::aarch64_neon_saddv:
20965 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
20966 case Intrinsic::aarch64_neon_uaddv:
20967 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
20968 case Intrinsic::aarch64_neon_sminv:
20969 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
20970 case Intrinsic::aarch64_neon_uminv:
20971 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
20972 case Intrinsic::aarch64_neon_smaxv:
20973 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
20974 case Intrinsic::aarch64_neon_umaxv:
20975 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
20976 case Intrinsic::aarch64_neon_fmax:
20977 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
20978 N->getOperand(1), N->getOperand(2));
20979 case Intrinsic::aarch64_neon_fmin:
20980 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
20981 N->getOperand(1), N->getOperand(2));
20982 case Intrinsic::aarch64_neon_fmaxnm:
20983 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
20984 N->getOperand(1), N->getOperand(2));
20985 case Intrinsic::aarch64_neon_fminnm:
20986 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
20987 N->getOperand(1), N->getOperand(2));
20988 case Intrinsic::aarch64_neon_smull:
20989 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
20990 N->getOperand(1), N->getOperand(2));
20991 case Intrinsic::aarch64_neon_umull:
20992 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
20993 N->getOperand(1), N->getOperand(2));
20994 case Intrinsic::aarch64_neon_pmull:
20995 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
20996 N->getOperand(1), N->getOperand(2));
20997 case Intrinsic::aarch64_neon_sqdmull:
20998 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
20999 case Intrinsic::aarch64_neon_sqshl:
21000 case Intrinsic::aarch64_neon_uqshl:
21001 case Intrinsic::aarch64_neon_sqshlu:
21002 case Intrinsic::aarch64_neon_srshl:
21003 case Intrinsic::aarch64_neon_urshl:
21004 case Intrinsic::aarch64_neon_sshl:
21005 case Intrinsic::aarch64_neon_ushl:
21006 return tryCombineShiftImm(IID, N, DAG);
21007 case Intrinsic::aarch64_neon_sabd:
21008 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
21009 N->getOperand(1), N->getOperand(2));
21010 case Intrinsic::aarch64_neon_uabd:
21011 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
21012 N->getOperand(1), N->getOperand(2));
21013 case Intrinsic::aarch64_crc32b:
21014 case Intrinsic::aarch64_crc32cb:
21015 return tryCombineCRC32(0xff, N, DAG);
21016 case Intrinsic::aarch64_crc32h:
21017 case Intrinsic::aarch64_crc32ch:
21018 return tryCombineCRC32(0xffff, N, DAG);
21019 case Intrinsic::aarch64_sve_saddv:
21020 // There is no i64 version of SADDV because the sign is irrelevant.
21021 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
21022 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
21023 else
21024 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
21025 case Intrinsic::aarch64_sve_uaddv:
21026 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
21027 case Intrinsic::aarch64_sve_smaxv:
21028 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
21029 case Intrinsic::aarch64_sve_umaxv:
21030 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
21031 case Intrinsic::aarch64_sve_sminv:
21032 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
21033 case Intrinsic::aarch64_sve_uminv:
21034 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
21035 case Intrinsic::aarch64_sve_orv:
21036 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
21037 case Intrinsic::aarch64_sve_eorv:
21038 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
21039 case Intrinsic::aarch64_sve_andv:
21040 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
21041 case Intrinsic::aarch64_sve_index:
21042 return LowerSVEIntrinsicIndex(N, DAG);
21043 case Intrinsic::aarch64_sve_dup:
21044 return LowerSVEIntrinsicDUP(N, DAG);
21045 case Intrinsic::aarch64_sve_dup_x:
21046 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
21047 N->getOperand(1));
21048 case Intrinsic::aarch64_sve_ext:
21049 return LowerSVEIntrinsicEXT(N, DAG);
21050 case Intrinsic::aarch64_sve_mul_u:
21051 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
21052 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21053 case Intrinsic::aarch64_sve_smulh_u:
21054 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
21055 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21056 case Intrinsic::aarch64_sve_umulh_u:
21057 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
21058 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21059 case Intrinsic::aarch64_sve_smin_u:
21060 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
21061 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21062 case Intrinsic::aarch64_sve_umin_u:
21063 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
21064 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21065 case Intrinsic::aarch64_sve_smax_u:
21066 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
21067 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21068 case Intrinsic::aarch64_sve_umax_u:
21069 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
21070 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21071 case Intrinsic::aarch64_sve_lsl_u:
21072 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
21073 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21074 case Intrinsic::aarch64_sve_lsr_u:
21075 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
21076 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21077 case Intrinsic::aarch64_sve_asr_u:
21078 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
21079 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21080 case Intrinsic::aarch64_sve_fadd_u:
21081 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
21082 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21083 case Intrinsic::aarch64_sve_fdiv_u:
21084 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
21085 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21086 case Intrinsic::aarch64_sve_fmax_u:
21087 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
21088 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21089 case Intrinsic::aarch64_sve_fmaxnm_u:
21090 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
21091 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21092 case Intrinsic::aarch64_sve_fmla_u:
21093 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
21094 N->getOperand(1), N->getOperand(3), N->getOperand(4),
21095 N->getOperand(2));
21096 case Intrinsic::aarch64_sve_fmin_u:
21097 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
21098 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21099 case Intrinsic::aarch64_sve_fminnm_u:
21100 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
21101 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21102 case Intrinsic::aarch64_sve_fmul_u:
21103 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
21104 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21105 case Intrinsic::aarch64_sve_fsub_u:
21106 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
21107 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21108 case Intrinsic::aarch64_sve_add_u:
21109 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
21110 N->getOperand(3));
21111 case Intrinsic::aarch64_sve_sub_u:
21112 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
21113 N->getOperand(3));
21114 case Intrinsic::aarch64_sve_subr:
21115 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
21116 case Intrinsic::aarch64_sve_and_u:
21117 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
21118 N->getOperand(3));
21119 case Intrinsic::aarch64_sve_bic_u:
21120 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
21121 N->getOperand(2), N->getOperand(3));
21122 case Intrinsic::aarch64_sve_eor_u:
21123 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
21124 N->getOperand(3));
21125 case Intrinsic::aarch64_sve_orr_u:
21126 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
21127 N->getOperand(3));
21128 case Intrinsic::aarch64_sve_sabd_u:
21129 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
21130 N->getOperand(2), N->getOperand(3));
21131 case Intrinsic::aarch64_sve_uabd_u:
21132 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
21133 N->getOperand(2), N->getOperand(3));
21134 case Intrinsic::aarch64_sve_sdiv_u:
21135 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
21136 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21137 case Intrinsic::aarch64_sve_udiv_u:
21138 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
21139 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21140 case Intrinsic::aarch64_sve_sqadd:
21141 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
21142 case Intrinsic::aarch64_sve_sqsub_u:
21143 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
21144 N->getOperand(2), N->getOperand(3));
21145 case Intrinsic::aarch64_sve_uqadd:
21146 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
21147 case Intrinsic::aarch64_sve_uqsub_u:
21148 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
21149 N->getOperand(2), N->getOperand(3));
21150 case Intrinsic::aarch64_sve_sqadd_x:
21151 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
21152 N->getOperand(1), N->getOperand(2));
21153 case Intrinsic::aarch64_sve_sqsub_x:
21154 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
21155 N->getOperand(1), N->getOperand(2));
21156 case Intrinsic::aarch64_sve_uqadd_x:
21157 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
21158 N->getOperand(1), N->getOperand(2));
21159 case Intrinsic::aarch64_sve_uqsub_x:
21160 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
21161 N->getOperand(1), N->getOperand(2));
21162 case Intrinsic::aarch64_sve_asrd:
21163 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
21164 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21165 case Intrinsic::aarch64_sve_cmphs:
21166 if (!N->getOperand(2).getValueType().isFloatingPoint())
21167 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
21168 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21169 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
21170 break;
21171 case Intrinsic::aarch64_sve_cmphi:
21172 if (!N->getOperand(2).getValueType().isFloatingPoint())
21173 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
21174 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21175 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
21176 break;
21177 case Intrinsic::aarch64_sve_fcmpge:
21178 case Intrinsic::aarch64_sve_cmpge:
21179 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
21180 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21181 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
21182 break;
21183 case Intrinsic::aarch64_sve_fcmpgt:
21184 case Intrinsic::aarch64_sve_cmpgt:
21185 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
21186 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21187 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
21188 break;
21189 case Intrinsic::aarch64_sve_fcmpeq:
21190 case Intrinsic::aarch64_sve_cmpeq:
21191 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
21192 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21193 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
21194 break;
21195 case Intrinsic::aarch64_sve_fcmpne:
21196 case Intrinsic::aarch64_sve_cmpne:
21197 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
21198 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21199 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
21200 break;
21201 case Intrinsic::aarch64_sve_fcmpuo:
21202 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
21203 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21204 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
21205 break;
21206 case Intrinsic::aarch64_sve_fadda:
21207 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
21208 case Intrinsic::aarch64_sve_faddv:
21209 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
21210 case Intrinsic::aarch64_sve_fmaxnmv:
21211 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
21212 case Intrinsic::aarch64_sve_fmaxv:
21213 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
21214 case Intrinsic::aarch64_sve_fminnmv:
21215 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
21216 case Intrinsic::aarch64_sve_fminv:
21217 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
21218 case Intrinsic::aarch64_sve_sel:
21219 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
21220 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21221 case Intrinsic::aarch64_sve_cmpeq_wide:
21222 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
21223 case Intrinsic::aarch64_sve_cmpne_wide:
21224 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
21225 case Intrinsic::aarch64_sve_cmpge_wide:
21226 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
21227 case Intrinsic::aarch64_sve_cmpgt_wide:
21228 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
21229 case Intrinsic::aarch64_sve_cmplt_wide:
21230 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
21231 case Intrinsic::aarch64_sve_cmple_wide:
21232 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
21233 case Intrinsic::aarch64_sve_cmphs_wide:
21234 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
21235 case Intrinsic::aarch64_sve_cmphi_wide:
21236 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
21237 case Intrinsic::aarch64_sve_cmplo_wide:
21238 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
21239 case Intrinsic::aarch64_sve_cmpls_wide:
21240 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
21241 case Intrinsic::aarch64_sve_ptest_any:
21242 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
21243 AArch64CC::ANY_ACTIVE);
21244 case Intrinsic::aarch64_sve_ptest_first:
21245 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
21246 AArch64CC::FIRST_ACTIVE);
21247 case Intrinsic::aarch64_sve_ptest_last:
21248 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
21249 AArch64CC::LAST_ACTIVE);
21250 case Intrinsic::aarch64_sve_whilelo:
21251 return tryCombineWhileLo(N, DCI, Subtarget);
21252 }
21253 return SDValue();
21254}
21255
21256static bool isCheapToExtend(const SDValue &N) {
21257 unsigned OC = N->getOpcode();
21258 return OC == ISD::LOAD || OC == ISD::MLOAD ||
21259 ISD::isConstantSplatVectorAllZeros(N.getNode());
21260}
21261
21262static SDValue
21263performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
21264 SelectionDAG &DAG) {
21265 // If we have (sext (setcc A B)) and A and B are cheap to extend,
21266 // we can move the sext into the arguments and have the same result. For
21267 // example, if A and B are both loads, we can make those extending loads and
21268 // avoid an extra instruction. This pattern appears often in VLS code
21269 // generation where the inputs to the setcc have a different size to the
21270 // instruction that wants to use the result of the setcc.
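// E.g. for a signed compare, (sext (setcc (load i8 a), (load i8 b))) can become
// (setcc (sext (load a)), (sext (load b))), and the extends then fold into
// extending loads (types illustrative).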
21271 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
21272 N->getOperand(0)->getOpcode() == ISD::SETCC);
21273 const SDValue SetCC = N->getOperand(0);
21274
21275 const SDValue CCOp0 = SetCC.getOperand(0);
21276 const SDValue CCOp1 = SetCC.getOperand(1);
21277 if (!CCOp0->getValueType(0).isInteger() ||
21278 !CCOp1->getValueType(0).isInteger())
21279 return SDValue();
21280
21281 ISD::CondCode Code =
21282 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
21283
21284 ISD::NodeType ExtType =
21285 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
21286
21287 if (isCheapToExtend(SetCC.getOperand(0)) &&
21288 isCheapToExtend(SetCC.getOperand(1))) {
21289 const SDValue Ext1 =
21290 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
21291 const SDValue Ext2 =
21292 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
21293
21294 return DAG.getSetCC(
21295 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
21296 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
21297 }
21298
21299 return SDValue();
21300}
21301
21302static SDValue performExtendCombine(SDNode *N,
21303 TargetLowering::DAGCombinerInfo &DCI,
21304 SelectionDAG &DAG) {
21305 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
21306 // we can convert that DUP into another extract_high (of a bigger DUP), which
21307 // helps the backend to decide that an sabdl2 would be useful, saving a real
21308 // extract_high operation.
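// E.g. (zext (abdu (extract_high v), (dup s))) can then select uabdl2 once the
// dup is rewritten as an extract_high of a 128-bit dup (names illustrative).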
21309 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
21310 (N->getOperand(0).getOpcode() == ISD::ABDU ||
21311 N->getOperand(0).getOpcode() == ISD::ABDS)) {
21312 SDNode *ABDNode = N->getOperand(0).getNode();
21313 SDValue NewABD =
21314 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
21315 if (!NewABD.getNode())
21316 return SDValue();
21317
21318 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
21319 }
21320
21321 if (N->getValueType(0).isFixedLengthVector() &&
21322 N->getOpcode() == ISD::SIGN_EXTEND &&
21323 N->getOperand(0)->getOpcode() == ISD::SETCC)
21324 return performSignExtendSetCCCombine(N, DCI, DAG);
21325
21326 return SDValue();
21327}
21328
21329 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
21330 SDValue SplatVal, unsigned NumVecElts) {
21331 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
21332 Align OrigAlignment = St.getAlign();
21333 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
21334
21335 // Create scalar stores. This is at least as good as the code sequence for a
21336 // split unaligned store which is a dup.s, ext.b, and two stores.
21337 // Most of the time the three stores should be replaced by store pair
21338 // instructions (stp).
21339 SDLoc DL(&St);
21340 SDValue BasePtr = St.getBasePtr();
21341 uint64_t BaseOffset = 0;
21342
21343 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
21344 SDValue NewST1 =
21345 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
21346 OrigAlignment, St.getMemOperand()->getFlags());
21347
21348 // As this is in ISel, we will not merge this add, which may degrade results.
21349 if (BasePtr->getOpcode() == ISD::ADD &&
21350 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
21351 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
21352 BasePtr = BasePtr->getOperand(0);
21353 }
21354
21355 unsigned Offset = EltOffset;
21356 while (--NumVecElts) {
21357 Align Alignment = commonAlignment(OrigAlignment, Offset);
21358 SDValue OffsetPtr =
21359 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
21360 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
21361 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
21362 PtrInfo.getWithOffset(Offset), Alignment,
21363 St.getMemOperand()->getFlags());
21364 Offset += EltOffset;
21365 }
21366 return NewST1;
21367}
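// Illustrative example for splitStoreSplat (assumed codegen, subtarget and
// scheduling dependent): a v2i64 splat of x1 stored to [x0] becomes
//   str x1, [x0]
//   str x1, [x0, #8]
// which the load/store optimizer is expected to merge into
//   stp x1, x1, [x0]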
21368
21369// Returns an SVE type that ContentTy can be trivially sign or zero extended
21370// into.
21371static MVT getSVEContainerType(EVT ContentTy) {
21372 assert(ContentTy.isSimple() && "No SVE containers for extended types");
21373
21374 switch (ContentTy.getSimpleVT().SimpleTy) {
21375 default:
21376 llvm_unreachable("No known SVE container for this MVT type");
21377 case MVT::nxv2i8:
21378 case MVT::nxv2i16:
21379 case MVT::nxv2i32:
21380 case MVT::nxv2i64:
21381 case MVT::nxv2f32:
21382 case MVT::nxv2f64:
21383 return MVT::nxv2i64;
21384 case MVT::nxv4i8:
21385 case MVT::nxv4i16:
21386 case MVT::nxv4i32:
21387 case MVT::nxv4f32:
21388 return MVT::nxv4i32;
21389 case MVT::nxv8i8:
21390 case MVT::nxv8i16:
21391 case MVT::nxv8f16:
21392 case MVT::nxv8bf16:
21393 return MVT::nxv8i16;
21394 case MVT::nxv16i8:
21395 return MVT::nxv16i8;
21396 }
21397}
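// For example, nxv4i16 maps to the nxv4i32 container above, so an SVE load of
// nxv4i16 can be performed as an (extending) nxv4i32 load followed by a
// truncate back to nxv4i16, as done in performLD1Combine below.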
21398
21399static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
21400 SDLoc DL(N);
21401 EVT VT = N->getValueType(0);
21402
21403 if (VT.getSizeInBits().getKnownMinValue() > 128)
21404 return SDValue();
21405
21406 EVT ContainerVT = VT;
21407 if (ContainerVT.isInteger())
21408 ContainerVT = getSVEContainerType(ContainerVT);
21409
21410 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
21411 SDValue Ops[] = { N->getOperand(0), // Chain
21412 N->getOperand(2), // Pg
21413 N->getOperand(3), // Base
21414 DAG.getValueType(VT) };
21415
21416 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
21417 SDValue LoadChain = SDValue(Load.getNode(), 1);
21418
21419 if (ContainerVT.isInteger() && (VT != ContainerVT))
21420 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
21421
21422 return DAG.getMergeValues({ Load, LoadChain }, DL);
21423}
21424
21425 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
21426 SDLoc DL(N);
21427 EVT VT = N->getValueType(0);
21428 EVT PtrTy = N->getOperand(3).getValueType();
21429
21430 EVT LoadVT = VT;
21431 if (VT.isFloatingPoint())
21432 LoadVT = VT.changeTypeToInteger();
21433
21434 auto *MINode = cast<MemIntrinsicSDNode>(N);
21435 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
21436 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
21437 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
21438 MINode->getOperand(2), PassThru,
21439 MINode->getMemoryVT(), MINode->getMemOperand(),
21440 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
21441
21442 if (VT.isFloatingPoint()) {
21443 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
21444 return DAG.getMergeValues(Ops, DL);
21445 }
21446
21447 return L;
21448}
21449
21450template <unsigned Opcode>
21451 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
21452 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
21453 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
21454 "Unsupported opcode.");
21455 SDLoc DL(N);
21456 EVT VT = N->getValueType(0);
21457
21458 EVT LoadVT = VT;
21459 if (VT.isFloatingPoint())
21460 LoadVT = VT.changeTypeToInteger();
21461
21462 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
21463 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
21464 SDValue LoadChain = SDValue(Load.getNode(), 1);
21465
21466 if (VT.isFloatingPoint())
21467 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
21468
21469 return DAG.getMergeValues({Load, LoadChain}, DL);
21470}
21471
21472 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
21473 SDLoc DL(N);
21474 SDValue Data = N->getOperand(2);
21475 EVT DataVT = Data.getValueType();
21476 EVT HwSrcVt = getSVEContainerType(DataVT);
21477 SDValue InputVT = DAG.getValueType(DataVT);
21478
21479 if (DataVT.isFloatingPoint())
21480 InputVT = DAG.getValueType(HwSrcVt);
21481
21482 SDValue SrcNew;
21483 if (Data.getValueType().isFloatingPoint())
21484 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
21485 else
21486 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
21487
21488 SDValue Ops[] = { N->getOperand(0), // Chain
21489 SrcNew,
21490 N->getOperand(4), // Base
21491 N->getOperand(3), // Pg
21492 InputVT
21493 };
21494
21495 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
21496}
21497
21498 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
21499 SDLoc DL(N);
21500
21501 SDValue Data = N->getOperand(2);
21502 EVT DataVT = Data.getValueType();
21503 EVT PtrTy = N->getOperand(4).getValueType();
21504
21505 if (DataVT.isFloatingPoint())
21506 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
21507
21508 auto *MINode = cast<MemIntrinsicSDNode>(N);
21509 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
21510 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
21511 MINode->getMemoryVT(), MINode->getMemOperand(),
21512 ISD::UNINDEXED, false, false);
21513}
21514
21515/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
21516/// load store optimizer pass will merge them to store pair stores. This should
21517/// be better than a movi to create the vector zero followed by a vector store
21518 /// if the zero constant is not re-used, since one instruction and one register
21519/// live range will be removed.
21520///
21521/// For example, the final generated code should be:
21522///
21523/// stp xzr, xzr, [x0]
21524///
21525/// instead of:
21526///
21527/// movi v0.2d, #0
21528/// str q0, [x0]
21529///
21530 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21531 SDValue StVal = St.getValue();
21532 EVT VT = StVal.getValueType();
21533
21534 // Avoid scalarizing zero splat stores for scalable vectors.
21535 if (VT.isScalableVector())
21536 return SDValue();
21537
21538 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
21539 // 2, 3 or 4 i32 elements.
21540 int NumVecElts = VT.getVectorNumElements();
21541 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
21542 VT.getVectorElementType().getSizeInBits() == 64) ||
21543 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
21544 VT.getVectorElementType().getSizeInBits() == 32)))
21545 return SDValue();
21546
21547 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
21548 return SDValue();
21549
21550 // If the zero constant has more than one use then the vector store could be
21551 // better since the constant mov will be amortized and stp q instructions
21552 // should be able to be formed.
21553 if (!StVal.hasOneUse())
21554 return SDValue();
21555
21556 // If the store is truncating then it's going down to i16 or smaller, which
21557 // means it can be implemented in a single store anyway.
21558 if (St.isTruncatingStore())
21559 return SDValue();
21560
21561 // If the immediate offset of the address operand is too large for the stp
21562 // instruction, then bail out.
21563 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
21564 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
21565 if (Offset < -512 || Offset > 504)
21566 return SDValue();
21567 }
21568
21569 for (int I = 0; I < NumVecElts; ++I) {
21570 SDValue EltVal = StVal.getOperand(I);
21571 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
21572 return SDValue();
21573 }
21574
21575 // Use a CopyFromReg WZR/XZR here to prevent
21576 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
21577 SDLoc DL(&St);
21578 unsigned ZeroReg;
21579 EVT ZeroVT;
21580 if (VT.getVectorElementType().getSizeInBits() == 32) {
21581 ZeroReg = AArch64::WZR;
21582 ZeroVT = MVT::i32;
21583 } else {
21584 ZeroReg = AArch64::XZR;
21585 ZeroVT = MVT::i64;
21586 }
21587 SDValue SplatVal =
21588 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
21589 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21590}
21591
21592/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
21593/// value. The load store optimizer pass will merge them to store pair stores.
21594/// This has better performance than a splat of the scalar followed by a split
21595/// vector store. Even if the stores are not merged it is four stores vs a dup,
21596/// followed by an ext.b and two stores.
21597 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21598 SDValue StVal = St.getValue();
21599 EVT VT = StVal.getValueType();
21600
21601 // Don't replace floating point stores, they possibly won't be transformed to
21602 // stp because of the store pair suppress pass.
21603 if (VT.isFloatingPoint())
21604 return SDValue();
21605
21606 // We can express a splat as store pair(s) for 2 or 4 elements.
21607 unsigned NumVecElts = VT.getVectorNumElements();
21608 if (NumVecElts != 4 && NumVecElts != 2)
21609 return SDValue();
21610
21611 // If the store is truncating then it's going down to i16 or smaller, which
21612 // means it can be implemented in a single store anyway.
21613 if (St.isTruncatingStore())
21614 return SDValue();
21615
21616 // Check that this is a splat.
21617 // Make sure that each of the relevant vector element locations are inserted
21618 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
21619 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
21620 SDValue SplatVal;
21621 for (unsigned I = 0; I < NumVecElts; ++I) {
21622 // Check for insert vector elements.
21623 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
21624 return SDValue();
21625
21626 // Check that same value is inserted at each vector element.
21627 if (I == 0)
21628 SplatVal = StVal.getOperand(1);
21629 else if (StVal.getOperand(1) != SplatVal)
21630 return SDValue();
21631
21632 // Check insert element index.
21633 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
21634 if (!CIndex)
21635 return SDValue();
21636 uint64_t IndexVal = CIndex->getZExtValue();
21637 if (IndexVal >= NumVecElts)
21638 return SDValue();
21639 IndexNotInserted.reset(IndexVal);
21640
21641 StVal = StVal.getOperand(0);
21642 }
21643 // Check that all vector element locations were inserted to.
21644 if (IndexNotInserted.any())
21645 return SDValue();
21646
21647 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21648}
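// Illustrative example (assumed codegen): a v4i32 splat of w1 stored to [x0]
// becomes four scalar stores at offsets 0, 4, 8 and 12, which should then be
// merged into two store pairs, e.g.
//   stp w1, w1, [x0]
//   stp w1, w1, [x0, #8]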
21649
21650 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
21651 SelectionDAG &DAG,
21652 const AArch64Subtarget *Subtarget) {
21653
21654 StoreSDNode *S = cast<StoreSDNode>(N);
21655 if (S->isVolatile() || S->isIndexed())
21656 return SDValue();
21657
21658 SDValue StVal = S->getValue();
21659 EVT VT = StVal.getValueType();
21660
21661 if (!VT.isFixedLengthVector())
21662 return SDValue();
21663
21664 // If we get a splat of zeros, convert this vector store to a store of
21665 // scalars. They will be merged into store pairs of xzr thereby removing one
21666 // instruction and one register.
21667 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
21668 return ReplacedZeroSplat;
21669
21670 // FIXME: The logic for deciding if an unaligned store should be split should
21671 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
21672 // a call to that function here.
21673
21674 if (!Subtarget->isMisaligned128StoreSlow())
21675 return SDValue();
21676
21677 // Don't split at -Oz.
21678 if (DAG.getMachineFunction().getFunction().hasMinSize())
21679 return SDValue();
21680
21681 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
21682 // those up regresses performance on micro-benchmarks and olden/bh.
21683 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
21684 return SDValue();
21685
21686 // Split unaligned 16B stores. They are terrible for performance.
21687 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
21688 // extensions can use this to mark that it does not want splitting to happen
21689 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
21690 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
21691 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
21692 S->getAlign() <= Align(2))
21693 return SDValue();
21694
21695 // If we get a splat of a scalar convert this vector store to a store of
21696 // scalars. They will be merged into store pairs thereby removing two
21697 // instructions.
21698 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
21699 return ReplacedSplat;
21700
21701 SDLoc DL(S);
21702
21703 // Split VT into two.
21704 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
21705 unsigned NumElts = HalfVT.getVectorNumElements();
21706 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21707 DAG.getConstant(0, DL, MVT::i64));
21708 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21709 DAG.getConstant(NumElts, DL, MVT::i64));
21710 SDValue BasePtr = S->getBasePtr();
21711 SDValue NewST1 =
21712 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
21713 S->getAlign(), S->getMemOperand()->getFlags());
21714 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
21715 DAG.getConstant(8, DL, MVT::i64));
21716 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
21717 S->getPointerInfo(), S->getAlign(),
21718 S->getMemOperand()->getFlags());
21719}
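// Illustrative example (assumed codegen): a 16-byte store with only 8-byte
// alignment is split into two 64-bit halves via EXTRACT_SUBVECTOR, e.g.
//   str d0, [x0]
//   str d1, [x0, #8]
// instead of a single misaligned str q0 on cores where that is slow.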
21720
21721 static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
21722 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
21723
21724 // splice(pg, op1, undef) -> op1
21725 if (N->getOperand(2).isUndef())
21726 return N->getOperand(1);
21727
21728 return SDValue();
21729}
21730
21731 static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
21732 const AArch64Subtarget *Subtarget) {
21733 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
21734 N->getOpcode() == AArch64ISD::UUNPKLO) &&
21735 "Unexpected Opcode!");
21736
21737 // uunpklo/hi undef -> undef
21738 if (N->getOperand(0).isUndef())
21739 return DAG.getUNDEF(N->getValueType(0));
21740
21741 // If this is a masked load followed by an UUNPKLO, fold this into a masked
21742 // extending load. We can do this even if this is already a masked
21743 // {z,}extload.
21744 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
21745 N->getOpcode() == AArch64ISD::UUNPKLO) {
21746 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
21747 SDValue Mask = MLD->getMask();
21748 SDLoc DL(N);
21749
21750 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
21751 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
21752 (MLD->getPassThru()->isUndef() ||
21753 isZerosVector(MLD->getPassThru().getNode()))) {
21754 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
21755 unsigned PgPattern = Mask->getConstantOperandVal(0);
21756 EVT VT = N->getValueType(0);
21757
21758 // Ensure we can double the size of the predicate pattern
21759 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
21760 if (NumElts &&
21761 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
21762 Mask =
21763 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
21764 SDValue PassThru = DAG.getConstant(0, DL, VT);
21765 SDValue NewLoad = DAG.getMaskedLoad(
21766 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
21767 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
21768 MLD->getAddressingMode(), ISD::ZEXTLOAD);
21769
21770 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
21771
21772 return NewLoad;
21773 }
21774 }
21775 }
21776
21777 return SDValue();
21778}
21779
21780 static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N) {
21781 if (N->getOpcode() != AArch64ISD::UZP1)
21782 return false;
21783 SDValue Op0 = N->getOperand(0);
21784 EVT SrcVT = Op0->getValueType(0);
21785 EVT DstVT = N->getValueType(0);
21786 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
21787 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
21788 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
21789}
21790
21791// Try to combine rounding shifts where the operands come from an extend, and
21792// the result is truncated and combined into one vector.
21793// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
21794 static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
21795 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
21796 SDValue Op0 = N->getOperand(0);
21797 SDValue Op1 = N->getOperand(1);
21798 EVT ResVT = N->getValueType(0);
21799
21800 unsigned RshOpc = Op0.getOpcode();
21801 if (RshOpc != AArch64ISD::RSHRNB_I)
21802 return SDValue();
21803
21804 // Same op code and imm value?
21805 SDValue ShiftValue = Op0.getOperand(1);
21806 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
21807 return SDValue();
21808
21809 // Same unextended operand value?
21810 SDValue Lo = Op0.getOperand(0);
21811 SDValue Hi = Op1.getOperand(0);
21812 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
21813 Hi.getOpcode() != AArch64ISD::UUNPKHI)
21814 return SDValue();
21815 SDValue OrigArg = Lo.getOperand(0);
21816 if (OrigArg != Hi.getOperand(0))
21817 return SDValue();
21818
21819 SDLoc DL(N);
21820 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
21821 getPredicateForVector(DAG, DL, ResVT), OrigArg,
21822 ShiftValue);
21823}
21824
21825// Try to simplify:
21826// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
21827// t2 = nxv8i16 srl(t1, ShiftValue)
21828// to
21829// t1 = nxv8i16 rshrnb(X, shiftvalue).
21830// rshrnb will zero the top half bits of each element. Therefore, this combine
21831// should only be performed when a following instruction with the rshrnb
21832// as an operand does not care about the top half of each element. For example,
21833// a uzp1 or a truncating store.
21834 static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
21835 const AArch64Subtarget *Subtarget) {
21836 EVT VT = Srl->getValueType(0);
21837 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
21838 return SDValue();
21839
21840 EVT ResVT;
21841 if (VT == MVT::nxv8i16)
21842 ResVT = MVT::nxv16i8;
21843 else if (VT == MVT::nxv4i32)
21844 ResVT = MVT::nxv8i16;
21845 else if (VT == MVT::nxv2i64)
21846 ResVT = MVT::nxv4i32;
21847 else
21848 return SDValue();
21849
21850 SDLoc DL(Srl);
21851 unsigned ShiftValue;
21852 SDValue RShOperand;
21853 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
21854 return SDValue();
21855 SDValue Rshrnb = DAG.getNode(
21856 AArch64ISD::RSHRNB_I, DL, ResVT,
21857 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
21858 return DAG.getNode(ISD::BITCAST, DL, VT, Rshrnb);
21859}
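// The combine relies on the identity (X + (1 << (C - 1))) >> C being a
// rounding shift right by C. RSHRNB additionally narrows each element, which
// is only safe because the consumers handled here (uzp1 or a truncating
// store) ignore the top half of every element.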
21860
21861 static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
21862 const AArch64Subtarget *Subtarget) {
21863 SDLoc DL(N);
21864 SDValue Op0 = N->getOperand(0);
21865 SDValue Op1 = N->getOperand(1);
21866 EVT ResVT = N->getValueType(0);
21867
21868 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
21869 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
21870 Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
21871 Op0.getOperand(0) == Op1.getOperand(0)) {
21872
21873 SDValue SourceVec = Op0.getOperand(0);
21874 uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
21875 uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
21876 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
21877 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
21878 EVT OpVT = Op0.getOperand(1).getValueType();
21879 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
21880 SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
21881 DAG.getUNDEF(WidenedResVT));
21882 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
21883 DAG.getConstant(0, DL, OpVT));
21884 }
21885 }
21886
21887 // Following optimizations only work with uzp1.
21888 if (N->getOpcode() == AArch64ISD::UZP2)
21889 return SDValue();
21890
21891 // uzp1(x, undef) -> concat(truncate(x), undef)
21892 if (Op1.getOpcode() == ISD::UNDEF) {
21893 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
21894 switch (ResVT.getSimpleVT().SimpleTy) {
21895 default:
21896 break;
21897 case MVT::v16i8:
21898 BCVT = MVT::v8i16;
21899 HalfVT = MVT::v8i8;
21900 break;
21901 case MVT::v8i16:
21902 BCVT = MVT::v4i32;
21903 HalfVT = MVT::v4i16;
21904 break;
21905 case MVT::v4i32:
21906 BCVT = MVT::v2i64;
21907 HalfVT = MVT::v2i32;
21908 break;
21909 }
21910 if (BCVT != MVT::Other) {
21911 SDValue BC = DAG.getBitcast(BCVT, Op0);
21912 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
21913 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
21914 DAG.getUNDEF(HalfVT));
21915 }
21916 }
21917
21918 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
21919 return Urshr;
21920
21921 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op0, DAG, Subtarget))
21922 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
21923
21924 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op1, DAG, Subtarget))
21925 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
21926
21927 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
21928 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
21929 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21930 SDValue X = Op0.getOperand(0).getOperand(0);
21931 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
21932 }
21933 }
21934
21935 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
21936 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
21937 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21938 SDValue Z = Op1.getOperand(0).getOperand(1);
21939 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
21940 }
21941 }
21942
21943 // These optimizations only work on little endian.
21944 if (!DAG.getDataLayout().isLittleEndian())
21945 return SDValue();
21946
21947 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
21948 // Example:
21949 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
21950 // to
21951 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
21952 if (isHalvingTruncateAndConcatOfLegalIntScalableType(N) &&
21953 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
21954 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
21955 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
21956 Op1.getOperand(0));
21957 }
21958 }
21959
21960 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
21961 return SDValue();
21962
21963 SDValue SourceOp0 = peekThroughBitcasts(Op0);
21964 SDValue SourceOp1 = peekThroughBitcasts(Op1);
21965
21966 // truncating uzp1(x, y) -> xtn(concat (x, y))
21967 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
21968 EVT Op0Ty = SourceOp0.getValueType();
21969 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
21970 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
21971 SDValue Concat =
21972 DAG.getNode(ISD::CONCAT_VECTORS, DL,
21973 Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
21974 SourceOp0, SourceOp1);
21975 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
21976 }
21977 }
21978
21979 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
21980 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
21981 SourceOp1.getOpcode() != ISD::TRUNCATE)
21982 return SDValue();
21983 SourceOp0 = SourceOp0.getOperand(0);
21984 SourceOp1 = SourceOp1.getOperand(0);
21985
21986 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
21987 !SourceOp0.getValueType().isSimple())
21988 return SDValue();
21989
21990 EVT ResultTy;
21991
21992 switch (SourceOp0.getSimpleValueType().SimpleTy) {
21993 case MVT::v2i64:
21994 ResultTy = MVT::v4i32;
21995 break;
21996 case MVT::v4i32:
21997 ResultTy = MVT::v8i16;
21998 break;
21999 case MVT::v8i16:
22000 ResultTy = MVT::v16i8;
22001 break;
22002 default:
22003 return SDValue();
22004 }
22005
22006 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
22007 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
22008 SDValue UzpResult =
22009 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
22010
22011 EVT BitcastResultTy;
22012
22013 switch (ResVT.getSimpleVT().SimpleTy) {
22014 case MVT::v2i32:
22015 BitcastResultTy = MVT::v2i64;
22016 break;
22017 case MVT::v4i16:
22018 BitcastResultTy = MVT::v4i32;
22019 break;
22020 case MVT::v8i8:
22021 BitcastResultTy = MVT::v8i16;
22022 break;
22023 default:
22024 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
22025 }
22026
22027 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
22028 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
22029}
22030
22031 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
22032 unsigned Opc = N->getOpcode();
22033
22034 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
22035 Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
22036 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
22037 Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
22038 "Invalid opcode.");
22039
22040 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
22041 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
22042 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
22043 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
22044 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
22045 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
22046 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
22047 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
22048
22049 SDLoc DL(N);
22050 SDValue Chain = N->getOperand(0);
22051 SDValue Pg = N->getOperand(1);
22052 SDValue Base = N->getOperand(2);
22053 SDValue Offset = N->getOperand(3);
22054 SDValue Ty = N->getOperand(4);
22055
22056 EVT ResVT = N->getValueType(0);
22057
22058 const auto OffsetOpc = Offset.getOpcode();
22059 const bool OffsetIsZExt =
22060 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
22061 const bool OffsetIsSExt =
22062 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
22063
22064 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
22065 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
22066 SDValue ExtPg = Offset.getOperand(0);
22067 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
22068 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
22069
22070 // If the predicate for the sign- or zero-extended offset is the
22071 // same as the predicate used for this load and the sign-/zero-extension
22072 // was from a 32-bits...
22073 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
22074 SDValue UnextendedOffset = Offset.getOperand(1);
22075
22076 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
22077 if (Signed)
22078 NewOpc = getSignExtendedGatherOpcode(NewOpc);
22079
22080 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
22081 {Chain, Pg, Base, UnextendedOffset, Ty});
22082 }
22083 }
22084
22085 return SDValue();
22086}
22087
22088/// Optimize a vector shift instruction and its operand if shifted out
22089/// bits are not used.
22090 static SDValue performVectorShiftCombine(SDNode *N,
22091 const AArch64TargetLowering &TLI,
22092 TargetLowering::DAGCombinerInfo &DCI) {
22093 assert(N->getOpcode() == AArch64ISD::VASHR ||
22094 N->getOpcode() == AArch64ISD::VLSHR);
22095
22096 SDValue Op = N->getOperand(0);
22097 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
22098
22099 unsigned ShiftImm = N->getConstantOperandVal(1);
22100 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
22101
22102 // Remove sign_extend_inreg (ashr(shl(x)) based on the number of sign bits.
22103 if (N->getOpcode() == AArch64ISD::VASHR &&
22104 Op.getOpcode() == AArch64ISD::VSHL &&
22105 N->getOperand(1) == Op.getOperand(1))
22106 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
22107 return Op.getOperand(0);
22108
22109 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
22110 APInt DemandedMask = ~ShiftedOutBits;
22111
22112 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
22113 return SDValue(N, 0);
22114
22115 return SDValue();
22116}
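// Illustrative example: (VASHR (VSHL x, #24), #24) on v4i32 acts as a
// per-lane sign_extend_inreg from i8; if x already has more than 24 sign bits
// per lane the pair is dropped and x is used directly. Otherwise only the
// bits that survive the right shift are demanded from the operand, which may
// let SimplifyDemandedBits remove masking of the low bits.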
22117
22118 static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
22119 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
22120 // This transform works in partnership with performSetCCPunpkCombine to
22121 // remove unnecessary transfer of predicates into standard registers and back
22122 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
22123 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
22124 MVT::i1) {
22125 SDValue CC = N->getOperand(0)->getOperand(0);
22126 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
22127 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
22128 DAG.getVectorIdxConstant(0, SDLoc(N)));
22129 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
22130 }
22131
22132 return SDValue();
22133}
22134
22135/// Target-specific DAG combine function for post-increment LD1 (lane) and
22136/// post-increment LD1R.
22137 static SDValue performPostLD1Combine(SDNode *N,
22138 TargetLowering::DAGCombinerInfo &DCI,
22139 bool IsLaneOp) {
22140 if (DCI.isBeforeLegalizeOps())
22141 return SDValue();
22142
22143 SelectionDAG &DAG = DCI.DAG;
22144 EVT VT = N->getValueType(0);
22145
22146 if (!VT.is128BitVector() && !VT.is64BitVector())
22147 return SDValue();
22148
22149 unsigned LoadIdx = IsLaneOp ? 1 : 0;
22150 SDNode *LD = N->getOperand(LoadIdx).getNode();
22151 // If it is not LOAD, can not do such combine.
22152 if (LD->getOpcode() != ISD::LOAD)
22153 return SDValue();
22154
22155 // The vector lane must be a constant in the LD1LANE opcode.
22156 SDValue Lane;
22157 if (IsLaneOp) {
22158 Lane = N->getOperand(2);
22159 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
22160 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
22161 return SDValue();
22162 }
22163
22164 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
22165 EVT MemVT = LoadSDN->getMemoryVT();
22166 // Check if memory operand is the same type as the vector element.
22167 if (MemVT != VT.getVectorElementType())
22168 return SDValue();
22169
22170 // Check if there are other uses. If so, do not combine as it will introduce
22171 // an extra load.
22172 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
22173 ++UI) {
22174 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
22175 continue;
22176 if (*UI != N)
22177 return SDValue();
22178 }
22179
22180 // If there is one use and it can splat the value, prefer that operation.
22181 // TODO: This could be expanded to more operations if they reliably use the
22182 // index variants.
22183 if (N->hasOneUse()) {
22184 unsigned UseOpc = N->use_begin()->getOpcode();
22185 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
22186 return SDValue();
22187 }
22188
22189 SDValue Addr = LD->getOperand(1);
22190 SDValue Vector = N->getOperand(0);
22191 // Search for a use of the address operand that is an increment.
22192 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
22193 Addr.getNode()->use_end(); UI != UE; ++UI) {
22194 SDNode *User = *UI;
22195 if (User->getOpcode() != ISD::ADD
22196 || UI.getUse().getResNo() != Addr.getResNo())
22197 continue;
22198
22199 // If the increment is a constant, it must match the memory ref size.
22200 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
22201 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
22202 uint32_t IncVal = CInc->getZExtValue();
22203 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
22204 if (IncVal != NumBytes)
22205 continue;
22206 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
22207 }
22208
22209 // To avoid cycle construction make sure that neither the load nor the add
22210 // are predecessors to each other or the Vector.
22211 SmallPtrSet<const SDNode *, 32> Visited;
22212 SmallVector<const SDNode *, 16> Worklist;
22213 Visited.insert(Addr.getNode());
22214 Worklist.push_back(User);
22215 Worklist.push_back(LD);
22216 Worklist.push_back(Vector.getNode());
22217 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
22218 SDNode::hasPredecessorHelper(User, Visited, Worklist))
22219 continue;
22220
22221 SmallVector<SDValue, 8> Ops;
22222 Ops.push_back(LD->getOperand(0)); // Chain
22223 if (IsLaneOp) {
22224 Ops.push_back(Vector); // The vector to be inserted
22225 Ops.push_back(Lane); // The lane to be inserted in the vector
22226 }
22227 Ops.push_back(Addr);
22228 Ops.push_back(Inc);
22229
22230 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
22231 SDVTList SDTys = DAG.getVTList(Tys);
22232 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
22233 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
22234 MemVT,
22235 LoadSDN->getMemOperand());
22236
22237 // Update the uses.
22238 SDValue NewResults[] = {
22239 SDValue(LD, 0), // The result of load
22240 SDValue(UpdN.getNode(), 2) // Chain
22241 };
22242 DCI.CombineTo(LD, NewResults);
22243 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
22244 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
22245
22246 break;
22247 }
22248 return SDValue();
22249}
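// Illustrative example (assumed codegen): a scalar load that is DUPed into
// every lane, where the same address is also incremented by the element size,
// can be selected as the post-indexed replicating load
//   ld1r { v0.4s }, [x0], #4
// writing the incremented address back to x0.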
22250
22251/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
22252/// address translation.
22253 static bool performTBISimplification(SDValue Addr,
22254 TargetLowering::DAGCombinerInfo &DCI,
22255 SelectionDAG &DAG) {
22256 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
22257 KnownBits Known;
22258 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
22259 !DCI.isBeforeLegalizeOps());
22260 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22261 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
22262 DCI.CommitTargetLoweringOpt(TLO);
22263 return true;
22264 }
22265 return false;
22266}
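// Illustrative example (TBI): since address translation ignores the top byte,
// a pointer computed as (and x1, #0x00ffffffffffffff) that is only used as a
// load/store address can be simplified to x1, removing the AND.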
22267
22268 static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
22269 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
22270 "Expected STORE dag node in input!");
22271
22272 if (auto Store = dyn_cast<StoreSDNode>(N)) {
22273 if (!Store->isTruncatingStore() || Store->isIndexed())
22274 return SDValue();
22275 SDValue Ext = Store->getValue();
22276 auto ExtOpCode = Ext.getOpcode();
22277 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
22278 ExtOpCode != ISD::ANY_EXTEND)
22279 return SDValue();
22280 SDValue Orig = Ext->getOperand(0);
22281 if (Store->getMemoryVT() != Orig.getValueType())
22282 return SDValue();
22283 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
22284 Store->getBasePtr(), Store->getMemOperand());
22285 }
22286
22287 return SDValue();
22288}
22289
22290// A custom combine to lower load <3 x i8> as the more efficient sequence
22291// below:
22292// ldrb wX, [x0, #2]
22293// ldrh wY, [x0]
22294// orr wX, wY, wX, lsl #16
22295// fmov s0, wX
22296//
22297// Note that an alternative sequence with even fewer (although usually more
22298// complex/expensive) instructions would be:
22299// ld1r.4h { v0 }, [x0], #2
22300// ld1.b { v0 }[2], [x0]
22301//
22302// Generating this sequence unfortunately results in noticeably worse codegen
22303// for code that extends the loaded v3i8, due to legalization breaking vector
22304// shuffle detection in a way that is very difficult to work around.
22305// TODO: Revisit once v3i8 legalization has been improved in general.
22306 static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
22307 EVT MemVT = LD->getMemoryVT();
22308 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
22309 LD->getOriginalAlign() >= 4)
22310 return SDValue();
22311
22312 SDLoc DL(LD);
22313 MachineFunction &MF = DAG.getMachineFunction();
22314 SDValue Chain = LD->getChain();
22315 SDValue BasePtr = LD->getBasePtr();
22316 MachineMemOperand *MMO = LD->getMemOperand();
22317 assert(LD->getOffset().isUndef() && "undef offset expected");
22318
22319 // Load 2 x i8, then 1 x i8.
22320 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
22321 TypeSize Offset2 = TypeSize::getFixed(2);
22322 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
22323 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
22324 MF.getMachineMemOperand(MMO, 2, 1));
22325
22326 // Extend to i32.
22327 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
22328 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
22329
22330 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
22331 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
22332 DAG.getConstant(16, DL, MVT::i32));
22333 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
22334 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
22335
22336 // Extract v3i8 again.
22337 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
22338 DAG.getConstant(0, DL, MVT::i64));
22339 SDValue TokenFactor = DAG.getNode(
22340 ISD::TokenFactor, DL, MVT::Other,
22341 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
22342 return DAG.getMergeValues({Extract, TokenFactor}, DL);
22343}
22344
22345// Perform TBI simplification if supported by the target and try to break up
22346 // non-temporal loads larger than 256 bits for odd types so that LDNPQ 256-bit
22347// load instructions can be selected.
22348 static SDValue performLOADCombine(SDNode *N,
22349 TargetLowering::DAGCombinerInfo &DCI,
22350 SelectionDAG &DAG,
22351 const AArch64Subtarget *Subtarget) {
22352 if (Subtarget->supportsAddressTopByteIgnored())
22353 performTBISimplification(N->getOperand(1), DCI, DAG);
22354
22355 LoadSDNode *LD = cast<LoadSDNode>(N);
22356 if (LD->isVolatile() || !Subtarget->isLittleEndian())
22357 return SDValue(N, 0);
22358
22359 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
22360 return Res;
22361
22362 if (!LD->isNonTemporal())
22363 return SDValue(N, 0);
22364
22365 EVT MemVT = LD->getMemoryVT();
22366 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
22367 MemVT.getSizeInBits() % 256 == 0 ||
22368 256 % MemVT.getScalarSizeInBits() != 0)
22369 return SDValue(N, 0);
22370
22371 SDLoc DL(LD);
22372 SDValue Chain = LD->getChain();
22373 SDValue BasePtr = LD->getBasePtr();
22374 SDNodeFlags Flags = LD->getFlags();
22375 SmallVector<SDValue, 4> LoadOps;
22376 SmallVector<SDValue, 4> LoadOpsChain;
22377 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
22378 // and a final scalar/vector load of less than 256 bits. This way we can utilize
22379 // 256-bit loads and reduce the number of load instructions generated.
22380 MVT NewVT =
22381 MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
22382 256 / MemVT.getVectorElementType().getSizeInBits());
22383 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
22384 // Create all 256-bit loads starting from offset 0 and up to Num256Loads-1*32.
22385 for (unsigned I = 0; I < Num256Loads; I++) {
22386 unsigned PtrOffset = I * 32;
22387 SDValue NewPtr = DAG.getMemBasePlusOffset(
22388 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
22389 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
22390 SDValue NewLoad = DAG.getLoad(
22391 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
22392 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
22393 LoadOps.push_back(NewLoad);
22394 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
22395 }
22396
22397 // Process remaining bits of the load operation.
22398 // This is done by creating an UNDEF vector to match the size of the
22399 // 256-bit loads and inserting the remaining load to it. We extract the
22400 // original load type at the end using EXTRACT_SUBVECTOR instruction.
22401 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
22402 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
22403 MVT RemainingVT = MVT::getVectorVT(
22404 MemVT.getVectorElementType().getSimpleVT(),
22405 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
22406 SDValue NewPtr = DAG.getMemBasePlusOffset(
22407 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
22408 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
22409 SDValue RemainingLoad =
22410 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
22411 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
22412 LD->getMemOperand()->getFlags(), LD->getAAInfo());
22413 SDValue UndefVector = DAG.getUNDEF(NewVT);
22414 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
22415 SDValue ExtendedReminingLoad =
22416 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
22417 {UndefVector, RemainingLoad, InsertIdx});
22418 LoadOps.push_back(ExtendedReminingLoad);
22419 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
22420 EVT ConcatVT =
22421 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
22422 LoadOps.size() * NewVT.getVectorNumElements());
22423 SDValue ConcatVectors =
22424 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
22425 // Extract the original vector type size.
22426 SDValue ExtractSubVector =
22427 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
22428 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
22429 SDValue TokenFactor =
22430 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
22431 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
22432}
22433
22434 static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
22435 EVT VecVT = Op.getValueType();
22436 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
22437 "Need boolean vector type.");
22438
22439 if (Depth > 3)
22440 return EVT();
22441
22442 // We can get the base type from a vector compare or truncate.
22443 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
22444 return Op.getOperand(0).getValueType();
22445
22446 // If an operand is a bool vector, continue looking.
22447 EVT BaseVT;
22448 for (SDValue Operand : Op->op_values()) {
22449 if (Operand.getValueType() != VecVT)
22450 continue;
22451
22452 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
22453 if (!BaseVT.isSimple())
22454 BaseVT = OperandVT;
22455 else if (OperandVT != BaseVT)
22456 return EVT();
22457 }
22458
22459 return BaseVT;
22460}
22461
22462// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
22463// iN, we can use a trick that extracts the i^th bit from the i^th element and
22464// then performs a vector add to get a scalar bitmask. This requires that each
22465// element's bits are either all 1 or all 0.
22466 static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
22467 SDLoc DL(N);
22468 SDValue ComparisonResult(N, 0);
22469 EVT VecVT = ComparisonResult.getValueType();
22470 assert(VecVT.isVector() && "Must be a vector type");
22471
22472 unsigned NumElts = VecVT.getVectorNumElements();
22473 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
22474 return SDValue();
22475
22476 if (VecVT.getVectorElementType() != MVT::i1 &&
22477 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
22478 return SDValue();
22479
22480 // If we can find the original types to work on instead of a vector of i1,
22481 // we can avoid extend/extract conversion instructions.
22482 if (VecVT.getVectorElementType() == MVT::i1) {
22483 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
22484 if (!VecVT.isSimple()) {
22485 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
22486 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
22487 }
22488 }
22489 VecVT = VecVT.changeVectorElementTypeToInteger();
22490
22491 // Large vectors don't map directly to this conversion, so to avoid too many
22492 // edge cases, we don't apply it here. The conversion will likely still be
22493 // applied later via multiple smaller vectors, whose results are concatenated.
22494 if (VecVT.getSizeInBits() > 128)
22495 return SDValue();
22496
22497 // Ensure that all elements' bits are either 0s or 1s.
22498 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
22499
22500 SmallVector<SDValue, 16> MaskConstants;
22501 if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
22502 VecVT == MVT::v16i8) {
22503 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
22504 // per entry. We split it into two halves, apply the mask, zip the halves to
22505 // create 8x 16-bit values, and then perform the vector reduce.
22506 for (unsigned Half = 0; Half < 2; ++Half) {
22507 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
22508 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
22509 }
22510 }
22511 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22512 SDValue RepresentativeBits =
22513 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22514
22515 SDValue UpperRepresentativeBits =
22516 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
22517 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
22518 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
22519 RepresentativeBits, UpperRepresentativeBits);
22520 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
22521 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
22522 }
22523
22524 // All other vector sizes.
22525 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
22526 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
22527 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
22528 }
22529
22530 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22531 SDValue RepresentativeBits =
22532 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22533 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
22534 NumElts, VecVT.getVectorElementType().getSizeInBits()));
22535 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
22536}
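// Illustrative example: a v4i32 comparison result is sign-extended so each
// lane is all-ones or all-zero, ANDed with the mask {1, 2, 4, 8}, and reduced
// with a vector add (addv), yielding a 4-bit scalar bitmask with one bit per
// lane.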
22537
22538 static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
22539 StoreSDNode *Store) {
22540 if (!Store->isTruncatingStore())
22541 return SDValue();
22542
22543 SDLoc DL(Store);
22544 SDValue VecOp = Store->getValue();
22545 EVT VT = VecOp.getValueType();
22546 EVT MemVT = Store->getMemoryVT();
22547
22548 if (!MemVT.isVector() || !VT.isVector() ||
22549 MemVT.getVectorElementType() != MVT::i1)
22550 return SDValue();
22551
22552 // If we are storing a vector that we are currently building, let
22553 // `scalarizeVectorStore()` handle this more efficiently.
22554 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
22555 return SDValue();
22556
22557 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
22558 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
22559 if (!VectorBits)
22560 return SDValue();
22561
22562 EVT StoreVT =
22563 EVT::getIntegerVT(*DAG.getContext(), MemVT.getStoreSizeInBits());
22564 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
22565 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
22566 Store->getMemOperand());
22567}
22568
22569 static bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
22570 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
22571 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
22572 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
22573}
22574
22575// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
22576 static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
22577 const AArch64Subtarget *Subtarget) {
22578 SDValue Value = ST->getValue();
22579 EVT ValueVT = Value.getValueType();
22580
22581 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
22582 Value.getOpcode() != ISD::TRUNCATE ||
22583 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
22584 return SDValue();
22585
22586 assert(ST->getOffset().isUndef() && "undef offset expected");
22587 SDLoc DL(ST);
22588 auto WideVT = EVT::getVectorVT(
22589 *DAG.getContext(),
22590 Value->getOperand(0).getValueType().getVectorElementType(), 4);
22591 SDValue UndefVector = DAG.getUNDEF(WideVT);
22592 SDValue WideTrunc = DAG.getNode(
22593 ISD::INSERT_SUBVECTOR, DL, WideVT,
22594 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
22595 SDValue Cast = DAG.getNode(
22596 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
22597 WideTrunc);
22598
22599 MachineFunction &MF = DAG.getMachineFunction();
22600 SDValue Chain = ST->getChain();
22601 MachineMemOperand *MMO = ST->getMemOperand();
22602 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
22603 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22604 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
22605 TypeSize Offset2 = TypeSize::getFixed(2);
22606 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
22607 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
22608
22609 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22610 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
22611 TypeSize Offset1 = TypeSize::getFixed(1);
22612 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
22613 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
22614
22615 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22616 DAG.getConstant(0, DL, MVT::i64));
22617 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
22618 MF.getMachineMemOperand(MMO, 0, 1));
22619 return Chain;
22620}
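// Illustrative example (assumed codegen): the <3 x i8> truncating store above
// becomes three single-byte strb stores at offsets 2, 1 and 0 of the base
// pointer, avoiding an illegal v3i8 vector store.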
22621
22622 static SDValue performSTORECombine(SDNode *N,
22623 TargetLowering::DAGCombinerInfo &DCI,
22624 SelectionDAG &DAG,
22625 const AArch64Subtarget *Subtarget) {
22626 StoreSDNode *ST = cast<StoreSDNode>(N);
22627 SDValue Chain = ST->getChain();
22628 SDValue Value = ST->getValue();
22629 SDValue Ptr = ST->getBasePtr();
22630 EVT ValueVT = Value.getValueType();
22631
22632 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
22633 EVT EltVT = VT.getVectorElementType();
22634 return EltVT == MVT::f32 || EltVT == MVT::f64;
22635 };
22636
22637 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
22638 return Res;
22639
22640 // If this is an FP_ROUND followed by a store, fold this into a truncating
22641 // store. We can do this even if this is already a truncstore.
22642 // We purposefully don't care about legality of the nodes here as we know
22643 // they can be split down into something legal.
22644 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
22645 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
22646 Subtarget->useSVEForFixedLengthVectors() &&
22647 ValueVT.isFixedLengthVector() &&
22648 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
22649 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
22650 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
22651 ST->getMemoryVT(), ST->getMemOperand());
22652
22653 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
22654 return Split;
22655
22656 if (Subtarget->supportsAddressTopByteIgnored() &&
22657 performTBISimplification(N->getOperand(2), DCI, DAG))
22658 return SDValue(N, 0);
22659
22660 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
22661 return Store;
22662
22663 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
22664 return Store;
22665
22666 if (ST->isTruncatingStore()) {
22667 EVT StoreVT = ST->getMemoryVT();
22668 if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
22669 return SDValue();
22670 if (SDValue Rshrnb =
22671 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
22672 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
22673 StoreVT, ST->getMemOperand());
22674 }
22675 }
22676
22677 return SDValue();
22678}
22679
22680 static SDValue performMSTORECombine(SDNode *N,
22681 TargetLowering::DAGCombinerInfo &DCI,
22682 SelectionDAG &DAG,
22683 const AArch64Subtarget *Subtarget) {
22684 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
22685 SDValue Value = MST->getValue();
22686 SDValue Mask = MST->getMask();
22687 SDLoc DL(N);
22688
22689 // If this is a UZP1 followed by a masked store, fold this into a masked
22690 // truncating store. We can do this even if this is already a masked
22691 // truncstore.
22692 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
22693 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22694 Value.getValueType().isInteger()) {
22695 Value = Value.getOperand(0);
22696 if (Value.getOpcode() == ISD::BITCAST) {
22697 EVT HalfVT =
22698 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
22699 EVT InVT = Value.getOperand(0).getValueType();
22700
22701 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
22702 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22703 unsigned PgPattern = Mask->getConstantOperandVal(0);
22704
22705 // Ensure we can double the size of the predicate pattern
22706 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
22707 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
22708 MinSVESize) {
22709 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
22710 PgPattern);
22711 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
22712 MST->getBasePtr(), MST->getOffset(), Mask,
22713 MST->getMemoryVT(), MST->getMemOperand(),
22714 MST->getAddressingMode(),
22715 /*IsTruncating=*/true);
22716 }
22717 }
22718 }
22719 }
22720
22721 if (MST->isTruncatingStore()) {
22722 EVT ValueVT = Value->getValueType(0);
22723 EVT MemVT = MST->getMemoryVT();
22724 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
22725 return SDValue();
22726 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
22727 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
22728 MST->getOffset(), MST->getMask(),
22729 MST->getMemoryVT(), MST->getMemOperand(),
22730 MST->getAddressingMode(), true);
22731 }
22732 }
22733
22734 return SDValue();
22735}
22736
22737/// \return true if part of the index was folded into the Base.
22738static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
22739 SDLoc DL, SelectionDAG &DAG) {
22740 // This function assumes a vector of i64 indices.
22741 EVT IndexVT = Index.getValueType();
22742 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
22743 return false;
22744
22745 // Simplify:
22746 // BasePtr = Ptr
22747 // Index = X + splat(Offset)
22748 // ->
22749 // BasePtr = Ptr + Offset * scale.
22750 // Index = X
22751 if (Index.getOpcode() == ISD::ADD) {
22752 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
22753 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22754 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22755 Index = Index.getOperand(0);
22756 return true;
22757 }
22758 }
22759
22760 // Simplify:
22761 // BasePtr = Ptr
22762 // Index = (X + splat(Offset)) << splat(Shift)
22763 // ->
22764 // BasePtr = Ptr + (Offset << Shift) * scale
22765 // Index = X << splat(shift)
22766 if (Index.getOpcode() == ISD::SHL &&
22767 Index.getOperand(0).getOpcode() == ISD::ADD) {
22768 SDValue Add = Index.getOperand(0);
22769 SDValue ShiftOp = Index.getOperand(1);
22770 SDValue OffsetOp = Add.getOperand(1);
22771 if (auto Shift = DAG.getSplatValue(ShiftOp))
22772 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
22773 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
22774 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22775 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22776 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
22777 Add.getOperand(0), ShiftOp);
22778 return true;
22779 }
22780 }
22781
22782 return false;
22783}
22784
22785// Analyse the specified address returning true if a more optimal addressing
22786// mode is available. When returning true all parameters are updated to reflect
22787// their recommended values.
22788 static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
22789 SDValue &BasePtr, SDValue &Index,
22790 SelectionDAG &DAG) {
22791 // Try to iteratively fold parts of the index into the base pointer to
22792 // simplify the index as much as possible.
22793 bool Changed = false;
22794 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
22795 Changed = true;
22796
22797 // Only consider element types that are pointer sized as smaller types can
22798 // be easily promoted.
22799 EVT IndexVT = Index.getValueType();
22800 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
22801 return Changed;
22802
22803 // Can indices be trivially shrunk?
22804 EVT DataVT = N->getOperand(1).getValueType();
22805 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it
22806 // will later be re-extended to 64 bits in legalization
22807 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
22808 return Changed;
22809 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
22810 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22811 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
22812 return true;
22813 }
22814
22815 // Match:
22816 // Index = step(const)
22817 int64_t Stride = 0;
22818 if (Index.getOpcode() == ISD::STEP_VECTOR) {
22819 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
22820 }
22821 // Match:
22822 // Index = step(const) << shift(const)
22823 else if (Index.getOpcode() == ISD::SHL &&
22824 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
22825 SDValue RHS = Index.getOperand(1);
22826 if (auto *Shift =
22827 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
22828 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
22829 Stride = Step << Shift->getZExtValue();
22830 }
22831 }
22832
22833 // Return early because no supported pattern is found.
22834 if (Stride == 0)
22835 return Changed;
22836
22837 if (Stride < std::numeric_limits<int32_t>::min() ||
22838 Stride > std::numeric_limits<int32_t>::max())
22839 return Changed;
22840
22841 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
22842 unsigned MaxVScale =
22843 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
22844 int64_t LastElementOffset =
22845 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
22846
22847 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
22848 LastElementOffset > std::numeric_limits<int32_t>::max())
22849 return Changed;
22850
22851 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22852 // Stride does not scale explicitly by 'Scale', because it happens in
22853 // the gather/scatter addressing mode.
22854 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride));
22855 return true;
22856}
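// Illustrative example: a gather indexed by a 64-bit step_vector(8) can
// instead use a 32-bit step_vector(8) with the same base pointer, provided
// the worst-case offset of the last element still fits in an i32; the
// narrower index is friendlier to later legalisation.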
22857
22858 static SDValue performMaskedGatherScatterCombine(
22859 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
22860 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
22861 assert(MGS && "Can only combine gather load or scatter store nodes");
22862
22863 if (!DCI.isBeforeLegalize())
22864 return SDValue();
22865
22866 SDLoc DL(MGS);
22867 SDValue Chain = MGS->getChain();
22868 SDValue Scale = MGS->getScale();
22869 SDValue Index = MGS->getIndex();
22870 SDValue Mask = MGS->getMask();
22871 SDValue BasePtr = MGS->getBasePtr();
22872 ISD::MemIndexType IndexType = MGS->getIndexType();
22873
22874 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
22875 return SDValue();
22876
22877 // Here we catch such cases early and change MGATHER's IndexType to allow
22878 // the use of an Index that's more legalisation friendly.
22879 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
22880 SDValue PassThru = MGT->getPassThru();
22881 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
22882 return DAG.getMaskedGather(
22883 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
22884 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
22885 }
22886 auto *MSC = cast<MaskedScatterSDNode>(MGS);
22887 SDValue Data = MSC->getValue();
22888 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
22889 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
22890 Ops, MSC->getMemOperand(), IndexType,
22891 MSC->isTruncatingStore());
22892}
22893
22894/// Target-specific DAG combine function for NEON load/store intrinsics
22895/// to merge base address updates.
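/// For example (illustrative only), a structure load followed by an increment
/// of its address operand, roughly
///   %v = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %p)
///   %p.next = getelementptr i8, ptr %p, i64 32
/// can be merged into a single post-indexed LD2post node, i.e. something like
///   ld2 { v0.4s, v1.4s }, [x0], #32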
22896static SDValue performNEONPostLDSTCombine(SDNode *N,
22897 TargetLowering::DAGCombinerInfo &DCI,
22898 SelectionDAG &DAG) {
22899 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
22900 return SDValue();
22901
22902 unsigned AddrOpIdx = N->getNumOperands() - 1;
22903 SDValue Addr = N->getOperand(AddrOpIdx);
22904
22905 // Search for a use of the address operand that is an increment.
22906 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
22907 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
22908 SDNode *User = *UI;
22909 if (User->getOpcode() != ISD::ADD ||
22910 UI.getUse().getResNo() != Addr.getResNo())
22911 continue;
22912
22913 // Check that the add is independent of the load/store. Otherwise, folding
22914 // it would create a cycle.
22915 SmallPtrSet<const SDNode *, 32> Visited;
22916 SmallVector<const SDNode *, 16> Worklist;
22917 Visited.insert(Addr.getNode());
22918 Worklist.push_back(N);
22919 Worklist.push_back(User);
22920 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
22921 SDNode::hasPredecessorHelper(User, Visited, Worklist))
22922 continue;
22923
22924 // Find the new opcode for the updating load/store.
22925 bool IsStore = false;
22926 bool IsLaneOp = false;
22927 bool IsDupOp = false;
22928 unsigned NewOpc = 0;
22929 unsigned NumVecs = 0;
22930 unsigned IntNo = N->getConstantOperandVal(1);
22931 switch (IntNo) {
22932 default: llvm_unreachable("unexpected intrinsic for Neon base update");
22933 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
22934 NumVecs = 2; break;
22935 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
22936 NumVecs = 3; break;
22937 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
22938 NumVecs = 4; break;
22939 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
22940 NumVecs = 2; IsStore = true; break;
22941 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
22942 NumVecs = 3; IsStore = true; break;
22943 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
22944 NumVecs = 4; IsStore = true; break;
22945 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
22946 NumVecs = 2; break;
22947 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
22948 NumVecs = 3; break;
22949 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
22950 NumVecs = 4; break;
22951 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
22952 NumVecs = 2; IsStore = true; break;
22953 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
22954 NumVecs = 3; IsStore = true; break;
22955 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
22956 NumVecs = 4; IsStore = true; break;
22957 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
22958 NumVecs = 2; IsDupOp = true; break;
22959 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
22960 NumVecs = 3; IsDupOp = true; break;
22961 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
22962 NumVecs = 4; IsDupOp = true; break;
22963 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
22964 NumVecs = 2; IsLaneOp = true; break;
22965 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
22966 NumVecs = 3; IsLaneOp = true; break;
22967 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
22968 NumVecs = 4; IsLaneOp = true; break;
22969 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
22970 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
22971 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
22972 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
22973 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
22974 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
22975 }
22976
22977 EVT VecTy;
22978 if (IsStore)
22979 VecTy = N->getOperand(2).getValueType();
22980 else
22981 VecTy = N->getValueType(0);
22982
22983 // If the increment is a constant, it must match the memory ref size.
22984 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
22985 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
22986 uint32_t IncVal = CInc->getZExtValue();
22987 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
22988 if (IsLaneOp || IsDupOp)
22989 NumBytes /= VecTy.getVectorNumElements();
22990 if (IncVal != NumBytes)
22991 continue;
22992 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
22993 }
22994 SmallVector<SDValue, 8> Ops;
22995 Ops.push_back(N->getOperand(0)); // Incoming chain
22996 // Load lane and store have vector list as input.
22997 if (IsLaneOp || IsStore)
22998 for (unsigned i = 2; i < AddrOpIdx; ++i)
22999 Ops.push_back(N->getOperand(i));
23000 Ops.push_back(Addr); // Base register
23001 Ops.push_back(Inc);
23002
23003 // Return Types.
23004 EVT Tys[6];
23005 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
23006 unsigned n;
23007 for (n = 0; n < NumResultVecs; ++n)
23008 Tys[n] = VecTy;
23009 Tys[n++] = MVT::i64; // Type of write back register
23010 Tys[n] = MVT::Other; // Type of the chain
23011 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
23012
23013 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
23014 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
23015 MemInt->getMemoryVT(),
23016 MemInt->getMemOperand());
23017
23018 // Update the uses.
23019 std::vector<SDValue> NewResults;
23020 for (unsigned i = 0; i < NumResultVecs; ++i) {
23021 NewResults.push_back(SDValue(UpdN.getNode(), i));
23022 }
23023 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
23024 DCI.CombineTo(N, NewResults);
23025 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
23026
23027 break;
23028 }
23029 return SDValue();
23030}
23031
23032// Checks to see if the value is the prescribed width and returns information
23033// about its extension mode.
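// For example (illustrative): a zero-extending i8 load reports width 8 with
// ExtType == ISD::ZEXTLOAD, while a constant passes the check when its
// magnitude fits in a signed value of the requested width.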
23034static
23035bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
23036 ExtType = ISD::NON_EXTLOAD;
23037 switch(V.getNode()->getOpcode()) {
23038 default:
23039 return false;
23040 case ISD::LOAD: {
23041 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
23042 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
23043 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
23044 ExtType = LoadNode->getExtensionType();
23045 return true;
23046 }
23047 return false;
23048 }
23049 case ISD::AssertSext: {
23050 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
23051 if ((TypeNode->getVT() == MVT::i8 && width == 8)
23052 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
23053 ExtType = ISD::SEXTLOAD;
23054 return true;
23055 }
23056 return false;
23057 }
23058 case ISD::AssertZext: {
23059 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
23060 if ((TypeNode->getVT() == MVT::i8 && width == 8)
23061 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
23062 ExtType = ISD::ZEXTLOAD;
23063 return true;
23064 }
23065 return false;
23066 }
23067 case ISD::Constant:
23068 case ISD::TargetConstant: {
23069 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
23070 1LL << (width - 1);
23071 }
23072 }
23073
23074 return true;
23075}
23076
23077// This function does a whole lot of voodoo to determine if the tests are
23078// equivalent without and with a mask. Essentially what happens is that given a
23079// DAG resembling:
23080//
23081// +-------------+ +-------------+ +-------------+ +-------------+
23082// | Input | | AddConstant | | CompConstant| | CC |
23083// +-------------+ +-------------+ +-------------+ +-------------+
23084// | | | |
23085// V V | +----------+
23086// +-------------+ +----+ | |
23087// | ADD | |0xff| | |
23088// +-------------+ +----+ | |
23089// | | | |
23090// V V | |
23091// +-------------+ | |
23092// | AND | | |
23093// +-------------+ | |
23094// | | |
23095// +-----+ | |
23096// | | |
23097// V V V
23098// +-------------+
23099// | CMP |
23100// +-------------+
23101//
23102// The AND node may be safely removed for some combinations of inputs. In
23103// particular we need to take into account the extension type of the Input,
23104// the exact values of AddConstant, CompConstant, and CC, along with the nominal
23105// width of the input (this can work for any width inputs, the above graph is
23106// specific to 8 bits).
23107//
23108// The specific equations were worked out by generating output tables for each
23109// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
23110// problem was simplified by working with 4 bit inputs, which means we only
23111// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
23112// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
23113// patterns present in both extensions (0,7). For every distinct set of
23114// AddConstant and CompConstants bit patterns we can consider the masked and
23115// unmasked versions to be equivalent if the result of this function is true for
23116// all 16 distinct bit patterns for the current extension type of Input (w0).
23117//
23118// sub w8, w0, w1
23119// and w10, w8, #0x0f
23120// cmp w8, w2
23121// cset w9, AArch64CC
23122// cmp w10, w2
23123// cset w11, AArch64CC
23124// cmp w9, w11
23125// cset w0, eq
23126// ret
23127//
23128// Since the above function shows when the outputs are equivalent it defines
23129// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
23130// would be expensive to run during compiles. The equations below were written
23131// in a test harness that confirmed they gave outputs equivalent to the above
23132// function for all inputs, so they can be used to determine whether the
23133// removal is legal instead.
23134//
23135// isEquivalentMaskless() is the code for testing whether the AND can be removed;
23136// it is factored out of the DAG recognition because the DAG can take several forms.
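// As a concrete illustration (not part of the original analysis): with an
// 8-bit zero-extended input, AddConstant == 0 and a non-negative CompConstant,
// the masked and unmasked tests are trivially identical for EQ/NE, because
// (w0 - 0) & 0xff == w0 for every 8-bit w0; the equations below return true
// for exactly such safe combinations.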
23137
23138static bool isEquivalentMaskless(unsigned CC, unsigned width,
23139 ISD::LoadExtType ExtType, int AddConstant,
23140 int CompConstant) {
23141 // By being careful about our equations and only writing them in terms of
23142 // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
23143 // make them generally applicable to all bit widths.
23144 int MaxUInt = (1 << width);
23145
23146 // For the purposes of these comparisons sign extending the type is
23147 // equivalent to zero extending the add and displacing it by half the integer
23148 // width. Provided we are careful and make sure our equations are valid over
23149 // the whole range we can just adjust the input and avoid writing equations
23150 // for sign extended inputs.
23151 if (ExtType == ISD::SEXTLOAD)
23152 AddConstant -= (1 << (width-1));
23153
23154 switch(CC) {
23155 case AArch64CC::LE:
23156 case AArch64CC::GT:
23157 if ((AddConstant == 0) ||
23158 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
23159 (AddConstant >= 0 && CompConstant < 0) ||
23160 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
23161 return true;
23162 break;
23163 case AArch64CC::LT:
23164 case AArch64CC::GE:
23165 if ((AddConstant == 0) ||
23166 (AddConstant >= 0 && CompConstant <= 0) ||
23167 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
23168 return true;
23169 break;
23170 case AArch64CC::HI:
23171 case AArch64CC::LS:
23172 if ((AddConstant >= 0 && CompConstant < 0) ||
23173 (AddConstant <= 0 && CompConstant >= -1 &&
23174 CompConstant < AddConstant + MaxUInt))
23175 return true;
23176 break;
23177 case AArch64CC::PL:
23178 case AArch64CC::MI:
23179 if ((AddConstant == 0) ||
23180 (AddConstant > 0 && CompConstant <= 0) ||
23181 (AddConstant < 0 && CompConstant <= AddConstant))
23182 return true;
23183 break;
23184 case AArch64CC::LO:
23185 case AArch64CC::HS:
23186 if ((AddConstant >= 0 && CompConstant <= 0) ||
23187 (AddConstant <= 0 && CompConstant >= 0 &&
23188 CompConstant <= AddConstant + MaxUInt))
23189 return true;
23190 break;
23191 case AArch64CC::EQ:
23192 case AArch64CC::NE:
23193 if ((AddConstant > 0 && CompConstant < 0) ||
23194 (AddConstant < 0 && CompConstant >= 0 &&
23195 CompConstant < AddConstant + MaxUInt) ||
23196 (AddConstant >= 0 && CompConstant >= 0 &&
23197 CompConstant >= AddConstant) ||
23198 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
23199 return true;
23200 break;
23201 case AArch64CC::VS:
23202 case AArch64CC::VC:
23203 case AArch64CC::AL:
23204 case AArch64CC::NV:
23205 return true;
23206 case AArch64CC::Invalid:
23207 break;
23208 }
23209
23210 return false;
23211}
23212
23213// (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
23214// (X & C) <u Pow2 --> ((X & (C & ~(Pow2-1))) == 0)
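// For example (illustrative): (x & 0xff) >u 0x0f becomes (x & 0xf0) != 0, and
// (x & 0xff) <u 8 becomes (x & 0xf8) == 0, so the SUBS can be replaced by an
// ANDS against the reduced mask.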
23215static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
23216 SDNode *AndNode, SelectionDAG &DAG,
23217 unsigned CCIndex, unsigned CmpIndex,
23218 unsigned CC) {
23219 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
23220 if (!SubsC)
23221 return SDValue();
23222
23223 APInt SubsAP = SubsC->getAPIntValue();
23224 if (CC == AArch64CC::HI) {
23225 if (!SubsAP.isMask())
23226 return SDValue();
23227 } else if (CC == AArch64CC::LO) {
23228 if (!SubsAP.isPowerOf2())
23229 return SDValue();
23230 } else
23231 return SDValue();
23232
23233 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
23234 if (!AndC)
23235 return SDValue();
23236
23237 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
23238
23239 SDLoc DL(N);
23240 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
23241 SDValue ANDS = DAG.getNode(
23242 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
23243 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
23244 SDValue AArch64_CC =
23245 DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
23246 N->getOperand(CCIndex)->getValueType(0));
23247
23248 // For now, only performCSELCombine and performBRCONDCombine call this
23249 // function, and both of them pass 2 for CCIndex and 3 for CmpIndex with 4
23250 // operands. So just initialize the operands directly to simplify the code. If
23251 // some other caller uses a different CCIndex or CmpIndex, this will need to be
23252 // rewritten with a loop.
23253 // TODO: Do we need to assert number of operand is 4 here?
23254 assert((CCIndex == 2 && CmpIndex == 3) &&
23255 "Expected CCIndex to be 2 and CmpIndex to be 3.");
23256 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
23257 ANDS.getValue(1)};
23258 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
23259}
23260
23261static
23262SDValue performCONDCombine(SDNode *N,
23263 TargetLowering::DAGCombinerInfo &DCI,
23264 SelectionDAG &DAG, unsigned CCIndex,
23265 unsigned CmpIndex) {
23266 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
23267 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
23268 unsigned CondOpcode = SubsNode->getOpcode();
23269
23270 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
23271 !SubsNode->hasOneUse())
23272 return SDValue();
23273
23274 // There is a SUBS feeding this condition. Is it fed by a mask we can
23275 // use?
23276
23277 SDNode *AndNode = SubsNode->getOperand(0).getNode();
23278 unsigned MaskBits = 0;
23279
23280 if (AndNode->getOpcode() != ISD::AND)
23281 return SDValue();
23282
23283 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
23284 CmpIndex, CC))
23285 return Val;
23286
23287 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
23288 uint32_t CNV = CN->getZExtValue();
23289 if (CNV == 255)
23290 MaskBits = 8;
23291 else if (CNV == 65535)
23292 MaskBits = 16;
23293 }
23294
23295 if (!MaskBits)
23296 return SDValue();
23297
23298 SDValue AddValue = AndNode->getOperand(0);
23299
23300 if (AddValue.getOpcode() != ISD::ADD)
23301 return SDValue();
23302
23303 // The basic dag structure is correct, grab the inputs and validate them.
23304
23305 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
23306 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
23307 SDValue SubsInputValue = SubsNode->getOperand(1);
23308
23309 // The mask is present and the provenance of all the values is a smaller type,
23310 // let's see if the mask is superfluous.
23311
23312 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
23313 !isa<ConstantSDNode>(SubsInputValue.getNode()))
23314 return SDValue();
23315
23316 ISD::LoadExtType ExtType;
23317
23318 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
23319 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
23320 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
23321 return SDValue();
23322
23323 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
23324 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
23325 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
23326 return SDValue();
23327
23328 // The AND is not necessary, remove it.
23329
23330 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
23331 SubsNode->getValueType(1));
23332 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
23333
23334 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
23335 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
23336
23337 return SDValue(N, 0);
23338}
23339
23340// Optimize compare with zero and branch.
23341static SDValue performBRCONDCombine(SDNode *N,
23342 TargetLowering::DAGCombinerInfo &DCI,
23343 SelectionDAG &DAG) {
23344 MachineFunction &MF = DAG.getMachineFunction();
23345 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
23346 // will not be produced, as they are conditional branch instructions that do
23347 // not set flags.
23348 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
23349 return SDValue();
23350
23351 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
23352 N = NV.getNode();
23353 SDValue Chain = N->getOperand(0);
23354 SDValue Dest = N->getOperand(1);
23355 SDValue CCVal = N->getOperand(2);
23356 SDValue Cmp = N->getOperand(3);
23357
23358 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
23359 unsigned CC = CCVal->getAsZExtVal();
23360 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
23361 return SDValue();
23362
23363 unsigned CmpOpc = Cmp.getOpcode();
23364 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
23365 return SDValue();
23366
23367 // Only attempt folding if there is only one use of the flag and no use of the
23368 // value.
23369 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
23370 return SDValue();
23371
23372 SDValue LHS = Cmp.getOperand(0);
23373 SDValue RHS = Cmp.getOperand(1);
23374
23375 assert(LHS.getValueType() == RHS.getValueType() &&
23376 "Expected the value type to be the same for both operands!");
23377 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
23378 return SDValue();
23379
23380 if (isNullConstant(LHS))
23381 std::swap(LHS, RHS);
23382
23383 if (!isNullConstant(RHS))
23384 return SDValue();
23385
23386 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
23387 LHS.getOpcode() == ISD::SRL)
23388 return SDValue();
23389
23390 // Fold the compare into the branch instruction.
23391 SDValue BR;
23392 if (CC == AArch64CC::EQ)
23393 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
23394 else
23395 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
23396
23397 // Do not add new nodes to DAG combiner worklist.
23398 DCI.CombineTo(N, BR, false);
23399
23400 return SDValue();
23401}
23402
23403static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
23404 unsigned CC = N->getConstantOperandVal(2);
23405 SDValue SUBS = N->getOperand(3);
23406 SDValue Zero, CTTZ;
23407
23408 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
23409 Zero = N->getOperand(0);
23410 CTTZ = N->getOperand(1);
23411 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
23412 Zero = N->getOperand(1);
23413 CTTZ = N->getOperand(0);
23414 } else
23415 return SDValue();
23416
23417 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
23418 (CTTZ.getOpcode() == ISD::TRUNCATE &&
23419 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
23420 return SDValue();
23421
23422 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
23423 "Illegal type in CTTZ folding");
23424
23425 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
23426 return SDValue();
23427
23428 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
23429 ? CTTZ.getOperand(0).getOperand(0)
23430 : CTTZ.getOperand(0);
23431
23432 if (X != SUBS.getOperand(0))
23433 return SDValue();
23434
23435 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
23436 ? CTTZ.getOperand(0).getValueSizeInBits()
23437 : CTTZ.getValueSizeInBits();
23438 SDValue BitWidthMinusOne =
23439 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
23440 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
23441 BitWidthMinusOne);
23442}
23443
23444// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
23445// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
23446// Where x and y are constants and x != y
23447
23448// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
23449// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
23450// Where x and y are constants and x != y
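// For example (illustrative): (CSEL a b EQ (CMP (CSEL 1 0 GT cond) 1))
// simplifies to (CSEL a b GT cond), because the inner CSEL is equal to 1
// exactly when its own condition GT holds.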
23451static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
23452 SDValue L = Op->getOperand(0);
23453 SDValue R = Op->getOperand(1);
23454 AArch64CC::CondCode OpCC =
23455 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
23456
23457 SDValue OpCmp = Op->getOperand(3);
23458 if (!isCMP(OpCmp))
23459 return SDValue();
23460
23461 SDValue CmpLHS = OpCmp.getOperand(0);
23462 SDValue CmpRHS = OpCmp.getOperand(1);
23463
23464 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
23465 std::swap(CmpLHS, CmpRHS);
23466 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
23467 return SDValue();
23468
23469 SDValue X = CmpLHS->getOperand(0);
23470 SDValue Y = CmpLHS->getOperand(1);
23471 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
23472 return SDValue();
23473 }
23474
23475 // If one of the constants is an opaque constant, the x and y SDNodes can still
23476 // be different even though the real values are the same. So check the APInts
23477 // here to make sure the code is correct.
23478 ConstantSDNode *CX = cast<ConstantSDNode>(X);
23479 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
23480 if (CX->getAPIntValue() == CY->getAPIntValue())
23481 return SDValue();
23482
23483 AArch64CC::CondCode CC =
23484 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
23485 SDValue Cond = CmpLHS->getOperand(3);
23486
23487 if (CmpRHS == Y)
23488 CC = AArch64CC::getInvertedCondCode(CC);
23489 else if (CmpRHS != X)
23490 return SDValue();
23491
23492 if (OpCC == AArch64CC::NE)
23493 CC = AArch64CC::getInvertedCondCode(CC);
23494 else if (OpCC != AArch64CC::EQ)
23495 return SDValue();
23496
23497 SDLoc DL(Op);
23498 EVT VT = Op->getValueType(0);
23499
23500 SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
23501 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
23502}
23503
23504// Optimize CSEL instructions
23505static SDValue performCSELCombine(SDNode *N,
23506 TargetLowering::DAGCombinerInfo &DCI,
23507 SelectionDAG &DAG) {
23508 // CSEL x, x, cc -> x
23509 if (N->getOperand(0) == N->getOperand(1))
23510 return N->getOperand(0);
23511
23512 if (SDValue R = foldCSELOfCSEL(N, DAG))
23513 return R;
23514
23515 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
23516 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
23517 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
23518 return Folded;
23519
23520 return performCONDCombine(N, DCI, DAG, 2, 3);
23521}
23522
23523// Try to re-use an already extended operand of a vector SetCC feeding an
23524// extended select. Doing so avoids requiring another full extension of the
23525// SET_CC result when lowering the select.
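// For example (an illustrative sketch): if the v8i8 operand of the setcc is
// already sign-extended to v8i16 elsewhere in the DAG, the other operand is a
// constant splat, and every user is a v8i16 vselect, the comparison can be
// performed directly on the v8i16 values, so the i1 mask needs no further
// extension.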
23526static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
23527 EVT Op0MVT = Op->getOperand(0).getValueType();
23528 if (!Op0MVT.isVector() || Op->use_empty())
23529 return SDValue();
23530
23531 // Make sure that all uses of Op are VSELECTs with result matching types where
23532 // the result type has a larger element type than the SetCC operand.
23533 SDNode *FirstUse = *Op->use_begin();
23534 if (FirstUse->getOpcode() != ISD::VSELECT)
23535 return SDValue();
23536 EVT UseMVT = FirstUse->getValueType(0);
23537 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
23538 return SDValue();
23539 if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
23540 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
23541 }))
23542 return SDValue();
23543
23544 APInt V;
23545 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
23546 return SDValue();
23547
23548 SDLoc DL(Op);
23549 SDValue Op0ExtV;
23550 SDValue Op1ExtV;
23551 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
23552 // Check if the first operand of the SET_CC is already extended. If it is,
23553 // split the SET_CC and re-use the extended version of the operand.
23554 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
23555 Op->getOperand(0));
23556 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
23557 Op->getOperand(0));
23558 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23559 Op0ExtV = SDValue(Op0SExt, 0);
23560 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
23561 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23562 Op0ExtV = SDValue(Op0ZExt, 0);
23563 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
23564 } else
23565 return SDValue();
23566
23567 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
23568 Op0ExtV, Op1ExtV, Op->getOperand(2));
23569}
23570
23571static SDValue
23572performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23573 SelectionDAG &DAG) {
23574 SDValue Vec = N->getOperand(0);
23575 if (DCI.isBeforeLegalize() &&
23576 Vec.getValueType().getVectorElementType() == MVT::i1 &&
23579 SDLoc DL(N);
23580 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
23581 DAG);
23582 }
23583
23584 return SDValue();
23585}
23586
23587static SDValue performSETCCCombine(SDNode *N,
23588 TargetLowering::DAGCombinerInfo &DCI,
23589 SelectionDAG &DAG) {
23590 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
23591 SDValue LHS = N->getOperand(0);
23592 SDValue RHS = N->getOperand(1);
23593 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
23594 SDLoc DL(N);
23595 EVT VT = N->getValueType(0);
23596
23597 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
23598 return V;
23599
23600 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
23601 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
23602 LHS->getOpcode() == AArch64ISD::CSEL &&
23603 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
23604 LHS->hasOneUse()) {
23605 // Invert CSEL's condition.
23606 auto OldCond =
23607 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
23608 auto NewCond = getInvertedCondCode(OldCond);
23609
23610 // csel 0, 1, !cond, X
23611 SDValue CSEL =
23612 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
23613 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
23614 LHS.getOperand(3));
23615 return DAG.getZExtOrTrunc(CSEL, DL, VT);
23616 }
23617
23618 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
23619 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
23620 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
23621 LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() &&
23622 LHS->hasOneUse()) {
23623 EVT TstVT = LHS->getValueType(0);
23624 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
23625 // This pattern will be optimized better in emitComparison.
23626 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
23627 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
23628 DAG.getConstant(TstImm, DL, TstVT));
23629 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
23630 }
23631 }
23632
23633 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
23634 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
23635 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
23636 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
23637 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
23638 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
23639 (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
23640 LHS->getOpcode() == ISD::BITCAST) {
23641 EVT ToVT = LHS->getValueType(0);
23642 EVT FromVT = LHS->getOperand(0).getValueType();
23643 if (FromVT.isFixedLengthVector() &&
23644 FromVT.getVectorElementType() == MVT::i1) {
23645 bool IsNull = isNullConstant(RHS);
23646 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
23647 DL, MVT::i1, LHS->getOperand(0));
23648 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
23649 LHS);
23650 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
23651 }
23652 }
23653
23654 // Try to perform the memcmp when the result is tested for [in]equality with 0
23655 if (SDValue V = performOrXorChainCombine(N, DAG))
23656 return V;
23657
23658 return SDValue();
23659}
23660
23661// Replace a flag-setting operator (eg ANDS) with the generic version
23662// (eg AND) if the flag is unused.
23663static SDValue performFlagSettingCombine(SDNode *N,
23664 TargetLowering::DAGCombinerInfo &DCI,
23665 unsigned GenericOpcode) {
23666 SDLoc DL(N);
23667 SDValue LHS = N->getOperand(0);
23668 SDValue RHS = N->getOperand(1);
23669 EVT VT = N->getValueType(0);
23670
23671 // If the flag result isn't used, convert back to a generic opcode.
23672 if (!N->hasAnyUseOfValue(1)) {
23673 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
23674 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
23675 DL);
23676 }
23677
23678 // Combine identical generic nodes into this node, re-using the result.
23679 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
23680 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
23681 DCI.CombineTo(Generic, SDValue(N, 0));
23682
23683 return SDValue();
23684}
23685
23686static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
23687 // setcc_merge_zero pred
23688 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
23689 // => extract_subvector (inner setcc_merge_zero)
23690 SDValue Pred = N->getOperand(0);
23691 SDValue LHS = N->getOperand(1);
23692 SDValue RHS = N->getOperand(2);
23693 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23694
23695 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
23696 LHS->getOpcode() != ISD::SIGN_EXTEND)
23697 return SDValue();
23698
23699 SDValue Extract = LHS->getOperand(0);
23700 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
23701 Extract->getValueType(0) != N->getValueType(0) ||
23702 Extract->getConstantOperandVal(1) != 0)
23703 return SDValue();
23704
23705 SDValue InnerSetCC = Extract->getOperand(0);
23706 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
23707 return SDValue();
23708
23709 // By this point we've effectively got
23710 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
23711 // lanes are already zero then the trunc(sext()) sequence is redundant and we
23712 // can operate on A directly.
23713 SDValue InnerPred = InnerSetCC.getOperand(0);
23714 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
23715 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
23716 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
23717 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
23718 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
23719 return Extract;
23720
23721 return SDValue();
23722}
23723
23724static SDValue
23725performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
23726 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23727 "Unexpected opcode!");
23728
23729 SelectionDAG &DAG = DCI.DAG;
23730 SDValue Pred = N->getOperand(0);
23731 SDValue LHS = N->getOperand(1);
23732 SDValue RHS = N->getOperand(2);
23733 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23734
23735 if (SDValue V = performSetCCPunpkCombine(N, DAG))
23736 return V;
23737
23738 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
23739 LHS->getOpcode() == ISD::SIGN_EXTEND &&
23740 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
23741 // setcc_merge_zero(
23742 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
23743 // => setcc_merge_zero(pred, ...)
23744 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23745 LHS->getOperand(0)->getOperand(0) == Pred)
23746 return LHS->getOperand(0);
23747
23748 // setcc_merge_zero(
23749 // all_active, extend(nxvNi1 ...), != splat(0))
23750 // -> nxvNi1 ...
23751 if (isAllActivePredicate(DAG, Pred))
23752 return LHS->getOperand(0);
23753
23754 // setcc_merge_zero(
23755 // pred, extend(nxvNi1 ...), != splat(0))
23756 // -> nxvNi1 and(pred, ...)
23757 if (DCI.isAfterLegalizeDAG())
23758 // Do this after legalization to allow more folds on setcc_merge_zero
23759 // to be recognized.
23760 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
23761 LHS->getOperand(0), Pred);
23762 }
23763
23764 return SDValue();
23765}
23766
23767// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
23768// as well as whether the test should be inverted. This code is required to
23769// catch these cases (as opposed to standard dag combines) because
23770// AArch64ISD::TBZ is matched during legalization.
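// For example (illustrative): (tbz (shl (and x, 0xff), 2), 5) can be rewritten
// step by step to (tbz x, 3): the shift moves the tested bit down by 2 and the
// 0xff mask does not clear bit 3, so the AND can be looked through as well.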
23771static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
23772 SelectionDAG &DAG) {
23773
23774 if (!Op->hasOneUse())
23775 return Op;
23776
23777 // We don't handle undef/constant-fold cases below, as they should have
23778 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
23779 // etc.)
23780
23781 // (tbz (trunc x), b) -> (tbz x, b)
23782 // This case is just here to enable more of the below cases to be caught.
23783 if (Op->getOpcode() == ISD::TRUNCATE &&
23784 Bit < Op->getValueType(0).getSizeInBits()) {
23785 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23786 }
23787
23788 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
23789 if (Op->getOpcode() == ISD::ANY_EXTEND &&
23790 Bit < Op->getOperand(0).getValueSizeInBits()) {
23791 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23792 }
23793
23794 if (Op->getNumOperands() != 2)
23795 return Op;
23796
23797 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
23798 if (!C)
23799 return Op;
23800
23801 switch (Op->getOpcode()) {
23802 default:
23803 return Op;
23804
23805 // (tbz (and x, m), b) -> (tbz x, b)
23806 case ISD::AND:
23807 if ((C->getZExtValue() >> Bit) & 1)
23808 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23809 return Op;
23810
23811 // (tbz (shl x, c), b) -> (tbz x, b-c)
23812 case ISD::SHL:
23813 if (C->getZExtValue() <= Bit &&
23814 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23815 Bit = Bit - C->getZExtValue();
23816 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23817 }
23818 return Op;
23819
23820 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
23821 case ISD::SRA:
23822 Bit = Bit + C->getZExtValue();
23823 if (Bit >= Op->getValueType(0).getSizeInBits())
23824 Bit = Op->getValueType(0).getSizeInBits() - 1;
23825 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23826
23827 // (tbz (srl x, c), b) -> (tbz x, b+c)
23828 case ISD::SRL:
23829 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23830 Bit = Bit + C->getZExtValue();
23831 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23832 }
23833 return Op;
23834
23835 // (tbz (xor x, -1), b) -> (tbnz x, b)
23836 case ISD::XOR:
23837 if ((C->getZExtValue() >> Bit) & 1)
23838 Invert = !Invert;
23839 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23840 }
23841}
23842
23843// Optimize test single bit zero/non-zero and branch.
23844static SDValue performTBZCombine(SDNode *N,
23845 TargetLowering::DAGCombinerInfo &DCI,
23846 SelectionDAG &DAG) {
23847 unsigned Bit = N->getConstantOperandVal(2);
23848 bool Invert = false;
23849 SDValue TestSrc = N->getOperand(1);
23850 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
23851
23852 if (TestSrc == NewTestSrc)
23853 return SDValue();
23854
23855 unsigned NewOpc = N->getOpcode();
23856 if (Invert) {
23857 if (NewOpc == AArch64ISD::TBZ)
23858 NewOpc = AArch64ISD::TBNZ;
23859 else {
23860 assert(NewOpc == AArch64ISD::TBNZ);
23861 NewOpc = AArch64ISD::TBZ;
23862 }
23863 }
23864
23865 SDLoc DL(N);
23866 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
23867 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
23868}
23869
23870// Swap vselect operands where it may allow a predicated operation to achieve
23871// the `sel`.
23872//
23873// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
23874// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
23875static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
23876 auto SelectA = N->getOperand(1);
23877 auto SelectB = N->getOperand(2);
23878 auto NTy = N->getValueType(0);
23879
23880 if (!NTy.isScalableVector())
23881 return SDValue();
23882 SDValue SetCC = N->getOperand(0);
23883 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
23884 return SDValue();
23885
23886 switch (SelectB.getOpcode()) {
23887 default:
23888 return SDValue();
23889 case ISD::FMUL:
23890 case ISD::FSUB:
23891 case ISD::FADD:
23892 break;
23893 }
23894 if (SelectA != SelectB.getOperand(0))
23895 return SDValue();
23896
23897 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
23898 ISD::CondCode InverseCC =
23899 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
23900 auto InverseSetCC =
23901 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
23902 SetCC.getOperand(1), InverseCC);
23903
23904 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
23905 {InverseSetCC, SelectB, SelectA});
23906}
23907
23908// vselect (v1i1 setcc) ->
23909// vselect (v1iXX setcc) (XX is the size of the compared operand type)
23910// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
23911// condition. If it could legalize "VSELECT v1i1" correctly, there would be no
23912// need to combine such VSELECTs.
23913static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
23914 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
23915 return SwapResult;
23916
23917 SDValue N0 = N->getOperand(0);
23918 EVT CCVT = N0.getValueType();
23919
23920 if (isAllActivePredicate(DAG, N0))
23921 return N->getOperand(1);
23922
23923 if (isAllInactivePredicate(N0))
23924 return N->getOperand(2);
23925
23926 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
23927 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
23928 // supported types.
23929 SDValue SetCC = N->getOperand(0);
23930 if (SetCC.getOpcode() == ISD::SETCC &&
23931 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
23932 SDValue CmpLHS = SetCC.getOperand(0);
23933 EVT VT = CmpLHS.getValueType();
23934 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
23935 SDNode *SplatLHS = N->getOperand(1).getNode();
23936 SDNode *SplatRHS = N->getOperand(2).getNode();
23937 APInt SplatLHSVal;
23938 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
23939 VT.isSimple() &&
23940 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
23941 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
23942 VT.getSimpleVT().SimpleTy) &&
23943 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
23944 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
23945 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
23946 unsigned NumElts = VT.getVectorNumElements();
23947 SmallVector<SDValue> Ops(
23948 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
23949 VT.getScalarType()));
23950 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
23951
23952 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
23953 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
23954 return Or;
23955 }
23956 }
23957
23958 EVT CmpVT = N0.getOperand(0).getValueType();
23959 if (N0.getOpcode() != ISD::SETCC ||
23961 CCVT.getVectorElementType() != MVT::i1 ||
23963 return SDValue();
23964
23965 EVT ResVT = N->getValueType(0);
23966 // Only combine when the result type is of the same size as the compared
23967 // operands.
23968 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
23969 return SDValue();
23970
23971 SDValue IfTrue = N->getOperand(1);
23972 SDValue IfFalse = N->getOperand(2);
23973 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
23974 N0.getOperand(0), N0.getOperand(1),
23975 cast<CondCodeSDNode>(N0.getOperand(2))->get());
23976 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
23977 IfTrue, IfFalse);
23978}
23979
23980/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
23981/// the compare-mask instructions rather than going via NZCV, even if LHS and
23982/// RHS are really scalar. This replaces any scalar setcc in the above pattern
23983/// with a vector one followed by a DUP shuffle on the result.
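/// For example (an illustrative sketch): for (select (setcc f64 a, b, lt),
/// v2f64 x, v2f64 y) the scalar operands are first wrapped with
/// SCALAR_TO_VECTOR, a v2i64 compare mask is produced, lane 0 of that mask is
/// duplicated across the vector, and the select is then emitted as an ordinary
/// vector bit-select instead of a CSEL fed through NZCV.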
23984static SDValue performSelectCombine(SDNode *N,
23985 TargetLowering::DAGCombinerInfo &DCI) {
23986 SelectionDAG &DAG = DCI.DAG;
23987 SDValue N0 = N->getOperand(0);
23988 EVT ResVT = N->getValueType(0);
23989
23990 if (N0.getOpcode() != ISD::SETCC)
23991 return SDValue();
23992
23993 if (ResVT.isScalableVT())
23994 return SDValue();
23995
23996 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
23997 // scalar SetCCResultType. We also don't expect vectors, because we assume
23998 // that selects fed by vector SETCCs are canonicalized to VSELECT.
23999 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
24000 "Scalar-SETCC feeding SELECT has unexpected result type!");
24001
24002 // If NumMaskElts == 0, the comparison is larger than the select result. The
24003 // largest real NEON comparison is 64 bits per lane, which means the result is
24004 // at most 32 bits and an illegal vector. Just bail out for now.
24005 EVT SrcVT = N0.getOperand(0).getValueType();
24006
24007 // Don't try to do this optimization when the setcc itself has i1 operands.
24008 // There are no legal vectors of i1, so this would be pointless. v1f16 is
24009 // ruled out to prevent the creation of setcc nodes that need to be scalarized.
24010 if (SrcVT == MVT::i1 ||
24011 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
24012 return SDValue();
24013
24014 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
24015 if (!ResVT.isVector() || NumMaskElts == 0)
24016 return SDValue();
24017
24018 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
24019 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
24020
24021 // Also bail out if the vector CCVT isn't the same size as ResVT.
24022 // This can happen if the SETCC operand size doesn't divide the ResVT size
24023 // (e.g., f64 vs v3f32).
24024 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
24025 return SDValue();
24026
24027 // Make sure we didn't create illegal types, if we're not supposed to.
24028 assert(DCI.isBeforeLegalize() ||
24029 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
24030
24031 // First perform a vector comparison, where lane 0 is the one we're interested
24032 // in.
24033 SDLoc DL(N0);
24034 SDValue LHS =
24035 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
24036 SDValue RHS =
24037 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
24038 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
24039
24040 // Now duplicate the comparison mask we want across all other lanes.
24041 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
24042 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
24043 Mask = DAG.getNode(ISD::BITCAST, DL,
24044 ResVT.changeVectorElementTypeToInteger(), Mask);
24045
24046 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
24047}
24048
24049static SDValue performDUPCombine(SDNode *N,
24050 TargetLowering::DAGCombinerInfo &DCI) {
24051 EVT VT = N->getValueType(0);
24052 SDLoc DL(N);
24053 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
24054 // 128bit vector version.
24055 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
24056 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
24057 SmallVector<SDValue> Ops(N->ops());
24058 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
24059 DCI.DAG.getVTList(LVT), Ops)) {
24060 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
24061 DCI.DAG.getConstant(0, DL, MVT::i64));
24062 }
24063 }
24064
24065 if (N->getOpcode() == AArch64ISD::DUP) {
24066 if (DCI.isAfterLegalizeDAG()) {
24067 // If scalar dup's operand is extract_vector_elt, try to combine them into
24068 // duplane. For example,
24069 //
24070 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
24071 // t18: v4i32 = AArch64ISD::DUP t21
24072 // ==>
24073 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
24074 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
24075 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24076 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
24077 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
24078 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
24079 EXTRACT_VEC_ELT.getOperand(1));
24080 }
24081 }
24082 }
24083
24084 return performPostLD1Combine(N, DCI, false);
24085 }
24086
24087 return SDValue();
24088}
24089
24090/// Get rid of unnecessary NVCASTs (that don't change the type).
24091static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
24092 if (N->getValueType(0) == N->getOperand(0).getValueType())
24093 return N->getOperand(0);
24094 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
24095 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
24096 N->getOperand(0).getOperand(0));
24097
24098 return SDValue();
24099}
24100
24101// If all users of the globaladdr are of the form (globaladdr + constant), find
24102// the smallest constant, fold it into the globaladdr's offset and rewrite the
24103// globaladdr as (globaladdr + constant) - constant.
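// For example (illustrative): if the only uses are (g + 8) and (g + 16), the
// smallest addend 8 is folded into the global, producing ((g + 8) - 8); the
// original adds can then fold to (g + 8) and ((g + 8) + 8), letting the common
// offset be materialised as part of the address itself.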
24104static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
24105 const AArch64Subtarget *Subtarget,
24106 const TargetMachine &TM) {
24107 auto *GN = cast<GlobalAddressSDNode>(N);
24108 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
24109 AArch64II::MO_NO_FLAG)
24110 return SDValue();
24111
24112 uint64_t MinOffset = -1ull;
24113 for (SDNode *N : GN->uses()) {
24114 if (N->getOpcode() != ISD::ADD)
24115 return SDValue();
24116 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
24117 if (!C)
24118 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
24119 if (!C)
24120 return SDValue();
24121 MinOffset = std::min(MinOffset, C->getZExtValue());
24122 }
24123 uint64_t Offset = MinOffset + GN->getOffset();
24124
24125 // Require that the new offset is larger than the existing one. Otherwise, we
24126 // can end up oscillating between two possible DAGs, for example,
24127 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
24128 if (Offset <= uint64_t(GN->getOffset()))
24129 return SDValue();
24130
24131 // Check whether folding this offset is legal. It must not go out of bounds of
24132 // the referenced object to avoid violating the code model, and must be
24133 // smaller than 2^20 because this is the largest offset expressible in all
24134 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
24135 // stores an immediate signed 21 bit offset.)
24136 //
24137 // This check also prevents us from folding negative offsets, which will end
24138 // up being treated in the same way as large positive ones. They could also
24139 // cause code model violations, and aren't really common enough to matter.
24140 if (Offset >= (1 << 20))
24141 return SDValue();
24142
24143 const GlobalValue *GV = GN->getGlobal();
24144 Type *T = GV->getValueType();
24145 if (!T->isSized() ||
24147 return SDValue();
24148
24149 SDLoc DL(GN);
24150 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
24151 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
24152 DAG.getConstant(MinOffset, DL, MVT::i64));
24153}
24154
24155static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
24156 const AArch64Subtarget *Subtarget) {
24157 SDValue BR = N->getOperand(0);
24158 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
24159 !BR.getValueType().isScalarInteger())
24160 return SDValue();
24161
24162 SDLoc DL(N);
24163 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
24164}
24165
24166// Turns the vector of indices into a vector of byte offsets by scaling Offset
24167// by (BitWidth / 8).
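// For example (illustrative): with 32-bit elements an index of 3 becomes the
// byte offset 12, i.e. a left shift by log2(4) == 2.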
24168static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
24169 SDLoc DL, unsigned BitWidth) {
24170 assert(Offset.getValueType().isScalableVector() &&
24171 "This method is only for scalable vectors of offsets");
24172
24173 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
24174 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
24175
24176 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
24177}
24178
24179/// Check if the value of \p OffsetInBytes can be used as an immediate for
24180/// the gather load/prefetch and scatter store instructions with vector base and
24181/// immediate offset addressing mode:
24182///
24183/// [<Zn>.[S|D]{, #<imm>}]
24184///
24185/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
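/// For example (illustrative): with 4-byte elements the accepted immediates
/// are 0, 4, 8, ..., 124, i.e. multiples of the element size up to 31 * 4.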
24186inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
24187 unsigned ScalarSizeInBytes) {
24188 // The immediate is not a multiple of the scalar size.
24189 if (OffsetInBytes % ScalarSizeInBytes)
24190 return false;
24191
24192 // The immediate is out of range.
24193 if (OffsetInBytes / ScalarSizeInBytes > 31)
24194 return false;
24195
24196 return true;
24197}
24198
24199/// Check if the value of \p Offset represents a valid immediate for the SVE
24200/// gather load/prefetch and scatter store instructions with vector base and
24201/// immediate offset addressing mode:
24202///
24203/// [<Zn>.[S|D]{, #<imm>}]
24204///
24205/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
24207 unsigned ScalarSizeInBytes) {
24208 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
24209 return OffsetConst && isValidImmForSVEVecImmAddrMode(
24210 OffsetConst->getZExtValue(), ScalarSizeInBytes);
24211}
24212
24214 unsigned Opcode,
24215 bool OnlyPackedOffsets = true) {
24216 const SDValue Src = N->getOperand(2);
24217 const EVT SrcVT = Src->getValueType(0);
24218 assert(SrcVT.isScalableVector() &&
24219 "Scatter stores are only possible for SVE vectors");
24220
24221 SDLoc DL(N);
24222 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
24223
24224 // Make sure that source data will fit into an SVE register
24225 if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
24226 return SDValue();
24227
24228 // For FPs, ACLE only supports _packed_ single and double precision types.
24229 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
24230 if (SrcElVT.isFloatingPoint())
24231 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
24232 ((Opcode != AArch64ISD::SST1Q_PRED &&
24233 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
24234 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
24235 return SDValue();
24236
24237 // Depending on the addressing mode, this is either a pointer or a vector of
24238 // pointers (that fits into one register)
24239 SDValue Base = N->getOperand(4);
24240 // Depending on the addressing mode, this is either a single offset or a
24241 // vector of offsets (that fits into one register)
24242 SDValue Offset = N->getOperand(5);
24243
24244 // For "scalar + vector of indices", just scale the indices. This only
24245 // applies to non-temporal scatters because there's no instruction that takes
24246 // indices.
24247 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
24248 Offset =
24249 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcVT.getScalarSizeInBits());
24250 Opcode = AArch64ISD::SSTNT1_PRED;
24251 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
24252 Offset =
24253 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcVT.getScalarSizeInBits());
24254 Opcode = AArch64ISD::SST1Q_PRED;
24255 }
24256
24257 // In the case of non-temporal scatter stores there's only one SVE instruction
24258 // per data-size: "scalar + vector", i.e.
24259 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
24260 // Since we do have intrinsics that allow the arguments to be in a different
24261 // order, we may need to swap them to match the spec.
24262 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
24263 Offset.getValueType().isVector())
24265
24266 // SST1_IMM requires that the offset is an immediate that is:
24267 // * a multiple of #SizeInBytes,
24268 // * in the range [0, 31 x #SizeInBytes],
24269 // where #SizeInBytes is the size in bytes of the stored items. For
24270 // immediates outside that range and non-immediate scalar offsets use SST1 or
24271 // SST1_UXTW instead.
24272 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
24273 if (!isValidImmForSVEVecImmAddrMode(Offset,
24274 SrcVT.getScalarSizeInBits() / 8)) {
24275 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
24276 Opcode = AArch64ISD::SST1_UXTW_PRED;
24277 else
24278 Opcode = AArch64ISD::SST1_PRED;
24279 std::swap(Base, Offset);
24280
24281 }
24282 }
24283
24284 auto &TLI = DAG.getTargetLoweringInfo();
24285 if (!TLI.isTypeLegal(Base.getValueType()))
24286 return SDValue();
24287
24288 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
24289 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
24290 // nxv2i64. Legalize accordingly.
24291 if (!OnlyPackedOffsets &&
24292 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
24293 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
24294
24295 if (!TLI.isTypeLegal(Offset.getValueType()))
24296 return SDValue();
24297
24298 // Source value type that is representable in hardware
24299 EVT HwSrcVt = getSVEContainerType(SrcVT);
24300
24301 // Keep the original type of the input data to store - this is needed to be
24302 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
24303 // FP values we want the integer equivalent, so just use HwSrcVt.
24304 SDValue InputVT = DAG.getValueType(SrcVT);
24305 if (SrcVT.isFloatingPoint())
24306 InputVT = DAG.getValueType(HwSrcVt);
24307
24308 SDVTList VTs = DAG.getVTList(MVT::Other);
24309 SDValue SrcNew;
24310
24311 if (Src.getValueType().isFloatingPoint())
24312 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
24313 else
24314 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
24315
24316 SDValue Ops[] = {N->getOperand(0), // Chain
24317 SrcNew,
24318 N->getOperand(3), // Pg
24319 Base,
24320 Offset,
24321 InputVT};
24322
24323 return DAG.getNode(Opcode, DL, VTs, Ops);
24324}
24325
24327 unsigned Opcode,
24328 bool OnlyPackedOffsets = true) {
24329 const EVT RetVT = N->getValueType(0);
24330 assert(RetVT.isScalableVector() &&
24331 "Gather loads are only possible for SVE vectors");
24332
24333 SDLoc DL(N);
24334
24335 // Make sure that the loaded data will fit into an SVE register
24336 if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
24337 return SDValue();
24338
24339 // Depending on the addressing mode, this is either a pointer or a vector of
24340 // pointers (that fits into one register)
24341 SDValue Base = N->getOperand(3);
24342 // Depending on the addressing mode, this is either a single offset or a
24343 // vector of offsets (that fits into one register)
24344 SDValue Offset = N->getOperand(4);
24345
24346 // For "scalar + vector of indices", scale the indices to obtain unscaled
24347 // offsets. This applies to non-temporal and quadword gathers, which do not
24348 // have an addressing mode with scaled offset.
24349 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
24350 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
24351 RetVT.getScalarSizeInBits());
24352 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
24353 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
24354 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
24355 RetVT.getScalarSizeInBits());
24356 Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
24357 }
24358
24359 // In the case of non-temporal gather loads and quadword gather loads there's
24360 // only one addressing mode : "vector + scalar", e.g.
24361 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
24362 // Since we do have intrinsics that allow the arguments to be in a different
24363 // order, we may need to swap them to match the spec.
24364 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
24365 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
24366 Offset.getValueType().isVector())
24367 std::swap(Base, Offset);
24368
24369 // GLD{FF}1_IMM requires that the offset is an immediate that is:
24370 // * a multiple of #SizeInBytes,
24371 // * in the range [0, 31 x #SizeInBytes],
24372 // where #SizeInBytes is the size in bytes of the loaded items. For
24373 // immediates outside that range and non-immediate scalar offsets use
24374 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
24375 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
24376 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
24377 if (!isValidImmForSVEVecImmAddrMode(Offset,
24378 RetVT.getScalarSizeInBits() / 8)) {
24379 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
24380 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
24381 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
24382 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
24383 else
24384 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
24385 ? AArch64ISD::GLD1_MERGE_ZERO
24386 : AArch64ISD::GLDFF1_MERGE_ZERO;
24387
24388 std::swap(Base, Offset);
24389 }
24390 }
24391
24392 auto &TLI = DAG.getTargetLoweringInfo();
24393 if (!TLI.isTypeLegal(Base.getValueType()))
24394 return SDValue();
24395
24396 // Some gather load variants allow unpacked offsets, but only as nxv2i32
24397 // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
24398 // nxv2i64. Legalize accordingly.
24399 if (!OnlyPackedOffsets &&
24400 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
24401 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
24402
24403 // Return value type that is representable in hardware
24404 EVT HwRetVt = getSVEContainerType(RetVT);
24405
24406 // Keep the original output value type around - this is needed to be able to
24407 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
24408 // values we want the integer equivalent, so just use HwRetVT.
24409 SDValue OutVT = DAG.getValueType(RetVT);
24410 if (RetVT.isFloatingPoint())
24411 OutVT = DAG.getValueType(HwRetVt);
24412
24413 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
24414 SDValue Ops[] = {N->getOperand(0), // Chain
24415 N->getOperand(2), // Pg
24416 Base, Offset, OutVT};
24417
24418 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
24419 SDValue LoadChain = SDValue(Load.getNode(), 1);
24420
24421 if (RetVT.isInteger() && (RetVT != HwRetVt))
24422 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
24423
24424 // If the original return value was FP, bitcast accordingly. Doing it here
24425 // means that we can avoid adding TableGen patterns for FPs.
24426 if (RetVT.isFloatingPoint())
24427 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
24428
24429 return DAG.getMergeValues({Load, LoadChain}, DL);
24430}
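// For illustration (a hypothetical example, not taken from the sources or a
// test): an SVE gather intrinsic such as
//   llvm.aarch64.sve.ld1.gather.index.nxv2i64(pg, %base, %indices)
// is rewritten here into a single GLD1*_MERGE_ZERO node whose last operand
// records the original memory VT, so instruction selection can pick the right
// LD1B/LD1H/LD1W/LD1D form. FP results reuse the integer node plus a bitcast.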
24431
24432static SDValue
24433performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24434 SelectionDAG &DAG) {
24435 SDLoc DL(N);
24436 SDValue Src = N->getOperand(0);
24437 unsigned Opc = Src->getOpcode();
24438
24439 // Sign extend of an unsigned unpack -> signed unpack
24440 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
24441
24442 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
24443 : AArch64ISD::SUNPKLO;
24444 
24445 // Push the sign extend to the operand of the unpack
24446 // This is necessary where, for example, the operand of the unpack
24447 // is another unpack:
24448 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
24449 // ->
24450 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
24451 // ->
24452 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
24453 SDValue ExtOp = Src->getOperand(0);
24454 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
24455 EVT EltTy = VT.getVectorElementType();
24456 (void)EltTy;
24457
24458 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
24459 "Sign extending from an invalid type");
24460
24461 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
24462
24463 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
24464 ExtOp, DAG.getValueType(ExtVT));
24465
24466 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
24467 }
24468
24469 if (DCI.isBeforeLegalizeOps())
24470 return SDValue();
24471
24472 if (!EnableCombineMGatherIntrinsics)
24473 return SDValue();
24474
24475 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
24476 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
24477 unsigned NewOpc;
24478 unsigned MemVTOpNum = 4;
24479 switch (Opc) {
24482 MemVTOpNum = 3;
24483 break;
24486 MemVTOpNum = 3;
24487 break;
24490 MemVTOpNum = 3;
24491 break;
24494 break;
24497 break;
24500 break;
24503 break;
24506 break;
24509 break;
24512 break;
24515 break;
24518 break;
24521 break;
24524 break;
24527 break;
24530 break;
24533 break;
24536 break;
24537 default:
24538 return SDValue();
24539 }
24540
24541 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
24542 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
24543
24544 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
24545 return SDValue();
24546
24547 EVT DstVT = N->getValueType(0);
24548 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
24549
24550 SmallVector<SDValue, 5> Ops;
24551 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
24552 Ops.push_back(Src->getOperand(I));
24553
24554 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
24555 DCI.CombineTo(N, ExtLoad);
24556 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
24557
24558 // Return N so it doesn't get rechecked
24559 return SDValue(N, 0);
24560}
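// Illustrative sketch of the combine above (assumed example, not from a test):
//   t1: nxv2i64 = GLD1_MERGE_ZERO chain, pg, base, offs, ValVT:nxv2i8
//   t2: nxv2i64 = sign_extend_inreg t1, ValVT:nxv2i8
// becomes the signed gather
//   t3: nxv2i64 = GLD1S_MERGE_ZERO chain, pg, base, offs, ValVT:nxv2i8
// which selects to LD1SB instead of LD1B followed by an explicit sign extend.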
24561
24562/// Legalize the gather prefetch (scalar + vector addressing mode) when the
24563/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
24564/// != nxv2i32) do not need legalization.
24565static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
24566 const unsigned OffsetPos = 4;
24567 SDValue Offset = N->getOperand(OffsetPos);
24568
24569 // Not an unpacked vector, bail out.
24570 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
24571 return SDValue();
24572
24573 // Extend the unpacked offset vector to 64-bit lanes.
24574 SDLoc DL(N);
24575 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
24576 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24577 // Replace the offset operand with the 64-bit one.
24578 Ops[OffsetPos] = Offset;
24579
24580 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24581}
24582
24583/// Combines a node carrying the intrinsic
24584/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
24585/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
24586/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
24587/// SVE gather prefetch instruction with vector plus immediate addressing mode.
24588static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
24589 unsigned ScalarSizeInBytes) {
24590 const unsigned ImmPos = 4, OffsetPos = 3;
24591 // No need to combine the node if the immediate is valid...
24592 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
24593 return SDValue();
24594
24595 // ...otherwise swap the offset base with the offset...
24596 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24597 std::swap(Ops[ImmPos], Ops[OffsetPos]);
24598 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
24599 // `aarch64_sve_prfb_gather_uxtw_index`.
24600 SDLoc DL(N);
24601 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
24602 MVT::i64);
24603
24604 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24605}
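// For example (a sketch, not from a test): aarch64_sve_prfb_gather_scalar_offset
// with an offset of 40 is outside the valid [0, 31] immediate range of the
// vector-plus-immediate form, so the node is remapped to
// aarch64_sve_prfb_gather_uxtw_index with the base and offset operands swapped,
// allowing the prefetch to still be selected.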
24606
24607// Return true if the vector operation can guarantee only the first lane of its
24608// result contains data, with all bits in other lanes set to zero.
24609static bool isLanes1toNKnownZero(SDValue Op) {
24610 switch (Op.getOpcode()) {
24611 default:
24612 return false;
24628 return true;
24629 }
24630}
24631
24632static SDValue removeRedundantInsertVectorElt(SDNode *N) {
24633 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
24634 SDValue InsertVec = N->getOperand(0);
24635 SDValue InsertElt = N->getOperand(1);
24636 SDValue InsertIdx = N->getOperand(2);
24637
24638 // We only care about inserts into the first element...
24639 if (!isNullConstant(InsertIdx))
24640 return SDValue();
24641 // ...of a zero'd vector...
24642 if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
24643 return SDValue();
24644 // ...where the inserted data was previously extracted...
24645 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24646 return SDValue();
24647
24648 SDValue ExtractVec = InsertElt.getOperand(0);
24649 SDValue ExtractIdx = InsertElt.getOperand(1);
24650
24651 // ...from the first element of a vector.
24652 if (!isNullConstant(ExtractIdx))
24653 return SDValue();
24654
24655 // If we get here we are effectively trying to zero lanes 1-N of a vector.
24656
24657 // Ensure there's no type conversion going on.
24658 if (N->getValueType(0) != ExtractVec.getValueType())
24659 return SDValue();
24660
24661 if (!isLanes1toNKnownZero(ExtractVec))
24662 return SDValue();
24663
24664 // The explicit zeroing is redundant.
24665 return ExtractVec;
24666}
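// A sketch of the pattern this removes (illustrative only): given a value t1
// for which isLanes1toNKnownZero() returns true,
//   t2: i32   = extract_vector_elt t1, 0
//   t3: v4i32 = insert_vector_elt zeroinitializer, t2, 0
// the explicit re-zeroing in t3 is redundant and t3 can be replaced by t1.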
24667
24668static SDValue
24669performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
24670 if (SDValue Res = removeRedundantInsertVectorElt(N))
24671 return Res;
24672
24673 return performPostLD1Combine(N, DCI, true);
24674}
24675
24676static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
24677 TargetLowering::DAGCombinerInfo &DCI,
24678 const AArch64Subtarget *Subtarget) {
24679 SDValue N0 = N->getOperand(0);
24680 EVT VT = N->getValueType(0);
24681
24682 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
24683 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
24684 return SDValue();
24685
24686 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
24687 EVT EltVT = VT.getVectorElementType();
24688 return EltVT == MVT::f32 || EltVT == MVT::f64;
24689 };
24690
24691 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
24692 // We purposefully don't care about legality of the nodes here as we know
24693 // they can be split down into something legal.
24694 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
24695 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
24696 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
24697 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
24698 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
24699 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
24700 LN0->getChain(), LN0->getBasePtr(),
24701 N0.getValueType(), LN0->getMemOperand());
24702 DCI.CombineTo(N, ExtLoad);
24703 DCI.CombineTo(
24704 N0.getNode(),
24705 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
24706 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
24707 ExtLoad.getValue(1));
24708 return SDValue(N, 0); // Return N so it doesn't get rechecked!
24709 }
24710
24711 return SDValue();
24712}
24713
24714static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
24715 const AArch64Subtarget *Subtarget) {
24716 EVT VT = N->getValueType(0);
24717
24718 // Don't expand for NEON, SVE2 or SME
24719 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
24720 return SDValue();
24721
24722 SDLoc DL(N);
24723
24724 SDValue Mask = N->getOperand(0);
24725 SDValue In1 = N->getOperand(1);
24726 SDValue In2 = N->getOperand(2);
24727
24728 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
24729 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
24730 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
24731 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
24732}
24733
24734static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
24735 EVT VT = N->getValueType(0);
24736
24737 SDValue Insert = N->getOperand(0);
24738 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
24739 return SDValue();
24740
24741 if (!Insert.getOperand(0).isUndef())
24742 return SDValue();
24743
24744 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
24745 uint64_t IdxDupLane = N->getConstantOperandVal(1);
24746 if (IdxInsert != 0 || IdxDupLane != 0)
24747 return SDValue();
24748
24749 SDValue Bitcast = Insert.getOperand(1);
24750 if (Bitcast.getOpcode() != ISD::BITCAST)
24751 return SDValue();
24752
24753 SDValue Subvec = Bitcast.getOperand(0);
24754 EVT SubvecVT = Subvec.getValueType();
24755 if (!SubvecVT.is128BitVector())
24756 return SDValue();
24757 EVT NewSubvecVT =
24758 getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
24759 
24760 SDLoc DL(N);
24761 SDValue NewInsert =
24762 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
24763 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
24764 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
24765 NewInsert, N->getOperand(1));
24766 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
24767}
24768
24769// Try to combine mull with uzp1.
24770static SDValue tryCombineMULLWithUZP1(SDNode *N,
24771 TargetLowering::DAGCombinerInfo &DCI,
24772 SelectionDAG &DAG) {
24773 if (DCI.isBeforeLegalizeOps())
24774 return SDValue();
24775
24776 SDValue LHS = N->getOperand(0);
24777 SDValue RHS = N->getOperand(1);
24778
24779 SDValue ExtractHigh;
24780 SDValue ExtractLow;
24781 SDValue TruncHigh;
24782 SDValue TruncLow;
24783 SDLoc DL(N);
24784
24785 // Check the operands are trunc and extract_high.
24786 if (isEssentiallyExtractHighSubvector(LHS) &&
24787 RHS.getOpcode() == ISD::TRUNCATE) {
24788 TruncHigh = RHS;
24789 if (LHS.getOpcode() == ISD::BITCAST)
24790 ExtractHigh = LHS.getOperand(0);
24791 else
24792 ExtractHigh = LHS;
24793 } else if (isEssentiallyExtractHighSubvector(RHS) &&
24794 LHS.getOpcode() == ISD::TRUNCATE) {
24795 TruncHigh = LHS;
24796 if (RHS.getOpcode() == ISD::BITCAST)
24797 ExtractHigh = RHS.getOperand(0);
24798 else
24799 ExtractHigh = RHS;
24800 } else
24801 return SDValue();
24802
24803 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
24804 // with uzp1.
24805 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
24806 SDValue TruncHighOp = TruncHigh.getOperand(0);
24807 EVT TruncHighOpVT = TruncHighOp.getValueType();
24808 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
24809 DAG.isSplatValue(TruncHighOp, false))
24810 return SDValue();
24811
24812 // Check there is other extract_high with same source vector.
24813 // For example,
24814 //
24815 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
24816 // t12: v4i16 = truncate t11
24817 // t31: v4i32 = AArch64ISD::SMULL t18, t12
24818 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
24819 // t16: v4i16 = truncate t15
24820 // t30: v4i32 = AArch64ISD::SMULL t23, t16
24821 //
24822 // This dagcombine assumes the two extract_high nodes use the same source
24823 // vector in order to detect the pair of the mull. If they have different
24824 // source vectors, this code will not work.
24825 bool HasFoundMULLow = true;
24826 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
24827 if (ExtractHighSrcVec->use_size() != 2)
24828 HasFoundMULLow = false;
24829
24830 // Find ExtractLow.
24831 for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) {
24832 if (User == ExtractHigh.getNode())
24833 continue;
24834
24835 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
24836 !isNullConstant(User->getOperand(1))) {
24837 HasFoundMULLow = false;
24838 break;
24839 }
24840
24841 ExtractLow.setNode(User);
24842 }
24843
24844 if (!ExtractLow || !ExtractLow->hasOneUse())
24845 HasFoundMULLow = false;
24846
24847 // Check ExtractLow's user.
24848 if (HasFoundMULLow) {
24849 SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin();
24850 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
24851 HasFoundMULLow = false;
24852 } else {
24853 if (ExtractLowUser->getOperand(0) == ExtractLow) {
24854 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
24855 TruncLow = ExtractLowUser->getOperand(1);
24856 else
24857 HasFoundMULLow = false;
24858 } else {
24859 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
24860 TruncLow = ExtractLowUser->getOperand(0);
24861 else
24862 HasFoundMULLow = false;
24863 }
24864 }
24865 }
24866
24867 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
24868 // with uzp1.
24869 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
24870 EVT TruncHighVT = TruncHigh.getValueType();
24871 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
24872 SDValue TruncLowOp =
24873 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
24874 EVT TruncLowOpVT = TruncLowOp.getValueType();
24875 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
24876 DAG.isSplatValue(TruncLowOp, false)))
24877 return SDValue();
24878
24879 // Create uzp1, extract_high and extract_low.
24880 if (TruncHighOpVT != UZP1VT)
24881 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
24882 if (TruncLowOpVT != UZP1VT)
24883 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
24884
24885 SDValue UZP1 =
24886 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
24887 SDValue HighIdxCst =
24888 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
24889 SDValue NewTruncHigh =
24890 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
24891 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
24892
24893 if (HasFoundMULLow) {
24894 EVT TruncLowVT = TruncLow.getValueType();
24895 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
24896 UZP1, ExtractLow.getOperand(1));
24897 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
24898 }
24899
24900 return SDValue(N, 0);
24901}
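// Illustrative "after" DAG for the combine above (a sketch; node numbers are
// invented):
//   t40: v8i16 = AArch64ISD::UZP1 t(low trunc src), t(high trunc src)
//   t41: v4i16 = extract_subvector t40, 0      ; feeds the low mull
//   t42: v4i16 = extract_subvector t40, 4      ; feeds the high mull2
// so both multiplies read their narrowed operands from a single UZP1 instead
// of two independent truncates.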
24902
24903static SDValue performMULLCombine(SDNode *N,
24904 TargetLowering::DAGCombinerInfo &DCI,
24905 SelectionDAG &DAG) {
24906 if (SDValue Val =
24907 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
24908 return Val;
24909
24910 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
24911 return Val;
24912
24913 return SDValue();
24914}
24915
24916static SDValue
24917performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24918 SelectionDAG &DAG) {
24919 // Let's do below transform.
24920 //
24921 // t34: v4i32 = AArch64ISD::UADDLV t2
24922 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
24923 // t7: i64 = zero_extend t35
24924 // t20: v1i64 = scalar_to_vector t7
24925 // ==>
24926 // t34: v4i32 = AArch64ISD::UADDLV t2
24927 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
24928 // t40: v1i64 = AArch64ISD::NVCAST t39
24929 if (DCI.isBeforeLegalizeOps())
24930 return SDValue();
24931
24932 EVT VT = N->getValueType(0);
24933 if (VT != MVT::v1i64)
24934 return SDValue();
24935
24936 SDValue ZEXT = N->getOperand(0);
24937 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
24938 return SDValue();
24939
24940 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
24941 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
24942 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
24943 return SDValue();
24944
24945 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
24946 return SDValue();
24947
24948 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
24949 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
24950 UADDLV.getValueType() != MVT::v4i32 ||
24951 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
24952 return SDValue();
24953
24954 // Let's generate new sequence with AArch64ISD::NVCAST.
24955 SDLoc DL(N);
24956 SDValue EXTRACT_SUBVEC =
24957 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
24958 DAG.getConstant(0, DL, MVT::i64));
24959 SDValue NVCAST =
24960 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
24961
24962 return NVCAST;
24963}
24964
24965SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
24966 DAGCombinerInfo &DCI) const {
24967 SelectionDAG &DAG = DCI.DAG;
24968 switch (N->getOpcode()) {
24969 default:
24970 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
24971 break;
24972 case ISD::VECREDUCE_AND:
24973 case ISD::VECREDUCE_OR:
24974 case ISD::VECREDUCE_XOR:
24975 return performVecReduceBitwiseCombine(N, DCI, DAG);
24976 case ISD::ADD:
24977 case ISD::SUB:
24978 return performAddSubCombine(N, DCI);
24979 case ISD::BUILD_VECTOR:
24980 return performBuildVectorCombine(N, DCI, DAG);
24981 case ISD::TRUNCATE:
24982 return performTruncateCombine(N, DAG);
24983 case AArch64ISD::ANDS:
24984 return performFlagSettingCombine(N, DCI, ISD::AND);
24985 case AArch64ISD::ADC:
24986 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
24987 return R;
24988 return foldADCToCINC(N, DAG);
24989 case AArch64ISD::SBC:
24990 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
24991 case AArch64ISD::ADCS:
24992 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
24993 return R;
24994 return SDValue();
24995 case AArch64ISD::SBCS:
24996 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
24997 return R;
24998 return SDValue();
24999 case AArch64ISD::BICi: {
25000 APInt DemandedBits =
25001 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
25002 APInt DemandedElts =
25003 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
25004
25005 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(
25006 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
25007 return SDValue();
25008
25009 break;
25010 }
25011 case ISD::XOR:
25012 return performXorCombine(N, DAG, DCI, Subtarget);
25013 case ISD::MUL:
25014 return performMulCombine(N, DAG, DCI, Subtarget);
25015 case ISD::SINT_TO_FP:
25016 case ISD::UINT_TO_FP:
25017 return performIntToFpCombine(N, DAG, Subtarget);
25018 case ISD::FP_TO_SINT:
25019 case ISD::FP_TO_UINT:
25020 case ISD::FP_TO_SINT_SAT:
25021 case ISD::FP_TO_UINT_SAT:
25022 return performFpToIntCombine(N, DAG, DCI, Subtarget);
25023 case ISD::OR:
25024 return performORCombine(N, DCI, Subtarget, *this);
25025 case ISD::AND:
25026 return performANDCombine(N, DCI);
25027 case ISD::FADD:
25028 return performFADDCombine(N, DCI);
25029 case ISD::INTRINSIC_WO_CHAIN:
25030 return performIntrinsicCombine(N, DCI, Subtarget);
25031 case ISD::ANY_EXTEND:
25032 case ISD::ZERO_EXTEND:
25033 case ISD::SIGN_EXTEND:
25034 return performExtendCombine(N, DCI, DAG);
25035 case ISD::SIGN_EXTEND_INREG:
25036 return performSignExtendInRegCombine(N, DCI, DAG);
25037 case ISD::CONCAT_VECTORS:
25038 return performConcatVectorsCombine(N, DCI, DAG);
25039 case ISD::EXTRACT_SUBVECTOR:
25040 return performExtractSubvectorCombine(N, DCI, DAG);
25041 case ISD::INSERT_SUBVECTOR:
25042 return performInsertSubvectorCombine(N, DCI, DAG);
25043 case ISD::SELECT:
25044 return performSelectCombine(N, DCI);
25045 case ISD::VSELECT:
25046 return performVSelectCombine(N, DCI.DAG);
25047 case ISD::SETCC:
25048 return performSETCCCombine(N, DCI, DAG);
25049 case ISD::LOAD:
25050 return performLOADCombine(N, DCI, DAG, Subtarget);
25051 case ISD::STORE:
25052 return performSTORECombine(N, DCI, DAG, Subtarget);
25053 case ISD::MSTORE:
25054 return performMSTORECombine(N, DCI, DAG, Subtarget);
25055 case ISD::MGATHER:
25056 case ISD::MSCATTER:
25057 return performMaskedGatherScatterCombine(N, DCI, DAG);
25058 case ISD::FP_EXTEND:
25059 return performFPExtendCombine(N, DAG, DCI, Subtarget);
25060 case AArch64ISD::BRCOND:
25061 return performBRCONDCombine(N, DCI, DAG);
25062 case AArch64ISD::TBNZ:
25063 case AArch64ISD::TBZ:
25064 return performTBZCombine(N, DCI, DAG);
25065 case AArch64ISD::CSEL:
25066 return performCSELCombine(N, DCI, DAG);
25067 case AArch64ISD::DUP:
25068 case AArch64ISD::DUPLANE8:
25069 case AArch64ISD::DUPLANE16:
25070 case AArch64ISD::DUPLANE32:
25071 case AArch64ISD::DUPLANE64:
25072 return performDUPCombine(N, DCI);
25073 case AArch64ISD::DUPLANE128:
25074 return performDupLane128Combine(N, DAG);
25075 case AArch64ISD::NVCAST:
25076 return performNVCASTCombine(N, DAG);
25077 case AArch64ISD::SPLICE:
25078 return performSpliceCombine(N, DAG);
25079 case AArch64ISD::UUNPKLO:
25080 case AArch64ISD::UUNPKHI:
25081 return performUnpackCombine(N, DAG, Subtarget);
25082 case AArch64ISD::UZP1:
25083 case AArch64ISD::UZP2:
25084 return performUzpCombine(N, DAG, Subtarget);
25085 case AArch64ISD::SETCC_MERGE_ZERO:
25086 return performSetccMergeZeroCombine(N, DCI);
25103 return performGLD1Combine(N, DAG);
25104 case AArch64ISD::VASHR:
25105 case AArch64ISD::VLSHR:
25106 return performVectorShiftCombine(N, *this, DCI);
25107 case AArch64ISD::SUNPKLO:
25108 return performSunpkloCombine(N, DAG);
25109 case AArch64ISD::BSP:
25110 return performBSPExpandForSVE(N, DAG, Subtarget);
25111 case ISD::INSERT_VECTOR_ELT:
25112 return performInsertVectorEltCombine(N, DCI);
25113 case ISD::EXTRACT_VECTOR_ELT:
25114 return performExtractVectorEltCombine(N, DCI, Subtarget);
25115 case ISD::VECREDUCE_ADD:
25116 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
25117 case AArch64ISD::UADDV:
25118 return performUADDVCombine(N, DAG);
25119 case AArch64ISD::SMULL:
25120 case AArch64ISD::UMULL:
25121 case AArch64ISD::PMULL:
25122 return performMULLCombine(N, DCI, DAG);
25123 case ISD::INTRINSIC_VOID:
25124 case ISD::INTRINSIC_W_CHAIN:
25125 switch (N->getConstantOperandVal(1)) {
25126 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
25127 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
25128 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
25129 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
25130 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
25131 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
25132 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
25133 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
25134 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
25135 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
25136 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
25137 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
25138 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
25139 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
25140 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
25141 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
25142 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
25143 case Intrinsic::aarch64_neon_ld2:
25144 case Intrinsic::aarch64_neon_ld3:
25145 case Intrinsic::aarch64_neon_ld4:
25146 case Intrinsic::aarch64_neon_ld1x2:
25147 case Intrinsic::aarch64_neon_ld1x3:
25148 case Intrinsic::aarch64_neon_ld1x4:
25149 case Intrinsic::aarch64_neon_ld2lane:
25150 case Intrinsic::aarch64_neon_ld3lane:
25151 case Intrinsic::aarch64_neon_ld4lane:
25152 case Intrinsic::aarch64_neon_ld2r:
25153 case Intrinsic::aarch64_neon_ld3r:
25154 case Intrinsic::aarch64_neon_ld4r:
25155 case Intrinsic::aarch64_neon_st2:
25156 case Intrinsic::aarch64_neon_st3:
25157 case Intrinsic::aarch64_neon_st4:
25158 case Intrinsic::aarch64_neon_st1x2:
25159 case Intrinsic::aarch64_neon_st1x3:
25160 case Intrinsic::aarch64_neon_st1x4:
25161 case Intrinsic::aarch64_neon_st2lane:
25162 case Intrinsic::aarch64_neon_st3lane:
25163 case Intrinsic::aarch64_neon_st4lane:
25164 return performNEONPostLDSTCombine(N, DCI, DAG);
25165 case Intrinsic::aarch64_sve_ldnt1:
25166 return performLDNT1Combine(N, DAG);
25167 case Intrinsic::aarch64_sve_ld1rq:
25168 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
25169 case Intrinsic::aarch64_sve_ld1ro:
25170 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
25171 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
25173 case Intrinsic::aarch64_sve_ldnt1_gather:
25175 case Intrinsic::aarch64_sve_ldnt1_gather_index:
25176 return performGatherLoadCombine(N, DAG,
25178 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
25180 case Intrinsic::aarch64_sve_ld1:
25182 case Intrinsic::aarch64_sve_ldnf1:
25184 case Intrinsic::aarch64_sve_ldff1:
25186 case Intrinsic::aarch64_sve_st1:
25187 return performST1Combine(N, DAG);
25188 case Intrinsic::aarch64_sve_stnt1:
25189 return performSTNT1Combine(N, DAG);
25190 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
25192 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
25194 case Intrinsic::aarch64_sve_stnt1_scatter:
25196 case Intrinsic::aarch64_sve_stnt1_scatter_index:
25198 case Intrinsic::aarch64_sve_ld1_gather:
25200 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
25201 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
25203 case Intrinsic::aarch64_sve_ld1q_gather_index:
25204 return performGatherLoadCombine(N, DAG,
25206 case Intrinsic::aarch64_sve_ld1_gather_index:
25207 return performGatherLoadCombine(N, DAG,
25209 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
25211 /*OnlyPackedOffsets=*/false);
25212 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
25214 /*OnlyPackedOffsets=*/false);
25215 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
25216 return performGatherLoadCombine(N, DAG,
25218 /*OnlyPackedOffsets=*/false);
25219 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
25220 return performGatherLoadCombine(N, DAG,
25222 /*OnlyPackedOffsets=*/false);
25223 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
25225 case Intrinsic::aarch64_sve_ldff1_gather:
25227 case Intrinsic::aarch64_sve_ldff1_gather_index:
25228 return performGatherLoadCombine(N, DAG,
25230 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
25231 return performGatherLoadCombine(N, DAG,
25233 /*OnlyPackedOffsets=*/false);
25234 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
25235 return performGatherLoadCombine(N, DAG,
25237 /*OnlyPackedOffsets=*/false);
25238 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
25239 return performGatherLoadCombine(N, DAG,
25241 /*OnlyPackedOffsets=*/false);
25242 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
25243 return performGatherLoadCombine(N, DAG,
25245 /*OnlyPackedOffsets=*/false);
25246 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
25247 return performGatherLoadCombine(N, DAG,
25249 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
25250 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
25252 case Intrinsic::aarch64_sve_st1q_scatter_index:
25254 case Intrinsic::aarch64_sve_st1_scatter:
25256 case Intrinsic::aarch64_sve_st1_scatter_index:
25258 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
25260 /*OnlyPackedOffsets=*/false);
25261 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
25263 /*OnlyPackedOffsets=*/false);
25264 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
25265 return performScatterStoreCombine(N, DAG,
25267 /*OnlyPackedOffsets=*/false);
25268 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
25269 return performScatterStoreCombine(N, DAG,
25271 /*OnlyPackedOffsets=*/false);
25272 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
25274 case Intrinsic::aarch64_rndr:
25275 case Intrinsic::aarch64_rndrrs: {
25276 unsigned IntrinsicID = N->getConstantOperandVal(1);
25277 auto Register =
25278 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
25279 : AArch64SysReg::RNDRRS);
25280 SDLoc DL(N);
25281 SDValue A = DAG.getNode(
25282 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
25283 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
25284 SDValue B = DAG.getNode(
25285 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
25286 DAG.getConstant(0, DL, MVT::i32),
25287 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
25288 return DAG.getMergeValues(
25289 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
25290 }
25291 case Intrinsic::aarch64_sme_ldr_zt:
25292 return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
25293 DAG.getVTList(MVT::Other), N->getOperand(0),
25294 N->getOperand(2), N->getOperand(3));
25295 case Intrinsic::aarch64_sme_str_zt:
25296 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
25297 DAG.getVTList(MVT::Other), N->getOperand(0),
25298 N->getOperand(2), N->getOperand(3));
25299 default:
25300 break;
25301 }
25302 break;
25303 case ISD::GlobalAddress:
25304 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
25305 case ISD::CTLZ:
25306 return performCTLZCombine(N, DAG, Subtarget);
25307 case ISD::SCALAR_TO_VECTOR:
25308 return performScalarToVectorCombine(N, DCI, DAG);
25309 }
25310 return SDValue();
25311}
25312
25313// Check if the return value is used as only a return value, as otherwise
25314// we can't perform a tail-call. In particular, we need to check for
25315// target ISD nodes that are returns and any other "odd" constructs
25316// that the generic analysis code won't necessarily catch.
25317bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
25318 SDValue &Chain) const {
25319 if (N->getNumValues() != 1)
25320 return false;
25321 if (!N->hasNUsesOfValue(1, 0))
25322 return false;
25323
25324 SDValue TCChain = Chain;
25325 SDNode *Copy = *N->use_begin();
25326 if (Copy->getOpcode() == ISD::CopyToReg) {
25327 // If the copy has a glue operand, we conservatively assume it isn't safe to
25328 // perform a tail call.
25329 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
25330 MVT::Glue)
25331 return false;
25332 TCChain = Copy->getOperand(0);
25333 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
25334 return false;
25335
25336 bool HasRet = false;
25337 for (SDNode *Node : Copy->uses()) {
25338 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
25339 return false;
25340 HasRet = true;
25341 }
25342
25343 if (!HasRet)
25344 return false;
25345
25346 Chain = TCChain;
25347 return true;
25348}
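// For example (sketch): in
//   long caller() { return callee(); }
// the only non-chain user of the call's result is the RET_GLUE node (possibly
// through a CopyToReg of the return register), so the call may become a tail
// call; any other use of the returned value blocks the transformation.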
25349
25350// Return whether an instruction can potentially be optimized to a tail
25351// call. This will cause the optimizers to attempt to move, or duplicate,
25352// return instructions to help enable tail call optimizations for this
25353// instruction.
25354bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
25355 return CI->isTailCall();
25356}
25357
25358bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
25359 Register Offset, bool IsPre,
25360 MachineRegisterInfo &MRI) const {
25361 auto CstOffset = getIConstantVRegVal(Offset, MRI);
25362 if (!CstOffset || CstOffset->isZero())
25363 return false;
25364
25365 // All of the indexed addressing mode instructions take a signed 9 bit
25366 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
25367 // encodes the sign/indexing direction.
25368 return isInt<9>(CstOffset->getSExtValue());
25369}
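// E.g. (illustrative): a G_PTR_ADD of +16 or -128 can be folded into a
// pre/post-indexed access because it fits the signed 9-bit immediate range
// [-256, 255], whereas an offset of +512 cannot.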
25370
25371bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
25372 SDValue &Base,
25373 SDValue &Offset,
25374 SelectionDAG &DAG) const {
25375 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
25376 return false;
25377
25378 // Non-null if there is exactly one user of the loaded value (ignoring chain).
25379 SDNode *ValOnlyUser = nullptr;
25380 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
25381 ++UI) {
25382 if (UI.getUse().getResNo() == 1)
25383 continue; // Ignore chain.
25384 if (ValOnlyUser == nullptr)
25385 ValOnlyUser = *UI;
25386 else {
25387 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
25388 break;
25389 }
25390 }
25391
25392 auto IsUndefOrZero = [](SDValue V) {
25393 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
25394 };
25395
25396 // If the only user of the value is a scalable vector splat, it is
25397 // preferable to do a replicating load (ld1r*).
25398 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
25399 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
25400 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
25401 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
25402 return false;
25403
25404 Base = Op->getOperand(0);
25405 // All of the indexed addressing mode instructions take a signed
25406 // 9 bit immediate offset.
25407 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
25408 int64_t RHSC = RHS->getSExtValue();
25409 if (Op->getOpcode() == ISD::SUB)
25410 RHSC = -(uint64_t)RHSC;
25411 if (!isInt<9>(RHSC))
25412 return false;
25413 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
25414 // when dealing with subtraction.
25415 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
25416 return true;
25417 }
25418 return false;
25419}
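// For example (sketch): for a load whose address is (add %p, 8) this returns
// Base = %p and Offset = 8, which the caller can then turn into a pre- or
// post-indexed form such as "ldr x0, [x1, #8]!" or "ldr x0, [x1], #8".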
25420
25421bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
25422 SDValue &Offset,
25424 SelectionDAG &DAG) const {
25425 EVT VT;
25426 SDValue Ptr;
25427 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
25428 VT = LD->getMemoryVT();
25429 Ptr = LD->getBasePtr();
25430 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
25431 VT = ST->getMemoryVT();
25432 Ptr = ST->getBasePtr();
25433 } else
25434 return false;
25435
25436 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
25437 return false;
25438 AM = ISD::PRE_INC;
25439 return true;
25440}
25441
25442bool AArch64TargetLowering::getPostIndexedAddressParts(
25444 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
25445 EVT VT;
25446 SDValue Ptr;
25447 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
25448 VT = LD->getMemoryVT();
25449 Ptr = LD->getBasePtr();
25450 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
25451 VT = ST->getMemoryVT();
25452 Ptr = ST->getBasePtr();
25453 } else
25454 return false;
25455
25456 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
25457 return false;
25458 // Post-indexing updates the base, so it's not a valid transform
25459 // if that's not the same as the load's pointer.
25460 if (Ptr != Base)
25461 return false;
25462 AM = ISD::POST_INC;
25463 return true;
25464}
25465
25466static void replaceBoolVectorBitcast(SDNode *N,
25467 SmallVectorImpl<SDValue> &Results,
25468 SelectionDAG &DAG) {
25469 SDLoc DL(N);
25470 SDValue Op = N->getOperand(0);
25471 EVT VT = N->getValueType(0);
25472 [[maybe_unused]] EVT SrcVT = Op.getValueType();
25473 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25474 "Must be bool vector.");
25475
25476 // Special handling for Clang's __builtin_convertvector. For vectors with <8
25477 // elements, it adds a vector concatenation with undef(s). If we encounter
25478 // this here, we can skip the concat.
25479 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
25480 bool AllUndef = true;
25481 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
25482 AllUndef &= Op.getOperand(I).isUndef();
25483
25484 if (AllUndef)
25485 Op = Op.getOperand(0);
25486 }
25487
25488 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
25489 if (VectorBits)
25490 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
25491}
25492
25493static void CustomNonLegalBITCASTResults(SDNode *N,
25494 SmallVectorImpl<SDValue> &Results,
25495 SelectionDAG &DAG, EVT ExtendVT,
25496 EVT CastVT) {
25497 SDLoc DL(N);
25498 SDValue Op = N->getOperand(0);
25499 EVT VT = N->getValueType(0);
25500
25501 // Use SCALAR_TO_VECTOR for lane zero
25502 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
25503 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
25504 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
25505 Results.push_back(
25506 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
25507}
25508
25509void AArch64TargetLowering::ReplaceBITCASTResults(
25510 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25511 SDLoc DL(N);
25512 SDValue Op = N->getOperand(0);
25513 EVT VT = N->getValueType(0);
25514 EVT SrcVT = Op.getValueType();
25515
25516 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
25517 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
25518 return;
25519 }
25520
25521 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
25522 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
25523 return;
25524 }
25525
25526 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
25527 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
25528 return;
25529 }
25530
25531 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
25532 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
25533 "Expected fp->int bitcast!");
25534
25535 // Bitcasting between unpacked vector types of different element counts is
25536 // not a NOP because the live elements are laid out differently.
25537 // 01234567
25538 // e.g. nxv2i32 = XX??XX??
25539 // nxv4f16 = X?X?X?X?
25540 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
25541 return;
25542
25543 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
25544 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
25545 return;
25546 }
25547
25548 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25549 !VT.isVector())
25550 return replaceBoolVectorBitcast(N, Results, DAG);
25551
25552 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
25553 return;
25554
25555 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
25556 DAG.getUNDEF(MVT::i32), Op);
25557 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
25558 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
25559}
25560
25561static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
25562 SelectionDAG &DAG,
25563 const AArch64Subtarget *Subtarget) {
25564 EVT VT = N->getValueType(0);
25565 if (!VT.is256BitVector() ||
25566 (VT.getScalarType().isFloatingPoint() &&
25567 !N->getFlags().hasAllowReassociation()) ||
25568 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
25569 VT.getScalarType() == MVT::bf16)
25570 return;
25571
25572 SDValue X = N->getOperand(0);
25573 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
25574 if (!Shuf) {
25575 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
25576 X = N->getOperand(1);
25577 if (!Shuf)
25578 return;
25579 }
25580
25581 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
25582 return;
25583
25584 // Check the mask is 1,0,3,2,5,4,...
25585 ArrayRef<int> Mask = Shuf->getMask();
25586 for (int I = 0, E = Mask.size(); I < E; I++)
25587 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
25588 return;
25589
25590 SDLoc DL(N);
25591 auto LoHi = DAG.SplitVector(X, DL);
25592 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
25593 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
25594 LoHi.first, LoHi.second);
25595
25596 // Shuffle the elements back into order.
25597 SmallVector<int> NMask;
25598 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
25599 NMask.push_back(I);
25600 NMask.push_back(I);
25601 }
25602 Results.push_back(
25603 DAG.getVectorShuffle(VT, DL,
25604 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
25605 DAG.getUNDEF(LoHi.first.getValueType())),
25606 DAG.getUNDEF(VT), NMask));
25607}
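// Rough example of the rewrite above (illustrative, assuming a v8f32 fadd with
// the reassoc flag): fadd x, shuffle(x, undef, <1,0,3,2,5,4,7,6>) is split into
// two halves, summed pairwise with a single ADDP, and the result is shuffled
// back so every original lane sees its pair sum.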
25608
25609static void ReplaceReductionResults(SDNode *N,
25610 SmallVectorImpl<SDValue> &Results,
25611 SelectionDAG &DAG, unsigned InterOp,
25612 unsigned AcrossOp) {
25613 EVT LoVT, HiVT;
25614 SDValue Lo, Hi;
25615 SDLoc dl(N);
25616 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
25617 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25618 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
25619 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
25620 Results.push_back(SplitVal);
25621}
25622
25623void AArch64TargetLowering::ReplaceExtractSubVectorResults(
25624 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25625 SDValue In = N->getOperand(0);
25626 EVT InVT = In.getValueType();
25627
25628 // Common code will handle these just fine.
25629 if (!InVT.isScalableVector() || !InVT.isInteger())
25630 return;
25631
25632 SDLoc DL(N);
25633 EVT VT = N->getValueType(0);
25634
25635 // The following checks bail if this is not a halving operation.
25636
25637 ElementCount ResEC = VT.getVectorElementCount();
25638 
25639 if (InVT.getVectorElementCount() != (ResEC * 2))
25640 return;
25641
25642 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
25643 if (!CIndex)
25644 return;
25645
25646 unsigned Index = CIndex->getZExtValue();
25647 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
25648 return;
25649
25650 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
25651 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
25652
25653 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
25654 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
25655}
25656
25657// Create an even/odd pair of X registers holding integer value V.
25658static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
25659 SDLoc dl(V.getNode());
25660 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
25661 if (DAG.getDataLayout().isBigEndian())
25662 std::swap (VLo, VHi);
25663 SDValue RegClass =
25664 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
25665 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
25666 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
25667 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
25668 return SDValue(
25669 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
25670}
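// E.g. (sketch): for an i128 compare value this emits
//   REG_SEQUENCE XSeqPairsClass, lo:sube64, hi:subo64
// so the value can be fed to CASP, which requires an even/odd register pair
// (the halves are swapped first on big-endian targets).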
25671
25672static void ReplaceCMP_SWAP_128Results(SDNode *N,
25673 SmallVectorImpl<SDValue> &Results,
25674 SelectionDAG &DAG,
25675 const AArch64Subtarget *Subtarget) {
25676 assert(N->getValueType(0) == MVT::i128 &&
25677 "AtomicCmpSwap on types less than 128 should be legal");
25678
25679 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25680 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
25681 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
25682 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
25683 SDValue Ops[] = {
25684 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
25685 createGPRPairNode(DAG, N->getOperand(3)), // Store value
25686 N->getOperand(1), // Ptr
25687 N->getOperand(0), // Chain in
25688 };
25689
25690 unsigned Opcode;
25691 switch (MemOp->getMergedOrdering()) {
25692 case AtomicOrdering::Monotonic:
25693 Opcode = AArch64::CASPX;
25694 break;
25695 case AtomicOrdering::Acquire:
25696 Opcode = AArch64::CASPAX;
25697 break;
25698 case AtomicOrdering::Release:
25699 Opcode = AArch64::CASPLX;
25700 break;
25701 case AtomicOrdering::AcquireRelease:
25702 case AtomicOrdering::SequentiallyConsistent:
25703 Opcode = AArch64::CASPALX;
25704 break;
25705 default:
25706 llvm_unreachable("Unexpected ordering!");
25707 }
25708
25709 MachineSDNode *CmpSwap = DAG.getMachineNode(
25710 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
25711 DAG.setNodeMemRefs(CmpSwap, {MemOp});
25712
25713 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
25714 if (DAG.getDataLayout().isBigEndian())
25715 std::swap(SubReg1, SubReg2);
25716 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
25717 SDValue(CmpSwap, 0));
25718 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
25719 SDValue(CmpSwap, 0));
25720 Results.push_back(
25721 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25722 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
25723 return;
25724 }
25725
25726 unsigned Opcode;
25727 switch (MemOp->getMergedOrdering()) {
25728 case AtomicOrdering::Monotonic:
25729 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
25730 break;
25731 case AtomicOrdering::Acquire:
25732 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
25733 break;
25734 case AtomicOrdering::Release:
25735 Opcode = AArch64::CMP_SWAP_128_RELEASE;
25736 break;
25737 case AtomicOrdering::AcquireRelease:
25738 case AtomicOrdering::SequentiallyConsistent:
25739 Opcode = AArch64::CMP_SWAP_128;
25740 break;
25741 default:
25742 llvm_unreachable("Unexpected ordering!");
25743 }
25744
25745 SDLoc DL(N);
25746 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
25747 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
25748 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
25749 New.first, New.second, N->getOperand(0)};
25750 SDNode *CmpSwap = DAG.getMachineNode(
25751 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
25752 Ops);
25753 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
25754
25755 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25756 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
25757 Results.push_back(SDValue(CmpSwap, 3));
25758}
25759
25760static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
25761 AtomicOrdering Ordering) {
25762 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
25763 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
25764 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
25765 // ATOMIC_LOAD_CLR at any point.
25766 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
25767 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
25768 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
25769 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
25770
25771 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25772 // The operand will need to be XORed in a separate step.
25773 switch (Ordering) {
25774 case AtomicOrdering::Monotonic:
25775 return AArch64::LDCLRP;
25776 break;
25777 case AtomicOrdering::Acquire:
25778 return AArch64::LDCLRPA;
25779 break;
25780 case AtomicOrdering::Release:
25781 return AArch64::LDCLRPL;
25782 break;
25783 case AtomicOrdering::AcquireRelease:
25784 case AtomicOrdering::SequentiallyConsistent:
25785 return AArch64::LDCLRPAL;
25786 break;
25787 default:
25788 llvm_unreachable("Unexpected ordering!");
25789 }
25790 }
25791
25792 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
25793 switch (Ordering) {
25794 case AtomicOrdering::Monotonic:
25795 return AArch64::LDSETP;
25796 break;
25797 case AtomicOrdering::Acquire:
25798 return AArch64::LDSETPA;
25799 break;
25800 case AtomicOrdering::Release:
25801 return AArch64::LDSETPL;
25802 break;
25803 case AtomicOrdering::AcquireRelease:
25804 case AtomicOrdering::SequentiallyConsistent:
25805 return AArch64::LDSETPAL;
25806 break;
25807 default:
25808 llvm_unreachable("Unexpected ordering!");
25809 }
25810 }
25811
25812 if (ISDOpcode == ISD::ATOMIC_SWAP) {
25813 switch (Ordering) {
25814 case AtomicOrdering::Monotonic:
25815 return AArch64::SWPP;
25816 break;
25817 case AtomicOrdering::Acquire:
25818 return AArch64::SWPPA;
25819 break;
25820 case AtomicOrdering::Release:
25821 return AArch64::SWPPL;
25822 break;
25823 case AtomicOrdering::AcquireRelease:
25824 case AtomicOrdering::SequentiallyConsistent:
25825 return AArch64::SWPPAL;
25826 break;
25827 default:
25828 llvm_unreachable("Unexpected ordering!");
25829 }
25830 }
25831
25832 llvm_unreachable("Unexpected ISDOpcode!");
25833}
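// Usage sketch: for "atomicrmw and ptr %p, i128 %v seq_cst" the caller picks
// LDCLRPAL here and, because LDCLRP clears the bits that are set in its
// operand, first XORs %v with -1 (see ReplaceATOMIC_LOAD_128Results below).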
25834
25835static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
25836 SmallVectorImpl<SDValue> &Results,
25837 SelectionDAG &DAG,
25838 const AArch64Subtarget *Subtarget) {
25839 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower them
25840 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
25841 // rather than the CASP instructions, because CASP has register classes for
25842 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
25843 // to present them as single operands. LSE128 instructions use the GPR64
25844 // register class (because the pair does not have to be sequential), like
25845 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
25846
25847 assert(N->getValueType(0) == MVT::i128 &&
25848 "AtomicLoadXXX on types less than 128 should be legal");
25849
25850 if (!Subtarget->hasLSE128())
25851 return;
25852
25853 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25854 const SDValue &Chain = N->getOperand(0);
25855 const SDValue &Ptr = N->getOperand(1);
25856 const SDValue &Val128 = N->getOperand(2);
25857 std::pair<SDValue, SDValue> Val2x64 =
25858 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
25859
25860 const unsigned ISDOpcode = N->getOpcode();
25861 const unsigned MachineOpcode =
25862 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
25863
25864 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25865 SDLoc dl(Val128);
25866 Val2x64.first =
25867 DAG.getNode(ISD::XOR, dl, MVT::i64,
25868 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
25869 Val2x64.second =
25870 DAG.getNode(ISD::XOR, dl, MVT::i64,
25871 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
25872 }
25873
25874 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
25875 if (DAG.getDataLayout().isBigEndian())
25876 std::swap(Ops[0], Ops[1]);
25877
25878 MachineSDNode *AtomicInst =
25879 DAG.getMachineNode(MachineOpcode, SDLoc(N),
25880 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
25881
25882 DAG.setNodeMemRefs(AtomicInst, {MemOp});
25883
25884 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
25885 if (DAG.getDataLayout().isBigEndian())
25886 std::swap(Lo, Hi);
25887
25888 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25889 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
25890}
25891
25892void AArch64TargetLowering::ReplaceNodeResults(
25893 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25894 switch (N->getOpcode()) {
25895 default:
25896 llvm_unreachable("Don't know how to custom expand this");
25897 case ISD::BITCAST:
25898 ReplaceBITCASTResults(N, Results, DAG);
25899 return;
25900 case ISD::VECREDUCE_ADD:
25901 case ISD::VECREDUCE_SMAX:
25902 case ISD::VECREDUCE_SMIN:
25903 case ISD::VECREDUCE_UMAX:
25904 case ISD::VECREDUCE_UMIN:
25905 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
25906 return;
25907 case ISD::ADD:
25908 case ISD::FADD:
25909 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
25910 return;
25911
25912 case ISD::CTPOP:
25913 case ISD::PARITY:
25914 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
25915 Results.push_back(Result);
25916 return;
25917 case AArch64ISD::SADDV:
25918 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
25919 return;
25920 case AArch64ISD::UADDV:
25921 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
25922 return;
25923 case AArch64ISD::SMINV:
25924 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
25925 return;
25926 case AArch64ISD::UMINV:
25927 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
25928 return;
25929 case AArch64ISD::SMAXV:
25930 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
25931 return;
25932 case AArch64ISD::UMAXV:
25933 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
25934 return;
25935 case ISD::MULHS:
25936 if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
25937 Results.push_back(
25938 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
25939 return;
25940 case ISD::MULHU:
25941 if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
25942 Results.push_back(
25943 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
25944 return;
25945 case ISD::FP_TO_UINT:
25946 case ISD::FP_TO_SINT:
25947 case ISD::STRICT_FP_TO_SINT:
25948 case ISD::STRICT_FP_TO_UINT:
25949 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
25950 // Let normal code take care of it by not adding anything to Results.
25951 return;
25952 case ISD::ATOMIC_CMP_SWAP:
25953 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
25954 return;
25955 case ISD::ATOMIC_LOAD_CLR:
25956 assert(N->getValueType(0) != MVT::i128 &&
25957 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
25958 break;
25959 case ISD::ATOMIC_LOAD_AND:
25960 case ISD::ATOMIC_LOAD_OR:
25961 case ISD::ATOMIC_SWAP: {
25962 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
25963 "Expected 128-bit atomicrmw.");
25964 // These need custom type legalisation so we go directly to instruction.
25965 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
25966 return;
25967 }
25968 case ISD::ATOMIC_LOAD:
25969 case ISD::LOAD: {
25970 MemSDNode *LoadNode = cast<MemSDNode>(N);
25971 EVT MemVT = LoadNode->getMemoryVT();
25972 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
25973 // targets.
25974 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
25975 MemVT.getSizeInBits() == 256u &&
25976 (MemVT.getScalarSizeInBits() == 8u ||
25977 MemVT.getScalarSizeInBits() == 16u ||
25978 MemVT.getScalarSizeInBits() == 32u ||
25979 MemVT.getScalarSizeInBits() == 64u)) {
25980 
25981 SDValue Result = DAG.getMemIntrinsicNode(
25982 AArch64ISD::LDNP, SDLoc(N),
25983 DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25984 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25985 MVT::Other}),
25986 {LoadNode->getChain(), LoadNode->getBasePtr()},
25987 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25988
25989 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
25990 Result.getValue(0), Result.getValue(1));
25991 Results.append({Pair, Result.getValue(2) /* Chain */});
25992 return;
25993 }
25994
25995 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
25996 LoadNode->getMemoryVT() != MVT::i128) {
25997 // Non-volatile, non-atomic 128-bit loads are optimized later in AArch64's
25998 // load/store optimizer.
25999 return;
26000 }
26001
26002 if (SDValue(N, 0).getValueType() == MVT::i128) {
26003 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
26004 bool isLoadAcquire =
26005 AN && AN->getSuccessOrdering() == AtomicOrdering::Acquire;
26006 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
26007
26008 if (isLoadAcquire)
26009 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
26010
26011 SDValue Result = DAG.getMemIntrinsicNode(
26012 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
26013 {LoadNode->getChain(), LoadNode->getBasePtr()},
26014 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
26015
26016 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
26017
26018 SDValue Pair =
26019 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
26020 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
26021 Results.append({Pair, Result.getValue(2) /* Chain */});
26022 }
26023 return;
26024 }
26025 case ISD::EXTRACT_SUBVECTOR:
26026 ReplaceExtractSubVectorResults(N, Results, DAG);
26027 return;
26028 case ISD::INSERT_SUBVECTOR:
26029 case ISD::CONCAT_VECTORS:
26030 // Custom lowering has been requested for INSERT_SUBVECTOR and
26031 // CONCAT_VECTORS -- but delegate to common code for result type
26032 // legalisation
26033 return;
26034 case ISD::INTRINSIC_WO_CHAIN: {
26035 EVT VT = N->getValueType(0);
26036
26037 Intrinsic::ID IntID =
26038 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
26039 switch (IntID) {
26040 default:
26041 return;
26042 case Intrinsic::aarch64_sve_clasta_n: {
26043 assert((VT == MVT::i8 || VT == MVT::i16) &&
26044 "custom lowering for unexpected type");
26045 SDLoc DL(N);
26046 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
26047 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
26048 N->getOperand(1), Op2, N->getOperand(3));
26049 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
26050 return;
26051 }
26052 case Intrinsic::aarch64_sve_clastb_n: {
26053 assert((VT == MVT::i8 || VT == MVT::i16) &&
26054 "custom lowering for unexpected type");
26055 SDLoc DL(N);
26056 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
26057 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
26058 N->getOperand(1), Op2, N->getOperand(3));
26059 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
26060 return;
26061 }
26062 case Intrinsic::aarch64_sve_lasta: {
26063 assert((VT == MVT::i8 || VT == MVT::i16) &&
26064 "custom lowering for unexpected type");
26065 SDLoc DL(N);
26066 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
26067 N->getOperand(1), N->getOperand(2));
26068 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
26069 return;
26070 }
26071 case Intrinsic::aarch64_sve_lastb: {
26072 assert((VT == MVT::i8 || VT == MVT::i16) &&
26073 "custom lowering for unexpected type");
26074 SDLoc DL(N);
26075 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
26076 N->getOperand(1), N->getOperand(2));
26077 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
26078 return;
26079 }
26080 case Intrinsic::get_active_lane_mask: {
26081 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
26082 return;
26083
26084 // NOTE: Only trivial type promotion is supported.
26085 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
26086 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
26087 return;
26088
26089 SDLoc DL(N);
26090 auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
26091 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
26092 return;
26093 }
26094 }
26095 }
26096 case ISD::READ_REGISTER: {
26097 SDLoc DL(N);
26098 assert(N->getValueType(0) == MVT::i128 &&
26099 "READ_REGISTER custom lowering is only for 128-bit sysregs");
26100 SDValue Chain = N->getOperand(0);
26101 SDValue SysRegName = N->getOperand(1);
26102
26103 SDValue Result = DAG.getNode(
26104 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
26105 Chain, SysRegName);
26106
26107 // Sysregs are not endian. Result.getValue(0) always contains the lower half
26108 // of the 128-bit System Register value.
26109 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
26110 Result.getValue(0), Result.getValue(1));
26111 Results.push_back(Pair);
26112 Results.push_back(Result.getValue(2)); // Chain
26113 return;
26114 }
26115 }
26116}
26117
26118bool AArch64TargetLowering::useLoadStackGuardNode() const {
26119 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
26120 return TargetLowering::useLoadStackGuardNode();
26121 return true;
26122}
26123
26124unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
26125 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
26126 // reciprocal if there are three or more FDIVs.
26127 return 3;
26128}
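// E.g. (illustrative): with three or more divisions by the same d, such as
// a/d, b/d and c/d, it pays to materialize r = 1.0/d once and rewrite them as
// a*r, b*r and c*r; with only two divisions the extra FDIV for the reciprocal
// would not be profitable.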
26129
26130TargetLoweringBase::LegalizeTypeAction
26131AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
26132 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
26133 // v4i16, v2i32 instead of to promote.
26134 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
26135 VT == MVT::v1f32)
26136 return TypeWidenVector;
26137
26138 return TargetLoweringBase::getPreferredVectorAction(VT);
26139}
26140
26141// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
26142// provided the address is 16-byte aligned.
26143bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
26144 if (!Subtarget->hasLSE2())
26145 return false;
26146
26147 if (auto LI = dyn_cast<LoadInst>(I))
26148 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
26149 LI->getAlign() >= Align(16);
26150
26151 if (auto SI = dyn_cast<StoreInst>(I))
26152 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
26153 SI->getAlign() >= Align(16);
26154
26155 return false;
26156}
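// For example (sketch): a 16-byte aligned "load atomic i128, align 16" can be
// lowered to a single LDP on an LSE2 target because the paired access is
// single-copy atomic there, while an under-aligned or non-LSE2 access is not.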
26157
26158bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
26159 if (!Subtarget->hasLSE128())
26160 return false;
26161
26162 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
26163 // will clobber the two registers.
26164 if (const auto *SI = dyn_cast<StoreInst>(I))
26165 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
26166 SI->getAlign() >= Align(16) &&
26167 (SI->getOrdering() == AtomicOrdering::Release ||
26168 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
26169
26170 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
26171 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
26172 RMW->getAlign() >= Align(16) &&
26173 (RMW->getOperation() == AtomicRMWInst::Xchg ||
26174 RMW->getOperation() == AtomicRMWInst::And ||
26175 RMW->getOperation() == AtomicRMWInst::Or);
26176
26177 return false;
26178}
26179
26180bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
26181 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
26182 return false;
26183
26184 if (auto LI = dyn_cast<LoadInst>(I))
26185 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
26186 LI->getAlign() >= Align(16) &&
26187 LI->getOrdering() == AtomicOrdering::Acquire;
26188
26189 if (auto SI = dyn_cast<StoreInst>(I))
26190 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
26191 SI->getAlign() >= Align(16) &&
26192 SI->getOrdering() == AtomicOrdering::Release;
26193
26194 return false;
26195}
26196
26197bool AArch64TargetLowering::shouldInsertFencesForAtomic(
26198 const Instruction *I) const {
26199 if (isOpSuitableForRCPC3(I))
26200 return false;
26201 if (isOpSuitableForLSE128(I))
26202 return false;
26203 if (isOpSuitableForLDPSTP(I))
26204 return true;
26205 return false;
26206}
26207
26208bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
26209 const Instruction *I) const {
26210 // Store-Release instructions only provide seq_cst guarantees when paired with
26211 // Load-Acquire instructions. MSVC CRT does not use these instructions to
26212 // implement seq_cst loads and stores, so we need additional explicit fences
26213 // after memory writes.
26214 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26215 return false;
26216
26217 switch (I->getOpcode()) {
26218 default:
26219 return false;
26220 case Instruction::AtomicCmpXchg:
26221 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
26222 AtomicOrdering::SequentiallyConsistent;
26223 case Instruction::AtomicRMW:
26224 return cast<AtomicRMWInst>(I)->getOrdering() ==
26225 AtomicOrdering::SequentiallyConsistent;
26226 case Instruction::Store:
26227 return cast<StoreInst>(I)->getOrdering() ==
26228 AtomicOrdering::SequentiallyConsistent;
26229 }
26230}
26231
26232// Loads and stores less than 128-bits are already atomic; ones above that
26233// are doomed anyway, so defer to the default libcall and blame the OS when
26234// things go wrong.
26235TargetLowering::AtomicExpansionKind
26236AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
26237 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
26238 if (Size != 128)
26239 return AtomicExpansionKind::None;
26240 if (isOpSuitableForRCPC3(SI))
26241 return AtomicExpansionKind::None;
26242 if (isOpSuitableForLSE128(SI))
26243 return AtomicExpansionKind::Expand;
26244 if (isOpSuitableForLDPSTP(SI))
26245 return AtomicExpansionKind::None;
26246 return AtomicExpansionKind::Expand;
26247}
26248
26249// Loads and stores less than 128-bits are already atomic; ones above that
26250// are doomed anyway, so defer to the default libcall and blame the OS when
26251// things go wrong.
26252TargetLowering::AtomicExpansionKind
26253AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
26254 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
26255
26256 if (Size != 128)
26257 return AtomicExpansionKind::None;
26258 if (isOpSuitableForRCPC3(LI))
26259 return AtomicExpansionKind::None;
26260 // No LSE128 loads
26261 if (isOpSuitableForLDPSTP(LI))
26262 return AtomicExpansionKind::None;
26263
26264 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
26265 // implement atomicrmw without spilling. If the target address is also on the
26266 // stack and close enough to the spill slot, this can lead to a situation
26267 // where the monitor always gets cleared and the atomic operation can never
26268 // succeed. So at -O0 lower this operation to a CAS loop.
26269 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
26270 return AtomicExpansionKind::CmpXChg;
26271
26272 // Using CAS for an atomic load has a better chance of succeeding under high
26273 // contention situations. So use it if available.
26274 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
26275 : AtomicExpansionKind::LLSC;
26276}
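// [Editor's illustrative sketch -- not part of AArch64ISelLowering.cpp.]
// The decision table of shouldExpandAtomicLoadInIR above, distilled into a
// standalone function so the policy is easy to read at a glance. All names
// here are hypothetical; only the logic mirrors the code above.
#if 0
enum class LoadExpansion { None, CmpXChg, LLSC };

LoadExpansion classifyAtomicLoad(unsigned SizeInBits, bool SuitsRCPC3,
                                 bool SuitsLDPSTP, bool OptNone, bool HasLSE) {
  if (SizeInBits != 128)
    return LoadExpansion::None;    // Narrower loads are already atomic.
  if (SuitsRCPC3 || SuitsLDPSTP)
    return LoadExpansion::None;    // A single 128-bit instruction can do it.
  if (OptNone)
    return LoadExpansion::CmpXChg; // Avoid LL/SC live ranges at -O0.
  return HasLSE ? LoadExpansion::CmpXChg : LoadExpansion::LLSC;
}
#endif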
26277
26278// The "default" for integer RMW operations is to expand to an LL/SC loop.
26279// However, with the LSE instructions (or outline-atomics mode, which provides
26280// library routines in place of the LSE-instructions), we can directly emit many
26281// operations instead.
26282//
26283// Floating-point operations are always emitted to a cmpxchg loop, because they
26284// may trigger a trap which aborts an LLSC sequence.
26285TargetLowering::AtomicExpansionKind
26286AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
26287 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
26288 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
26289
26290 if (AI->isFloatingPointOperation())
26291 return AtomicExpansionKind::CmpXChg;
26292
26293 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
26294 (AI->getOperation() == AtomicRMWInst::Xchg ||
26295 AI->getOperation() == AtomicRMWInst::Or ||
26296 AI->getOperation() == AtomicRMWInst::And);
26297 if (CanUseLSE128)
26298 return AtomicExpansionKind::None;
26299
26300 // Nand is not supported in LSE.
26301 // Leave 128 bits to LLSC or CmpXChg.
26302 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
26303 if (Subtarget->hasLSE())
26304 return AtomicExpansionKind::None;
26305 if (Subtarget->outlineAtomics()) {
26306 // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
26307 // Don't outline them unless
26308 // (1) high level <atomic> support approved:
26309 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
26310 // (2) low level libgcc and compiler-rt support implemented by:
26311 // min/max outline atomics helpers
26312 if (AI->getOperation() != AtomicRMWInst::Min &&
26313 AI->getOperation() != AtomicRMWInst::Max &&
26314 AI->getOperation() != AtomicRMWInst::UMin &&
26315 AI->getOperation() != AtomicRMWInst::UMax) {
26316 return AtomicExpansionKind::None;
26317 }
26318 }
26319 }
26320
26321 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
26322 // implement atomicrmw without spilling. If the target address is also on the
26323 // stack and close enough to the spill slot, this can lead to a situation
26324 // where the monitor always gets cleared and the atomic operation can never
26325 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
26326 // we have a single CAS instruction that can replace the loop.
26327 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None &&
26328 Subtarget->hasLSE())
26329 return AtomicExpansionKind::CmpXChg;
26330
26331 return AtomicExpansionKind::LLSC;
26332}
26333
26334TargetLowering::AtomicExpansionKind
26335AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
26336 AtomicCmpXchgInst *AI) const {
26337 // If subtarget has LSE, leave cmpxchg intact for codegen.
26338 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
26339 return AtomicExpansionKind::None;
26340 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
26341 // implement cmpxchg without spilling. If the address being exchanged is also
26342 // on the stack and close enough to the spill slot, this can lead to a
26343 // situation where the monitor always gets cleared and the atomic operation
26344 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
26345 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
26346 return AtomicExpansionKind::None;
26347
26348 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
26349 // it.
26350 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
26351 if (Size > 64)
26352 return AtomicExpansionKind::None;
26353
26354 return AtomicExpansionKind::LLSC;
26355}
26356
26357Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
26358 Type *ValueTy, Value *Addr,
26359 AtomicOrdering Ord) const {
26360 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
26361 bool IsAcquire = isAcquireOrStronger(Ord);
26362
26363 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
26364 // intrinsic must return {i64, i64} and we have to recombine them into a
26365 // single i128 here.
26366 if (ValueTy->getPrimitiveSizeInBits() == 128) {
26367 Intrinsic::ID Int =
26368 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
26369 Function *Ldxr = Intrinsic::getDeclaration(M, Int);
26370
26371 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
26372
26373 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
26374 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
26375 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
26376 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
26377 return Builder.CreateOr(
26378 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
26379 }
26380
26381 Type *Tys[] = { Addr->getType() };
26382 Intrinsic::ID Int =
26383 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
26384 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
26385
26386 const DataLayout &DL = M->getDataLayout();
26387 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
26388 CallInst *CI = Builder.CreateCall(Ldxr, Addr);
26389 CI->addParamAttr(
26390 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
26391 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
26392
26393 return Builder.CreateBitCast(Trunc, ValueTy);
26394}
26395
26396void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
26397 IRBuilderBase &Builder) const {
26398 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
26399 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
26400}
26401
26402Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
26403 Value *Val, Value *Addr,
26404 AtomicOrdering Ord) const {
26405 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
26406 bool IsRelease = isReleaseOrStronger(Ord);
26407
26408 // Since the intrinsics must have legal type, the i128 intrinsics take two
26409 // parameters: "i64, i64". We must marshal Val into the appropriate form
26410 // before the call.
26411 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
26412 Intrinsic::ID Int =
26413 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
26414 Function *Stxr = Intrinsic::getDeclaration(M, Int);
26415 Type *Int64Ty = Type::getInt64Ty(M->getContext());
26416
26417 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
26418 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
26419 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
26420 }
26421
26422 Intrinsic::ID Int =
26423 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
26424 Type *Tys[] = { Addr->getType() };
26425 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
26426
26427 const DataLayout &DL = M->getDataLayout();
26428 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
26429 Val = Builder.CreateBitCast(Val, IntValTy);
26430
26431 CallInst *CI = Builder.CreateCall(
26432 Stxr, {Builder.CreateZExtOrBitCast(
26433 Val, Stxr->getFunctionType()->getParamType(0)),
26434 Addr});
26435 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
26436 Attribute::ElementType, Val->getType()));
26437 return CI;
26438}
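// [Editor's illustrative sketch -- not part of AArch64ISelLowering.cpp.]
// Scalar model of the i128 marshalling done by emitLoadLinked and
// emitStoreConditional above: LDXP/LDAXP return two i64 halves that are
// recombined with zext/shl/or, and STXP/STLXP take a value split back into
// halves with trunc/lshr. Assumes the Clang/GCC unsigned __int128 extension.
#if 0
#include <cstdint>

static unsigned __int128 recombine(uint64_t Lo, uint64_t Hi) {
  return ((unsigned __int128)Hi << 64) | Lo; // mirrors CreateShl + CreateOr
}

static void split(unsigned __int128 Val, uint64_t &Lo, uint64_t &Hi) {
  Lo = (uint64_t)Val;         // mirrors CreateTrunc
  Hi = (uint64_t)(Val >> 64); // mirrors CreateLShr + CreateTrunc
}
#endif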
26439
26440bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
26441 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
26442 const DataLayout &DL) const {
26443 if (!Ty->isArrayTy()) {
26444 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
26445 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
26446 }
26447
26448 // All non aggregate members of the type must have the same type
26449 SmallVector<EVT> ValueVTs;
26450 ComputeValueVTs(*this, DL, Ty, ValueVTs);
26451 return all_equal(ValueVTs);
26452}
26453
26454bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
26455 EVT) const {
26456 return false;
26457}
26458
26459static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
26460 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
26461 Function *ThreadPointerFunc =
26462 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
26463 return IRB.CreatePointerCast(
26464 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
26465 Offset),
26466 IRB.getPtrTy(0));
26467}
26468
26469Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
26470 // Android provides a fixed TLS slot for the stack cookie. See the definition
26471 // of TLS_SLOT_STACK_GUARD in
26472 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
26473 if (Subtarget->isTargetAndroid())
26474 return UseTlsOffset(IRB, 0x28);
26475
26476 // Fuchsia is similar.
26477 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
26478 if (Subtarget->isTargetFuchsia())
26479 return UseTlsOffset(IRB, -0x10);
26480
26481 return TargetLowering::getIRStackGuard(IRB);
26482}
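// [Editor's illustrative sketch -- not part of AArch64ISelLowering.cpp.]
// What UseTlsOffset builds, written as source: the stack cookie lives at a
// fixed offset from the thread pointer (0x28 on Android, -0x10 on Fuchsia,
// per the code above). Assumes the __builtin_thread_pointer builtin.
#if 0
#include <cstddef>

static void *readStackGuardSlot(std::ptrdiff_t Offset) {
  char *TP = (char *)__builtin_thread_pointer();
  return *(void **)(TP + Offset); // e.g. Offset = 0x28 on Android
}
#endif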
26483
26484void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
26485 // MSVC CRT provides functionalities for stack protection.
26486 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
26487 // MSVC CRT has a global variable holding security cookie.
26488 M.getOrInsertGlobal("__security_cookie",
26489 PointerType::getUnqual(M.getContext()));
26490
26491 // MSVC CRT has a function to validate security cookie.
26492 FunctionCallee SecurityCheckCookie =
26493 M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
26494 Type::getVoidTy(M.getContext()),
26495 PointerType::getUnqual(M.getContext()));
26496 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
26497 F->setCallingConv(CallingConv::Win64);
26498 F->addParamAttr(0, Attribute::AttrKind::InReg);
26499 }
26500 return;
26501 }
26502 TargetLowering::insertSSPDeclarations(M);
26503}
26504
26505Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
26506 // MSVC CRT has a global variable holding security cookie.
26507 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26508 return M.getGlobalVariable("__security_cookie");
26509 return TargetLowering::getSDagStackGuard(M);
26510}
26511
26512Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
26513 // MSVC CRT has a function to validate security cookie.
26514 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26515 return M.getFunction(Subtarget->getSecurityCheckCookieName());
26516 return TargetLowering::getSSPStackGuardCheck(M);
26517}
26518
26519Value *
26520AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
26521 // Android provides a fixed TLS slot for the SafeStack pointer. See the
26522 // definition of TLS_SLOT_SAFESTACK in
26523 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
26524 if (Subtarget->isTargetAndroid())
26525 return UseTlsOffset(IRB, 0x48);
26526
26527 // Fuchsia is similar.
26528 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
26529 if (Subtarget->isTargetFuchsia())
26530 return UseTlsOffset(IRB, -0x8);
26531
26532 return TargetLowering::getSafeStackPointerLocation(IRB);
26533}
26534
26535bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
26536 const Instruction &AndI) const {
26537 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
26538 // this is likely to be fold the and/cmp/br into a single tbz instruction. It
26539 // may be beneficial to sink in other cases, but we would have to check that
26540 // the cmp would not get folded into the br to form a cbz for these to be
26541 // beneficial.
26542 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
26543 if (!Mask)
26544 return false;
26545 return Mask->getValue().isPowerOf2();
26546}
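// [Editor's illustrative sketch -- not part of AArch64ISelLowering.cpp.]
// The shape of code the hook above is aimed at: a single-bit mask feeding a
// compare-with-zero and a branch, which AArch64 can typically fold into one
// TBZ/TBNZ test-bit-and-branch instruction.
#if 0
#include <cstdint>

int dispatch(uint64_t Flags) {
  if (Flags & (1u << 5)) // power-of-two mask: and + cmp + br -> tbnz
    return 1;
  return 0;
}
#endif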
26547
26548bool AArch64TargetLowering::
26549 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
26550 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
26551 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
26552 SelectionDAG &DAG) const {
26553 // Does baseline recommend not to perform the fold by default?
26554 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
26555 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
26556 return false;
26557 // Else, if this is a vector shift, prefer 'shl'.
26558 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
26559}
26560
26561TargetLowering::ShiftLegalizationStrategy
26562AArch64TargetLowering::preferredShiftLegalizationStrategy(
26563 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
26564 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
26565 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
26566 return ShiftLegalizationStrategy::LowerToLibcall;
26567 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
26568 ExpansionFactor);
26569}
26570
26571void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
26572 // Update IsSplitCSR in AArch64FunctionInfo.
26573 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
26574 AFI->setIsSplitCSR(true);
26575}
26576
26577void AArch64TargetLowering::insertCopiesSplitCSR(
26578 MachineBasicBlock *Entry,
26579 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
26580 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
26581 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
26582 if (!IStart)
26583 return;
26584
26585 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
26586 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
26587 MachineBasicBlock::iterator MBBI = Entry->begin();
26588 for (const MCPhysReg *I = IStart; *I; ++I) {
26589 const TargetRegisterClass *RC = nullptr;
26590 if (AArch64::GPR64RegClass.contains(*I))
26591 RC = &AArch64::GPR64RegClass;
26592 else if (AArch64::FPR64RegClass.contains(*I))
26593 RC = &AArch64::FPR64RegClass;
26594 else
26595 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
26596
26597 Register NewVR = MRI->createVirtualRegister(RC);
26598 // Create copy from CSR to a virtual register.
26599 // FIXME: this currently does not emit CFI pseudo-instructions, it works
26600 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
26601 // nounwind. If we want to generalize this later, we may need to emit
26602 // CFI pseudo-instructions.
26603 assert(Entry->getParent()->getFunction().hasFnAttribute(
26604 Attribute::NoUnwind) &&
26605 "Function should be nounwind in insertCopiesSplitCSR!");
26606 Entry->addLiveIn(*I);
26607 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
26608 .addReg(*I);
26609
26610 // Insert the copy-back instructions right before the terminator.
26611 for (auto *Exit : Exits)
26612 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
26613 TII->get(TargetOpcode::COPY), *I)
26614 .addReg(NewVR);
26615 }
26616}
26617
26618bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
26619 // Integer division on AArch64 is expensive. However, when aggressively
26620 // optimizing for code size, we prefer to use a div instruction, as it is
26621 // usually smaller than the alternative sequence.
26622 // The exception to this is vector division. Since AArch64 doesn't have vector
26623 // integer division, leaving the division as-is is a loss even in terms of
26624 // size, because it will have to be scalarized, while the alternative code
26625 // sequence can be performed in vector form.
26626 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
26627 return OptSize && !VT.isVector();
26628}
26629
26630bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
26631 // We want inc-of-add for scalars and sub-of-not for vectors.
26632 return VT.isScalarInteger();
26633}
26634
26635bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
26636 EVT VT) const {
26637 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
26638 // legalize.
26639 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
26640 return false;
26641 if (FPVT == MVT::v8bf16)
26642 return false;
26643 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
26644}
26645
26646MachineInstr *
26647AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
26648 MachineBasicBlock::instr_iterator &MBBI,
26649 const TargetInstrInfo *TII) const {
26650 assert(MBBI->isCall() && MBBI->getCFIType() &&
26651 "Invalid call instruction for a KCFI check");
26652
26653 switch (MBBI->getOpcode()) {
26654 case AArch64::BLR:
26655 case AArch64::BLRNoIP:
26656 case AArch64::TCRETURNri:
26657 case AArch64::TCRETURNrix16x17:
26658 case AArch64::TCRETURNrix17:
26659 case AArch64::TCRETURNrinotx16:
26660 break;
26661 default:
26662 llvm_unreachable("Unexpected CFI call opcode");
26663 }
26664
26665 MachineOperand &Target = MBBI->getOperand(0);
26666 assert(Target.isReg() && "Invalid target operand for an indirect call");
26667 Target.setIsRenamable(false);
26668
26669 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
26670 .addReg(Target.getReg())
26671 .addImm(MBBI->getCFIType())
26672 .getInstr();
26673}
26674
26675bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
26676 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
26677}
26678
26679unsigned
26680AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
26681 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
26682 return getPointerTy(DL).getSizeInBits();
26683
26684 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
26685}
26686
26687void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
26688 MachineFrameInfo &MFI = MF.getFrameInfo();
26689 // If we have any vulnerable SVE stack objects then the stack protector
26690 // needs to be placed at the top of the SVE stack area, as the SVE locals
26691 // are placed above the other locals, so we allocate it as if it were a
26692 // scalable vector.
26693 // FIXME: It may be worthwhile having a specific interface for this rather
26694 // than doing it here in finalizeLowering.
26695 if (MFI.hasStackProtectorIndex()) {
26696 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
26697 if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
26698 MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
26699 MFI.setStackID(MFI.getStackProtectorIndex(),
26700 TargetStackID::ScalableVector);
26701 MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
26702 break;
26703 }
26704 }
26705 }
26706 MFI.computeMaxCallFrameSize(MF);
26707 TargetLoweringBase::finalizeLowering(MF);
26708}
26709
26710// Unlike X86, we let frame lowering assign offsets to all catch objects.
26711bool AArch64TargetLowering::needsFixedCatchObjects() const {
26712 return false;
26713}
26714
26715bool AArch64TargetLowering::shouldLocalize(
26716 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
26717 auto &MF = *MI.getMF();
26718 auto &MRI = MF.getRegInfo();
26719 auto maxUses = [](unsigned RematCost) {
26720 // A cost of 1 means remats are basically free.
26721 if (RematCost == 1)
26722 return std::numeric_limits<unsigned>::max();
26723 if (RematCost == 2)
26724 return 2U;
26725
26726 // Remat is too expensive, only sink if there's one user.
26727 if (RematCost > 2)
26728 return 1U;
26729 llvm_unreachable("Unexpected remat cost");
26730 };
26731
26732 unsigned Opc = MI.getOpcode();
26733 switch (Opc) {
26734 case TargetOpcode::G_GLOBAL_VALUE: {
26735 // On Darwin, TLS global vars get selected into function calls, which
26736 // we don't want localized, as they can get moved into the middle of a
26737 // another call sequence.
26738 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
26739 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
26740 return false;
26741 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
26742 }
26743 case TargetOpcode::G_FCONSTANT:
26744 case TargetOpcode::G_CONSTANT: {
26745 const ConstantInt *CI;
26746 unsigned AdditionalCost = 0;
26747
26748 if (Opc == TargetOpcode::G_CONSTANT)
26749 CI = MI.getOperand(1).getCImm();
26750 else {
26751 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
26752 // We try to estimate cost of 32/64b fpimms, as they'll likely be
26753 // materialized as integers.
26754 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
26755 break;
26756 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
26757 bool OptForSize =
26758 MF.getFunction().hasOptSize() || MF.getFunction().hasMinSize();
26759 if (isFPImmLegal(APF, EVT::getFloatingPointVT(Ty.getScalarSizeInBits()),
26760 OptForSize))
26761 return true; // Constant should be cheap.
26762 CI =
26763 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
26764 // FP materialization also costs an extra move, from gpr to fpr.
26765 AdditionalCost = 1;
26766 }
26767 APInt Imm = CI->getValue();
26768 InstructionCost Cost = TTI->getIntImmCost(
26769 Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
26770 assert(Cost.isValid() && "Expected a valid imm cost");
26771
26772 unsigned RematCost = *Cost.getValue();
26773 RematCost += AdditionalCost;
26774 Register Reg = MI.getOperand(0).getReg();
26775 unsigned MaxUses = maxUses(RematCost);
26776 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
26777 if (MaxUses == std::numeric_limits<unsigned>::max())
26778 --MaxUses;
26779 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
26780 }
26781 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
26782 // localizable.
26783 case AArch64::ADRP:
26784 case AArch64::G_ADD_LOW:
26785 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
26786 case TargetOpcode::G_PTR_ADD:
26787 return true;
26788 default:
26789 break;
26790 }
26791 return TargetLoweringBase::shouldLocalize(MI, TTI);
26792}
26793
26794bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
26795 // Fallback for scalable vectors.
26796 // Note that if EnableSVEGISel is true, we allow scalable vector types for
26797 // all instructions, regardless of whether they are actually supported.
26798 if (!EnableSVEGISel) {
26799 if (Inst.getType()->isScalableTy()) {
26800 return true;
26801 }
26802
26803 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
26804 if (Inst.getOperand(i)->getType()->isScalableTy())
26805 return true;
26806
26807 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
26808 if (AI->getAllocatedType()->isScalableTy())
26809 return true;
26810 }
26811 }
26812
26813 // Checks to allow the use of SME instructions
26814 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
26815 auto CallerAttrs = SMEAttrs(*Inst.getFunction());
26816 auto CalleeAttrs = SMEAttrs(*Base);
26817 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
26818 CallerAttrs.requiresLazySave(CalleeAttrs) ||
26819 CallerAttrs.requiresPreservingZT0(CalleeAttrs))
26820 return true;
26821 }
26822 return false;
26823}
26824
26825// Return the largest legal scalable vector type that matches VT's element type.
26829 "Expected legal fixed length vector!");
26830 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26831 default:
26832 llvm_unreachable("unexpected element type for SVE container");
26833 case MVT::i8:
26834 return EVT(MVT::nxv16i8);
26835 case MVT::i16:
26836 return EVT(MVT::nxv8i16);
26837 case MVT::i32:
26838 return EVT(MVT::nxv4i32);
26839 case MVT::i64:
26840 return EVT(MVT::nxv2i64);
26841 case MVT::bf16:
26842 return EVT(MVT::nxv8bf16);
26843 case MVT::f16:
26844 return EVT(MVT::nxv8f16);
26845 case MVT::f32:
26846 return EVT(MVT::nxv4f32);
26847 case MVT::f64:
26848 return EVT(MVT::nxv2f64);
26849 }
26850}
26851
26852// Return a PTRUE with active lanes corresponding to the extent of VT.
26853static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
26854 EVT VT) {
26855 assert(VT.isFixedLengthVector() &&
26856 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
26857 "Expected legal fixed length vector!");
26858
26859 std::optional<unsigned> PgPattern =
26860 getSVEPredPatternFromNumElements(VT.getVectorNumElements());
26861 assert(PgPattern && "Unexpected element count for SVE predicate");
26862
26863 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
26864 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
26865 // variants of instructions when available.
26866 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
26867 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
26868 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
26869 if (MaxSVESize && MinSVESize == MaxSVESize &&
26870 MaxSVESize == VT.getSizeInBits())
26871 PgPattern = AArch64SVEPredPattern::all;
26872
26873 MVT MaskVT;
26874 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26875 default:
26876 llvm_unreachable("unexpected element type for SVE predicate");
26877 case MVT::i8:
26878 MaskVT = MVT::nxv16i1;
26879 break;
26880 case MVT::i16:
26881 case MVT::f16:
26882 case MVT::bf16:
26883 MaskVT = MVT::nxv8i1;
26884 break;
26885 case MVT::i32:
26886 case MVT::f32:
26887 MaskVT = MVT::nxv4i1;
26888 break;
26889 case MVT::i64:
26890 case MVT::f64:
26891 MaskVT = MVT::nxv2i1;
26892 break;
26893 }
26894
26895 return getPTrue(DAG, DL, MaskVT, *PgPattern);
26896}
26897
26898static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
26899 EVT VT) {
26900 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
26901 "Expected legal scalable vector!");
26902 auto PredTy = VT.changeVectorElementType(MVT::i1);
26903 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
26904}
26905
26906static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
26907 if (VT.isFixedLengthVector())
26908 return getPredicateForFixedLengthVector(DAG, DL, VT);
26909
26910 return getPredicateForScalableVector(DAG, DL, VT);
26911}
26912
26913// Grow V to consume an entire SVE register.
26914static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
26915 assert(VT.isScalableVector() &&
26916 "Expected to convert into a scalable vector!");
26917 assert(V.getValueType().isFixedLengthVector() &&
26918 "Expected a fixed length vector operand!");
26919 SDLoc DL(V);
26920 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26921 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
26922}
26923
26924// Shrink V so it's just big enough to maintain a VT's worth of data.
26927 "Expected to convert into a fixed length vector!");
26928 assert(V.getValueType().isScalableVector() &&
26929 "Expected a scalable vector operand!");
26930 SDLoc DL(V);
26931 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26932 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
26933}
26934
26935// Convert all fixed length vector loads larger than NEON to masked_loads.
26936SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
26937 SDValue Op, SelectionDAG &DAG) const {
26938 auto Load = cast<LoadSDNode>(Op);
26939
26940 SDLoc DL(Op);
26941 EVT VT = Op.getValueType();
26942 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26943 EVT LoadVT = ContainerVT;
26944 EVT MemVT = Load->getMemoryVT();
26945
26946 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26947
26948 if (VT.isFloatingPoint()) {
26949 LoadVT = ContainerVT.changeTypeToInteger();
26950 MemVT = MemVT.changeTypeToInteger();
26951 }
26952
26953 SDValue NewLoad = DAG.getMaskedLoad(
26954 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
26955 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
26956 Load->getAddressingMode(), Load->getExtensionType());
26957
26958 SDValue Result = NewLoad;
26959 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
26960 EVT ExtendVT = ContainerVT.changeVectorElementType(
26961 Load->getMemoryVT().getVectorElementType());
26962
26963 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
26964 Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
26965 Pg, Result, DAG.getUNDEF(ContainerVT));
26966 } else if (VT.isFloatingPoint()) {
26967 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
26968 }
26969
26970 Result = convertFromScalableVector(DAG, VT, Result);
26971 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
26972 return DAG.getMergeValues(MergedValues, DL);
26973}
26974
26975static SDValue convertFixedMaskToScalableVector(SDValue Mask,
26976 SelectionDAG &DAG) {
26977 SDLoc DL(Mask);
26978 EVT InVT = Mask.getValueType();
26979 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26980
26981 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
26982
26983 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26984 return Pg;
26985
26986 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
26987 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
26988
26989 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
26990 {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
26991}
26992
26993// Convert all fixed length vector loads larger than NEON to masked_loads.
26994SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
26995 SDValue Op, SelectionDAG &DAG) const {
26996 auto Load = cast<MaskedLoadSDNode>(Op);
26997
26998 SDLoc DL(Op);
26999 EVT VT = Op.getValueType();
27000 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27001
27002 SDValue Mask = Load->getMask();
27003 // If this is an extending load and the mask type is not the same as
27004 // load's type then we have to extend the mask type.
27005 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
27006 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
27007 "Incorrect mask type");
27008 Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask);
27009 }
27010 Mask = convertFixedMaskToScalableVector(Mask, DAG);
27011
27012 SDValue PassThru;
27013 bool IsPassThruZeroOrUndef = false;
27014
27015 if (Load->getPassThru()->isUndef()) {
27016 PassThru = DAG.getUNDEF(ContainerVT);
27017 IsPassThruZeroOrUndef = true;
27018 } else {
27019 if (ContainerVT.isInteger())
27020 PassThru = DAG.getConstant(0, DL, ContainerVT);
27021 else
27022 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
27023 if (isZerosVector(Load->getPassThru().getNode()))
27024 IsPassThruZeroOrUndef = true;
27025 }
27026
27027 SDValue NewLoad = DAG.getMaskedLoad(
27028 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
27029 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
27030 Load->getAddressingMode(), Load->getExtensionType());
27031
27032 SDValue Result = NewLoad;
27033 if (!IsPassThruZeroOrUndef) {
27034 SDValue OldPassThru =
27035 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
27036 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
27037 }
27038
27039 Result = convertFromScalableVector(DAG, VT, Result);
27040 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
27041 return DAG.getMergeValues(MergedValues, DL);
27042}
27043
27044// Convert all fixed length vector stores larger than NEON to masked_stores.
27045SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
27046 SDValue Op, SelectionDAG &DAG) const {
27047 auto Store = cast<StoreSDNode>(Op);
27048
27049 SDLoc DL(Op);
27050 EVT VT = Store->getValue().getValueType();
27051 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27052 EVT MemVT = Store->getMemoryVT();
27053
27054 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27055 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
27056
27057 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
27058 EVT TruncVT = ContainerVT.changeVectorElementType(
27059 Store->getMemoryVT().getVectorElementType());
27060 MemVT = MemVT.changeTypeToInteger();
27061 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
27062 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
27063 DAG.getUNDEF(TruncVT));
27064 NewValue =
27065 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
27066 } else if (VT.isFloatingPoint()) {
27067 MemVT = MemVT.changeTypeToInteger();
27068 NewValue =
27069 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
27070 }
27071
27072 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
27073 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
27074 Store->getMemOperand(), Store->getAddressingMode(),
27075 Store->isTruncatingStore());
27076}
27077
27078SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
27079 SDValue Op, SelectionDAG &DAG) const {
27080 auto *Store = cast<MaskedStoreSDNode>(Op);
27081
27082 SDLoc DL(Op);
27083 EVT VT = Store->getValue().getValueType();
27084 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27085
27086 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
27087 SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
27088
27089 return DAG.getMaskedStore(
27090 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
27091 Mask, Store->getMemoryVT(), Store->getMemOperand(),
27092 Store->getAddressingMode(), Store->isTruncatingStore());
27093}
27094
27095SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
27096 SDValue Op, SelectionDAG &DAG) const {
27097 SDLoc dl(Op);
27098 EVT VT = Op.getValueType();
27099 EVT EltVT = VT.getVectorElementType();
27100
27101 bool Signed = Op.getOpcode() == ISD::SDIV;
27102 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
27103
27104 bool Negated;
27105 uint64_t SplatVal;
27106 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
27107 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27108 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
27109 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
27110
27111 SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
27112 SDValue Res =
27113 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
27114 if (Negated)
27115 Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
27116 DAG.getConstant(0, dl, ContainerVT), Res);
27117
27118 return convertFromScalableVector(DAG, VT, Res);
27119 }
27120
27121 // Scalable vector i32/i64 DIV is supported.
27122 if (EltVT == MVT::i32 || EltVT == MVT::i64)
27123 return LowerToPredicatedOp(Op, DAG, PredOpcode);
27124
27125 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
27126 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
27127 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
27128 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27129
27130 // If the wider type is legal: extend, op, and truncate.
27131 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
27132 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
27133 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
27134 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
27135 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
27136 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
27137 }
27138
27139 auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
27140 &ExtendOpcode](SDValue Op) {
27141 SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
27142 SDValue IdxHalf =
27143 DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
27144 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
27145 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
27146 return std::pair<SDValue, SDValue>(
27147 {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
27148 DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
27149 };
27150
27151 // If wider type is not legal: split, extend, op, trunc and concat.
27152 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
27153 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
27154 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
27155 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
27156 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
27157 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
27158 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
27159}
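// [Editor's illustrative sketch -- not part of AArch64ISelLowering.cpp.]
// The i8/i16 DIV strategy above in scalar form: SVE only divides i32 and i64
// elements, so narrow elements are sign/zero-extended, divided at the wider
// width, and truncated back; extending first cannot change the quotient.
#if 0
#include <cstddef>
#include <cstdint>

void sdiv_v_i8(const int8_t *A, const int8_t *B, int8_t *Out, size_t N) {
  for (size_t I = 0; I != N; ++I) {
    int32_t Wide = (int32_t)A[I] / (int32_t)B[I]; // extend, divide ...
    Out[I] = (int8_t)Wide;                        // ... and truncate
  }
}
#endif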
27160
27161SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
27162 SDValue Op, SelectionDAG &DAG) const {
27163 EVT VT = Op.getValueType();
27164 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27165
27166 SDLoc DL(Op);
27167 SDValue Val = Op.getOperand(0);
27168 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
27169 Val = convertToScalableVector(DAG, ContainerVT, Val);
27170
27171 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
27172 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
27173
27174 // Repeatedly unpack Val until the result is of the desired element type.
27175 switch (ContainerVT.getSimpleVT().SimpleTy) {
27176 default:
27177 llvm_unreachable("unimplemented container type");
27178 case MVT::nxv16i8:
27179 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
27180 if (VT.getVectorElementType() == MVT::i16)
27181 break;
27182 [[fallthrough]];
27183 case MVT::nxv8i16:
27184 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
27185 if (VT.getVectorElementType() == MVT::i32)
27186 break;
27187 [[fallthrough]];
27188 case MVT::nxv4i32:
27189 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
27190 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
27191 break;
27192 }
27193
27194 return convertFromScalableVector(DAG, VT, Val);
27195}
27196
27197SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
27198 SDValue Op, SelectionDAG &DAG) const {
27199 EVT VT = Op.getValueType();
27200 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27201
27202 SDLoc DL(Op);
27203 SDValue Val = Op.getOperand(0);
27204 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
27205 Val = convertToScalableVector(DAG, ContainerVT, Val);
27206
27207 // Repeatedly truncate Val until the result is of the desired element type.
27208 switch (ContainerVT.getSimpleVT().SimpleTy) {
27209 default:
27210 llvm_unreachable("unimplemented container type");
27211 case MVT::nxv2i64:
27212 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
27213 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
27214 if (VT.getVectorElementType() == MVT::i32)
27215 break;
27216 [[fallthrough]];
27217 case MVT::nxv4i32:
27218 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
27219 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
27220 if (VT.getVectorElementType() == MVT::i16)
27221 break;
27222 [[fallthrough]];
27223 case MVT::nxv8i16:
27224 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
27225 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
27226 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
27227 break;
27228 }
27229
27230 return convertFromScalableVector(DAG, VT, Val);
27231}
27232
27233SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
27234 SDValue Op, SelectionDAG &DAG) const {
27235 EVT VT = Op.getValueType();
27236 EVT InVT = Op.getOperand(0).getValueType();
27237 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
27238
27239 SDLoc DL(Op);
27240 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27241 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
27242
27243 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
27244}
27245
27246SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
27247 SDValue Op, SelectionDAG &DAG) const {
27248 EVT VT = Op.getValueType();
27249 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27250
27251 SDLoc DL(Op);
27252 EVT InVT = Op.getOperand(0).getValueType();
27253 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27254 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
27255
27256 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
27257 Op.getOperand(1), Op.getOperand(2));
27258
27259 return convertFromScalableVector(DAG, VT, ScalableRes);
27260}
27261
27262// Convert vector operation 'Op' to an equivalent predicated operation whereby
27263// the original operation's type is used to construct a suitable predicate.
27264// NOTE: The results for inactive lanes are undefined.
27265SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
27266 SelectionDAG &DAG,
27267 unsigned NewOp) const {
27268 EVT VT = Op.getValueType();
27269 SDLoc DL(Op);
27270 auto Pg = getPredicateForVector(DAG, DL, VT);
27271
27272 if (VT.isFixedLengthVector()) {
27273 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
27274 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27275
27276 // Create list of operands by converting existing ones to scalable types.
27277 SmallVector<SDValue, 4> Operands = {Pg};
27278 for (const SDValue &V : Op->op_values()) {
27279 if (isa<CondCodeSDNode>(V)) {
27280 Operands.push_back(V);
27281 continue;
27282 }
27283
27284 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
27285 EVT VTArg = VTNode->getVT().getVectorElementType();
27286 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
27287 Operands.push_back(DAG.getValueType(NewVTArg));
27288 continue;
27289 }
27290
27291 assert(isTypeLegal(V.getValueType()) &&
27292 "Expected only legal fixed-width types");
27293 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
27294 }
27295
27296 if (isMergePassthruOpcode(NewOp))
27297 Operands.push_back(DAG.getUNDEF(ContainerVT));
27298
27299 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
27300 return convertFromScalableVector(DAG, VT, ScalableRes);
27301 }
27302
27303 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
27304
27305 SmallVector<SDValue, 4> Operands = {Pg};
27306 for (const SDValue &V : Op->op_values()) {
27307 assert((!V.getValueType().isVector() ||
27308 V.getValueType().isScalableVector()) &&
27309 "Only scalable vectors are supported!");
27310 Operands.push_back(V);
27311 }
27312
27313 if (isMergePassthruOpcode(NewOp))
27314 Operands.push_back(DAG.getUNDEF(VT));
27315
27316 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
27317}
27318
27319// If a fixed length vector operation has no side effects when applied to
27320// undefined elements, we can safely use scalable vectors to perform the same
27321// operation without needing to worry about predication.
27322SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
27323 SelectionDAG &DAG) const {
27324 EVT VT = Op.getValueType();
27326 "Only expected to lower fixed length vector operation!");
27327 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27328
27329 // Create list of operands by converting existing ones to scalable types.
27330 SmallVector<SDValue, 4> Ops;
27331 for (const SDValue &V : Op->op_values()) {
27332 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
27333
27334 // Pass through non-vector operands.
27335 if (!V.getValueType().isVector()) {
27336 Ops.push_back(V);
27337 continue;
27338 }
27339
27340 // "cast" fixed length vector to a scalable vector.
27341 assert(V.getValueType().isFixedLengthVector() &&
27342 isTypeLegal(V.getValueType()) &&
27343 "Only fixed length vectors are supported!");
27344 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
27345 }
27346
27347 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
27348 return convertFromScalableVector(DAG, VT, ScalableRes);
27349}
27350
27351SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
27352 SelectionDAG &DAG) const {
27353 SDLoc DL(ScalarOp);
27354 SDValue AccOp = ScalarOp.getOperand(0);
27355 SDValue VecOp = ScalarOp.getOperand(1);
27356 EVT SrcVT = VecOp.getValueType();
27357 EVT ResVT = SrcVT.getVectorElementType();
27358
27359 EVT ContainerVT = SrcVT;
27360 if (SrcVT.isFixedLengthVector()) {
27361 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
27362 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
27363 }
27364
27365 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
27366 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
27367
27368 // Convert operands to Scalable.
27369 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
27370 DAG.getUNDEF(ContainerVT), AccOp, Zero);
27371
27372 // Perform reduction.
27373 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
27374 Pg, AccOp, VecOp);
27375
27376 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
27377}
27378
27379SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
27380 SelectionDAG &DAG) const {
27381 SDLoc DL(ReduceOp);
27382 SDValue Op = ReduceOp.getOperand(0);
27383 EVT OpVT = Op.getValueType();
27384 EVT VT = ReduceOp.getValueType();
27385
27386 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
27387 return SDValue();
27388
27389 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
27390
27391 switch (ReduceOp.getOpcode()) {
27392 default:
27393 return SDValue();
27394 case ISD::VECREDUCE_OR:
27395 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
27396 // The predicate can be 'Op' because
27397 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
27398 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
27399 else
27400 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
27401 case ISD::VECREDUCE_AND: {
27402 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
27403 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
27404 }
27405 case ISD::VECREDUCE_XOR: {
27406 SDValue ID =
27407 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
27408 if (OpVT == MVT::nxv1i1) {
27409 // Emulate a CNTP on .Q using .D and a different governing predicate.
27410 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
27411 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
27412 }
27413 SDValue Cntp =
27414 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
27415 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
27416 }
27417 }
27418
27419 return SDValue();
27420}
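// [Editor's illustrative sketch -- not part of AArch64ISelLowering.cpp.]
// The VECREDUCE_AND lowering above relies on the identity all_of(x) ==
// none_of(!x): XOR-ing the predicate with an all-true predicate inverts each
// lane, and PTEST/NONE_ACTIVE then checks that no inverted lane is set.
#if 0
#include <vector>

bool reduceAnd(const std::vector<bool> &X) {
  bool AnyInverted = false;
  for (bool B : X)
    AnyInverted |= !B; // xor with all-true == per-lane logical not
  return !AnyInverted; // NONE_ACTIVE on the inverted predicate
}
#endif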
27421
27422SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
27423 SDValue ScalarOp,
27424 SelectionDAG &DAG) const {
27425 SDLoc DL(ScalarOp);
27426 SDValue VecOp = ScalarOp.getOperand(0);
27427 EVT SrcVT = VecOp.getValueType();
27428
27429 if (useSVEForFixedLengthVectorVT(
27430 SrcVT,
27431 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
27432 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
27433 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
27434 }
27435
27436 // UADDV always returns an i64 result.
27437 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
27438 SrcVT.getVectorElementType();
27439 EVT RdxVT = SrcVT;
27440 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
27441 RdxVT = getPackedSVEVectorVT(ResVT);
27442
27443 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
27444 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
27445 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
27446 Rdx, DAG.getConstant(0, DL, MVT::i64));
27447
27448 // The VEC_REDUCE nodes expect an element size result.
27449 if (ResVT != ScalarOp.getValueType())
27450 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
27451
27452 return Res;
27453}
27454
27455SDValue
27456AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
27457 SelectionDAG &DAG) const {
27458 EVT VT = Op.getValueType();
27459 SDLoc DL(Op);
27460
27461 EVT InVT = Op.getOperand(1).getValueType();
27462 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27463 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
27464 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
27465
27466 // Convert the mask to a predicated (NOTE: We don't need to worry about
27467 // inactive lanes since VSELECT is safe when given undefined elements).
27468 EVT MaskVT = Op.getOperand(0).getValueType();
27469 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
27470 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
27471 Mask = DAG.getNode(ISD::TRUNCATE, DL,
27472 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
27473
27474 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
27475 Mask, Op1, Op2);
27476
27477 return convertFromScalableVector(DAG, VT, ScalableRes);
27478}
27479
27480SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
27481 SDValue Op, SelectionDAG &DAG) const {
27482 SDLoc DL(Op);
27483 EVT InVT = Op.getOperand(0).getValueType();
27484 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27485
27486 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
27487 "Only expected to lower fixed length vector operation!");
27488 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
27489 "Expected integer result of the same bit length as the inputs!");
27490
27491 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
27492 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
27493 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
27494
27495 EVT CmpVT = Pg.getValueType();
27496 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
27497 {Pg, Op1, Op2, Op.getOperand(2)});
27498
27499 EVT PromoteVT = ContainerVT.changeTypeToInteger();
27500 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
27501 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
27502}
27503
27504SDValue
27505AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
27506 SelectionDAG &DAG) const {
27507 SDLoc DL(Op);
27508 auto SrcOp = Op.getOperand(0);
27509 EVT VT = Op.getValueType();
27510 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27511 EVT ContainerSrcVT =
27512 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
27513
27514 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
27515 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
27516 return convertFromScalableVector(DAG, VT, Op);
27517}
27518
27519SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
27520 SDValue Op, SelectionDAG &DAG) const {
27521 SDLoc DL(Op);
27522 unsigned NumOperands = Op->getNumOperands();
27523
27524 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
27525 "Unexpected number of operands in CONCAT_VECTORS");
27526
27527 auto SrcOp1 = Op.getOperand(0);
27528 auto SrcOp2 = Op.getOperand(1);
27529 EVT VT = Op.getValueType();
27530 EVT SrcVT = SrcOp1.getValueType();
27531
27532 if (NumOperands > 2) {
27533 SmallVector<SDValue, 4> Ops;
27534 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
27535 for (unsigned I = 0; I < NumOperands; I += 2)
27536 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
27537 Op->getOperand(I), Op->getOperand(I + 1)));
27538
27539 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
27540 }
27541
27542 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27543
27544 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27545 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
27546 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
27547
27548 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
27549
27550 return convertFromScalableVector(DAG, VT, Op);
27551}
27552
27553SDValue
27554AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
27555 SelectionDAG &DAG) const {
27556 EVT VT = Op.getValueType();
27557 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27558
27559 SDLoc DL(Op);
27560 SDValue Val = Op.getOperand(0);
27561 SDValue Pg = getPredicateForVector(DAG, DL, VT);
27562 EVT SrcVT = Val.getValueType();
27563 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27564 EVT ExtendVT = ContainerVT.changeVectorElementType(
27565 SrcVT.getVectorElementType());
27566
27567 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27568 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
27569
27570 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
27571 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
27572 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
27573 Pg, Val, DAG.getUNDEF(ContainerVT));
27574
27575 return convertFromScalableVector(DAG, VT, Val);
27576}
27577
27578SDValue
27579AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
27580 SelectionDAG &DAG) const {
27581 EVT VT = Op.getValueType();
27582 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27583
27584 SDLoc DL(Op);
27585 SDValue Val = Op.getOperand(0);
27586 EVT SrcVT = Val.getValueType();
27587 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27588 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
27589 VT.getVectorElementType());
27590 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
27591
27592 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27593 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
27594 Op.getOperand(1), DAG.getUNDEF(RoundVT));
27595 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
27596 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27597
27598 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27599 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27600}
27601
27602SDValue
27603AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
27604 SelectionDAG &DAG) const {
27605 EVT VT = Op.getValueType();
27606 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27607
27608 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
27609 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
27610 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
27611
27612 SDLoc DL(Op);
27613 SDValue Val = Op.getOperand(0);
27614 EVT SrcVT = Val.getValueType();
27615 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27616 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27617
27618 if (VT.bitsGE(SrcVT)) {
27619 SDValue Pg = getPredicateForVector(DAG, DL, VT);
27620
27621 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
27622 VT.changeTypeToInteger(), Val);
27623
27624 // Safe to use a larger than specified operand because by promoting the
27625 // value nothing has changed from an arithmetic point of view.
27626 Val =
27627 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
27628 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27629 DAG.getUNDEF(ContainerDstVT));
27630 return convertFromScalableVector(DAG, VT, Val);
27631 } else {
27632 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
27633 ContainerDstVT.getVectorElementType());
27634 SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
27635
27636 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27637 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27638 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
27639 Val = convertFromScalableVector(DAG, SrcVT, Val);
27640
27641 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27642 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27643 }
27644}
27645
27646SDValue
27647AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
27648 SelectionDAG &DAG) const {
27649 SDLoc DL(Op);
27650 EVT OpVT = Op.getValueType();
27651 assert(OpVT.isScalableVector() &&
27652 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
27653 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
27654 Op.getOperand(1));
27655 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
27656 Op.getOperand(1));
27657 return DAG.getMergeValues({Even, Odd}, DL);
27658}
27659
27660SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
27661 SelectionDAG &DAG) const {
27662 SDLoc DL(Op);
27663 EVT OpVT = Op.getValueType();
27664 assert(OpVT.isScalableVector() &&
27665 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
27666
27667 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
27668 Op.getOperand(1));
27669 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
27670 Op.getOperand(1));
27671 return DAG.getMergeValues({Lo, Hi}, DL);
27672}
27673
27674SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
27675 SelectionDAG &DAG) const {
27676 // FIXME: Maybe share some code with LowerMGather/Scatter?
27677 MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
27678 SDLoc DL(HG);
27679 SDValue Chain = HG->getChain();
27680 SDValue Inc = HG->getInc();
27681 SDValue Mask = HG->getMask();
27682 SDValue Ptr = HG->getBasePtr();
27683 SDValue Index = HG->getIndex();
27684 SDValue Scale = HG->getScale();
27685 SDValue IntID = HG->getIntID();
27686
27687 // The Intrinsic ID determines the type of update operation.
27688 [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
27689 // Right now, we only support 'add' as an update.
27690 assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
27691 "Unexpected histogram update operation");
27692
27693 EVT IncVT = Inc.getValueType();
27694 EVT IndexVT = Index.getValueType();
27695 EVT MemVT = EVT::getVectorVT(*DAG.getContext(), IncVT,
27696 IndexVT.getVectorElementCount());
27697 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
27698 SDValue PassThru = DAG.getSplatVector(MemVT, DL, Zero);
27699 SDValue IncSplat = DAG.getSplatVector(MemVT, DL, Inc);
27700 SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
27701
27702 MachineMemOperand *MMO = HG->getMemOperand();
27703 // Create an MMO for the gather, without load|store flags.
27704 MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
27705 MMO->getPointerInfo(), MachineMemOperand::MOLoad, MMO->getSize(),
27706 MMO->getAlign(), MMO->getAAInfo());
27707 ISD::MemIndexType IndexType = HG->getIndexType();
27708 SDValue Gather =
27709 DAG.getMaskedGather(DAG.getVTList(MemVT, MVT::Other), MemVT, DL, Ops,
27710 GMMO, IndexType, ISD::NON_EXTLOAD);
27711
27712 SDValue GChain = Gather.getValue(1);
27713
27714 // Perform the histcnt, multiply by inc, add to bucket data.
27715 SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncVT);
27716 SDValue HistCnt =
27717 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
27718 SDValue Mul = DAG.getNode(ISD::MUL, DL, MemVT, HistCnt, IncSplat);
27719 SDValue Add = DAG.getNode(ISD::ADD, DL, MemVT, Gather, Mul);
27720
27721 // Create an MMO for the scatter, without load|store flags.
27722 MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
27723 MMO->getPointerInfo(), MachineMemOperand::MOStore, MMO->getSize(),
27724 MMO->getAlign(), MMO->getAAInfo());
27725
27726 SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
27727 SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
27728 ScatterOps, SMMO, IndexType, false);
27729 return Scatter;
27730}
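// Rough sketch of the expansion above (illustrative pseudocode only):
//   buckets = masked.gather(Ptr, Index, Scale, Mask, passthru = 0)
//   counts  = @llvm.aarch64.sve.histcnt(Mask, Index, Index)
//   updated = buckets + counts * splat(Inc)
//   masked.scatter(updated, Ptr, Index, Scale, Mask)
// i.e. each bucket ends up incremented by Inc once per occurrence of its
// index in the Index vector.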
27731
27732SDValue
27733AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
27734 SelectionDAG &DAG) const {
27735 EVT VT = Op.getValueType();
27736 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27737
27738 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
27739 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
27740 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
27741
27742 SDLoc DL(Op);
27743 SDValue Val = Op.getOperand(0);
27744 EVT SrcVT = Val.getValueType();
27745 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27746 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27747
27748 if (VT.bitsGT(SrcVT)) {
27749 EVT CvtVT = ContainerDstVT.changeVectorElementType(
27750 ContainerSrcVT.getVectorElementType());
27751 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27752
27753 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27754 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
27755
27756 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
27757 Val = getSVESafeBitCast(CvtVT, Val, DAG);
27758 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27759 DAG.getUNDEF(ContainerDstVT));
27760 return convertFromScalableVector(DAG, VT, Val);
27761 } else {
27762 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
27763 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
27764
27765 // Safe to use a larger than specified result since an fp_to_int where the
27766 // result doesn't fit into the destination is undefined.
27767 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27768 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27769 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27770
27771 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
27772 }
27773}
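// Illustrative example (not from the original source): for a v4f64 -> v4i32
// fp_to_sint the destination is the narrower type, so the else-branch above
// performs a predicated FCVTZS at the 64-bit source element width and only
// truncates the fixed-length result down to v4i32 afterwards, relying on
// out-of-range fp_to_int results being undefined.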
27774
27775 static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
27776 ArrayRef<int> ShuffleMask, EVT VT,
27777 EVT ContainerVT, SelectionDAG &DAG) {
27778 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
27779 SDLoc DL(Op);
27780 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
27781 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
27782 bool IsSingleOp =
27783 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
27784
27785 if (!Subtarget.isNeonAvailable() && !MinSVESize)
27786 MinSVESize = 128;
27787
27788 // Ignore the two-operand case if there is no SVE2 or if not all index
27789 // values can be represented.
27790 if (!IsSingleOp && !Subtarget.hasSVE2())
27791 return SDValue();
27792
27793 EVT VTOp1 = Op.getOperand(0).getValueType();
27794 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
27795 unsigned IndexLen = MinSVESize / BitsPerElt;
27796 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
27797 uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
27798 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
27799 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
27800 bool MinMaxEqual = (MinSVESize == MaxSVESize);
27801 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
27802 "Incorrectly legalised shuffle operation");
27803
27804 SmallVector<SDValue, 8> TBLMask;
27805 // If MinSVESize is not equal to MaxSVESize then we need to know which
27806 // TBL mask element needs adjustment.
27807 SmallVector<SDValue, 8> AddRuntimeVLMask;
27808
27809 // Bail out for 8-bit element types, because with a 2048-bit SVE register
27810 // size, 8 bits are only sufficient to index into the first source vector.
27811 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
27812 return SDValue();
27813
27814 for (int Index : ShuffleMask) {
27815 // Handling poison index value.
27816 if (Index < 0)
27817 Index = 0;
27818 // If the mask refers to elements in the second operand, then we have to
27819 // offset the index by the number of elements in a vector. If this number
27820 // is not known at compile-time, we need to maintain a mask with 'VL' values
27821 // to add at runtime.
27822 if ((unsigned)Index >= ElementsPerVectorReg) {
27823 if (MinMaxEqual) {
27824 Index += IndexLen - ElementsPerVectorReg;
27825 } else {
27826 Index = Index - ElementsPerVectorReg;
27827 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
27828 }
27829 } else if (!MinMaxEqual)
27830 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27831 // For 8-bit elements and a 1024-bit SVE register size, MaxOffset equals
27832 // 255 and may point at the last element of the second operand of the
27833 // shufflevector, so we reject the transform in that case.
27834 if ((unsigned)Index >= MaxOffset)
27835 return SDValue();
27836 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
27837 }
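// Worked example (illustrative): a v4i32 shuffle mask <0,5,2,7> drawing from
// both sources, with MinSVESize == MaxSVESize == 256, gives IndexLen = 8 and
// ElementsPerVectorReg = 4. The second-source indices 5 and 7 are rebased by
// IndexLen - ElementsPerVectorReg = 4, yielding the TBL mask <0,9,2,11> over
// the concatenation Op1:Op2.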
27838
27839 // Pad the trailing lanes with an out-of-range index so that they are zeroed,
27840 // rather than with index zero, which would duplicate the first source lane
27841 // into those elements. Note that for i8 elements an out-of-range index can
27842 // still be a valid index at a 2048-bit vector register size.
27843 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
27844 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
27845 if (!MinMaxEqual)
27846 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27847 }
27848
27849 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
27850 SDValue VecMask =
27851 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27852 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
27853
27854 SDValue Shuffle;
27855 if (IsSingleOp)
27856 Shuffle =
27857 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27858 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
27859 Op1, SVEMask);
27860 else if (Subtarget.hasSVE2()) {
27861 if (!MinMaxEqual) {
27862 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
27863 SDValue VScale = (BitsPerElt == 64)
27864 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
27865 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
27866 SDValue VecMask =
27867 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27868 SDValue MulByMask = DAG.getNode(
27869 ISD::MUL, DL, MaskType,
27870 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
27871 DAG.getBuildVector(MaskType, DL,
27872 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
27873 SDValue UpdatedVecMask =
27874 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
27875 SVEMask = convertToScalableVector(
27876 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
27877 }
27878 Shuffle =
27879 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27880 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
27881 Op1, Op2, SVEMask);
27882 }
27883 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
27884 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
27885}
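// For illustration: the single-source case above becomes a call to the
// @llvm.aarch64.sve.tbl intrinsic with the mask built earlier, while the
// two-source case (SVE2 only) uses @llvm.aarch64.sve.tbl2 on Op1/Op2, adding
// a vscale-based correction to the mask when the exact register size is not
// known at compile time.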
27886
27887SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
27888 SDValue Op, SelectionDAG &DAG) const {
27889 EVT VT = Op.getValueType();
27890 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27891
27892 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
27893 auto ShuffleMask = SVN->getMask();
27894
27895 SDLoc DL(Op);
27896 SDValue Op1 = Op.getOperand(0);
27897 SDValue Op2 = Op.getOperand(1);
27898
27899 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27900 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
27901 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
27902
27903 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
27904 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
27905 return MVT::i32;
27906 return ScalarTy;
27907 };
27908
27909 if (SVN->isSplat()) {
27910 unsigned Lane = std::max(0, SVN->getSplatIndex());
27911 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27912 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27913 DAG.getConstant(Lane, DL, MVT::i64));
27914 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
27915 return convertFromScalableVector(DAG, VT, Op);
27916 }
27917
27918 bool ReverseEXT = false;
27919 unsigned Imm;
27920 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
27921 Imm == VT.getVectorNumElements() - 1) {
27922 if (ReverseEXT)
27923 std::swap(Op1, Op2);
27924 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27925 SDValue Scalar = DAG.getNode(
27926 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27927 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
27928 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
27929 return convertFromScalableVector(DAG, VT, Op);
27930 }
27931
27932 unsigned EltSize = VT.getScalarSizeInBits();
27933 for (unsigned LaneSize : {64U, 32U, 16U}) {
27934 if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), LaneSize)) {
27935 EVT NewVT =
27936 getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
27937 unsigned RevOp;
27938 if (EltSize == 8)
27939 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
27940 else if (EltSize == 16)
27941 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
27942 else
27943 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
27944
27945 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27946 Op = LowerToPredicatedOp(Op, DAG, RevOp);
27947 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27948 return convertFromScalableVector(DAG, VT, Op);
27949 }
27950 }
27951
27952 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
27953 isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
27954 if (!VT.isFloatingPoint())
27955 return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27956
27957 EVT NewVT = getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), 64));
27958 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27959 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27960 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27961 return convertFromScalableVector(DAG, VT, Op);
27962 }
27963
27964 unsigned WhichResult;
27965 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
27966 WhichResult == 0)
27967 return convertFromScalableVector(
27968 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
27969
27970 if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
27971 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27972 return convertFromScalableVector(
27973 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
27974 }
27975
27976 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
27977 return convertFromScalableVector(
27978 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
27979
27980 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
27981 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27982 return convertFromScalableVector(
27983 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
27984 }
27985
27986 // Functions like isZIPMask return true when a ISD::VECTOR_SHUFFLE's mask
27987 // represents the same logical operation as performed by a ZIP instruction. In
27988 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
27989 // equivalent to an AArch64 instruction. There's the extra component of
27990 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
27991 // only operated on 64/128bit vector types that have a direct mapping to a
27992 // target register and so an exact mapping is implied.
27993 // However, when using SVE for fixed length vectors, most legal vector types
27994 // are actually sub-vectors of a larger SVE register. When mapping
27995 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
27996 // how the mask's indices translate. Specifically, when the mapping requires
27997 // an exact meaning for a specific vector index (e.g. Index X is the last
27998 // vector element in the register) then such mappings are often only safe when
27999 // the exact SVE register size is known. The main exception to this is when
28000 // indices are logically relative to the first element of either
28001 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
28002 // when converting from fixed-length to scalable vector types (i.e. the start
28003 // of a fixed length vector is always the start of a scalable vector).
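// For example (illustrative): with v8i16 lowered into a 256-bit SVE register,
// "the last fixed-length element" is lane 7, not the last lane of the
// register (lane 15), so the register-relative mappings below (reverse, ZIP2,
// UZP and friends) are only attempted when MinSVESize == MaxSVESize ==
// VT.getSizeInBits().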
28004 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
28005 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
28006 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
28007 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
28008 Op2.isUndef()) {
28009 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
28010 return convertFromScalableVector(DAG, VT, Op);
28011 }
28012
28013 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
28014 WhichResult != 0)
28015 return convertFromScalableVector(
28016 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
28017
28018 if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
28019 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
28020 return convertFromScalableVector(
28021 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
28022 }
28023
28024 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
28025 return convertFromScalableVector(
28026 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
28027
28028 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
28029 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
28030 return convertFromScalableVector(
28031 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
28032 }
28033 }
28034
28035 // Avoid producing a TBL instruction if we don't know the minimal SVE register
28036 // size, unless NEON is not available and we can assume the minimal SVE
28037 // register size is 128 bits.
28038 if (MinSVESize || !Subtarget->isNeonAvailable())
28039 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
28040 DAG);
28041
28042 return SDValue();
28043}
28044
28045SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
28046 SelectionDAG &DAG) const {
28047 SDLoc DL(Op);
28048 EVT InVT = Op.getValueType();
28049
28050 assert(VT.isScalableVector() && isTypeLegal(VT) &&
28051 InVT.isScalableVector() && isTypeLegal(InVT) &&
28052 "Only expect to cast between legal scalable vector types!");
28053 assert(VT.getVectorElementType() != MVT::i1 &&
28054 InVT.getVectorElementType() != MVT::i1 &&
28055 "For predicate bitcasts, use getSVEPredicateBitCast");
28056
28057 if (InVT == VT)
28058 return Op;
28059
28060 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
28061 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
28062
28063 // Safe bitcasting between unpacked vector types of different element counts
28064 // is currently unsupported because the following is missing the necessary
28065 // work to ensure the result's elements live where they're supposed to within
28066 // an SVE register.
28067 // 01234567
28068 // e.g. nxv2i32 = XX??XX??
28069 // nxv4f16 = X?X?X?X?
28070 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
28071 VT == PackedVT || InVT == PackedInVT) &&
28072 "Unexpected bitcast!");
28073
28074 // Pack input if required.
28075 if (InVT != PackedInVT)
28076 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
28077
28078 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
28079
28080 // Unpack result if required.
28081 if (VT != PackedVT)
28082 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
28083
28084 return Op;
28085}
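// Illustrative example: casting nxv2i64 to the unpacked type nxv2f32 goes via
// the packed equivalent, i.e.
//   nxv2i64 --(ISD::BITCAST)--> nxv4f32 --(REINTERPRET_CAST)--> nxv2f32
// leaving one f32 element per 64-bit container.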
28086
28087 bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
28088 SDValue N) const {
28089 return ::isAllActivePredicate(DAG, N);
28090}
28091
28092 EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
28093 return ::getPromotedVTForPredicate(VT);
28094}
28095
28096bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
28097 SDValue Op, const APInt &OriginalDemandedBits,
28098 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
28099 unsigned Depth) const {
28100
28101 unsigned Opc = Op.getOpcode();
28102 switch (Opc) {
28103 case AArch64ISD::VSHL: {
28104 // Match (VSHL (VLSHR Val X) X)
28105 SDValue ShiftL = Op;
28106 SDValue ShiftR = Op->getOperand(0);
28107 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
28108 return false;
28109
28110 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
28111 return false;
28112
28113 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
28114 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
28115
28116 // Other cases can be handled as well, but this is not
28117 // implemented.
28118 if (ShiftRBits != ShiftLBits)
28119 return false;
28120
28121 unsigned ScalarSize = Op.getScalarValueSizeInBits();
28122 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
28123
28124 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
28125 APInt UnusedBits = ~OriginalDemandedBits;
28126
28127 if ((ZeroBits & UnusedBits) != ZeroBits)
28128 return false;
28129
28130 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
28131 // used - simplify to just Val.
28132 return TLO.CombineTo(Op, ShiftR->getOperand(0));
28133 }
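// Example of the fold above (illustrative): for v4i32, (VSHL (VLSHR x, 8), 8)
// only clears the low 8 bits of each lane, so if the caller demands none of
// bits 0-7 (e.g. DemandedBits = 0xFFFFFF00) the shift pair is replaced by x.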
28134 case AArch64ISD::BICi: {
28135 // Fold BICi if all destination bits already known to be zeroed
28136 SDValue Op0 = Op.getOperand(0);
28137 KnownBits KnownOp0 =
28138 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
28139 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
28140 uint64_t BitsToClear = Op->getConstantOperandVal(1)
28141 << Op->getConstantOperandVal(2);
28142 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
28143 if (APInt(Known.getBitWidth(), BitsToClear)
28144 .isSubsetOf(AlreadyZeroedBitsToClear))
28145 return TLO.CombineTo(Op, Op0);
28146
28147 Known = KnownOp0 &
28148 KnownBits::makeConstant(APInt(Known.getBitWidth(), ~BitsToClear));
28149
28150 return false;
28151 }
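// Example (illustrative): BICi computes Op0 & ~(Imm << Shift). With Imm = 0xFF
// and Shift = 8, if Op0 is already known to have bits 8-15 clear the node is
// replaced by Op0; otherwise those bits are simply added to the known-zero
// set for the users of this node.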
28152 case ISD::INTRINSIC_WO_CHAIN: {
28153 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
28154 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
28155 if (!MaxSVEVectorSizeInBits)
28156 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
28157 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
28158 // The SVE count intrinsics don't support the multiplier immediate so we
28159 // don't have to account for that here. The value returned may be slightly
28160 // over the true required bits, as this is based on the "ALL" pattern. The
28161 // other patterns are also exposed by these intrinsics, but they all
28162 // return a value that's strictly less than "ALL".
28163 unsigned RequiredBits = llvm::bit_width(MaxElements);
28164 unsigned BitWidth = Known.Zero.getBitWidth();
28165 if (RequiredBits < BitWidth)
28166 Known.Zero.setHighBits(BitWidth - RequiredBits);
28167 return false;
28168 }
28169 }
28170 }
28171
28172 return TargetLowering::SimplifyDemandedBitsForTargetNode(
28173 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
28174}
28175
28176bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
28177 return Op.getOpcode() == AArch64ISD::DUP ||
28178 Op.getOpcode() == AArch64ISD::MOVI ||
28179 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
28180 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
28181 TargetLowering::isTargetCanonicalConstantNode(Op);
28182}
28183
28184 bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
28185 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
28186 Subtarget->hasComplxNum();
28187}
28188
28189 bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
28190 ComplexDeinterleavingOperation Operation, Type *Ty) const {
28191 auto *VTy = dyn_cast<VectorType>(Ty);
28192 if (!VTy)
28193 return false;
28194
28195 // If the vector is scalable, SVE is enabled, implying support for complex
28196 // numbers. Otherwise, we need to ensure complex number support is available
28197 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
28198 return false;
28199
28200 auto *ScalarTy = VTy->getScalarType();
28201 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
28202
28203 // We can only process vectors that have a bit size of 128 or higher (with an
28204 // additional 64 bits for Neon). Additionally, these vectors must have a
28205 // power-of-2 size, as we later split them into the smallest supported size
28206 // and merge them back together after applying the complex operation.
28207 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
28208 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
28209 !llvm::isPowerOf2_32(VTyWidth))
28210 return false;
28211
28212 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
28213 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
28214 return 8 <= ScalarWidth && ScalarWidth <= 64;
28215 }
28216
28217 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
28218 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
28219}
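// Some concrete cases accepted by the checks above (illustrative):
//   - <vscale x 4 x float> with SVE, or <4 x float>/<2 x float> with NEON and
//     the complex-number (FCMA) extension,
//   - <vscale x 8 x i16> only with SVE2 (integer complex ops require SVE2),
// whereas a non-power-of-2 width such as <3 x float> is rejected.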
28220
28221 Value *AArch64TargetLowering::createComplexDeinterleavingIR(
28222 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
28223 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
28224 Value *Accumulator) const {
28225 VectorType *Ty = cast<VectorType>(InputA->getType());
28226 bool IsScalable = Ty->isScalableTy();
28227 bool IsInt = Ty->getElementType()->isIntegerTy();
28228
28229 unsigned TyWidth =
28230 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
28231
28232 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
28233 "Vector type must be either 64 or a power of 2 that is at least 128");
28234
28235 if (TyWidth > 128) {
28236 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
28237 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
28238 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
28239 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
28240 auto *UpperSplitA =
28241 B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
28242 auto *UpperSplitB =
28243 B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
28244 Value *LowerSplitAcc = nullptr;
28245 Value *UpperSplitAcc = nullptr;
28246 if (Accumulator) {
28247 LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
28248 UpperSplitAcc =
28249 B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
28250 }
28251 auto *LowerSplitInt = createComplexDeinterleavingIR(
28252 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
28253 auto *UpperSplitInt = createComplexDeinterleavingIR(
28254 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
28255
28256 auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
28257 B.getInt64(0));
28258 return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
28259 }
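// For illustration: a <vscale x 8 x float> request (at least 256 bits) is
// split here into two <vscale x 4 x float> halves with vector extracts, the
// operation is emitted recursively for each half, and the results are
// recombined with vector inserts at offsets 0 and Stride (= 4).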
28260
28261 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
28262 if (Accumulator == nullptr)
28263 Accumulator = Constant::getNullValue(Ty);
28264
28265 if (IsScalable) {
28266 if (IsInt)
28267 return B.CreateIntrinsic(
28268 Intrinsic::aarch64_sve_cmla_x, Ty,
28269 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
28270
28271 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
28272 return B.CreateIntrinsic(
28273 Intrinsic::aarch64_sve_fcmla, Ty,
28274 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
28275 }
28276
28277 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
28278 Intrinsic::aarch64_neon_vcmla_rot90,
28279 Intrinsic::aarch64_neon_vcmla_rot180,
28280 Intrinsic::aarch64_neon_vcmla_rot270};
28281
28282
28283 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
28284 {Accumulator, InputA, InputB});
28285 }
28286
28287 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
28288 if (IsScalable) {
28289 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
28290 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
28291 if (IsInt)
28292 return B.CreateIntrinsic(
28293 Intrinsic::aarch64_sve_cadd_x, Ty,
28294 {InputA, InputB, B.getInt32((int)Rotation * 90)});
28295
28296 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
28297 return B.CreateIntrinsic(
28298 Intrinsic::aarch64_sve_fcadd, Ty,
28299 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
28300 }
28301 return nullptr;
28302 }
28303
28304 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
28305 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
28306 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
28307 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
28308 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
28309
28310 if (IntId == Intrinsic::not_intrinsic)
28311 return nullptr;
28312
28313 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
28314 }
28315
28316 return nullptr;
28317}
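// Example of the emitted IR (illustrative, scalable FP partial multiply with
// a 270-degree rotation):
//   %r = call <vscale x 4 x float> @llvm.aarch64.sve.fcmla.nxv4f32(
//            <vscale x 4 x i1> %allactive, <vscale x 4 x float> %acc,
//            <vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 270)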
28318
28319bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
28320 unsigned Opc = N->getOpcode();
28321 if (ISD::isExtOpcode(Opc)) {
28322 if (any_of(N->uses(),
28323 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
28324 return false;
28325 }
28326 return true;
28327}
28328
28329unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
28330 return Subtarget->getMinimumJumpTableEntries();
28331}
28332
28333 MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
28334 CallingConv::ID CC,
28335 EVT VT) const {
28336 bool NonUnitFixedLengthVector =
28337 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
28338 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
28339 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
28340
28341 EVT VT1;
28342 MVT RegisterVT;
28343 unsigned NumIntermediates;
28344 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
28345 RegisterVT);
28346 return RegisterVT;
28347}
28348
28349 unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
28350 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
28351 bool NonUnitFixedLengthVector =
28352 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
28353 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
28354 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
28355
28356 EVT VT1;
28357 MVT VT2;
28358 unsigned NumIntermediates;
28359 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
28360 NumIntermediates, VT2);
28361}
28362
28363 unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
28364 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
28365 unsigned &NumIntermediates, MVT &RegisterVT) const {
28366 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
28367 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
28368 if (!RegisterVT.isFixedLengthVector() ||
28369 RegisterVT.getFixedSizeInBits() <= 128)
28370 return NumRegs;
28371
28372 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
28373 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
28374 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
28375
28376 // A size mismatch here implies either type promotion or widening and would
28377 // have resulted in scalarisation if larger vectors had not been available.
28378 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
28379 EVT EltTy = VT.getVectorElementType();
28380 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
28381 if (!isTypeLegal(NewVT))
28382 NewVT = EltTy;
28383
28384 IntermediateVT = NewVT;
28385 NumIntermediates = VT.getVectorNumElements();
28386 RegisterVT = getRegisterType(Context, NewVT);
28387 return NumIntermediates;
28388 }
28389
28390 // SVE VLS support does not introduce a new ABI so we should use NEON sized
28391 // types for vector arguments and returns.
28392
28393 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
28394 NumIntermediates *= NumSubRegs;
28395 NumRegs *= NumSubRegs;
28396
28397 switch (RegisterVT.getVectorElementType().SimpleTy) {
28398 default:
28399 llvm_unreachable("unexpected element type for vector");
28400 case MVT::i8:
28401 IntermediateVT = RegisterVT = MVT::v16i8;
28402 break;
28403 case MVT::i16:
28404 IntermediateVT = RegisterVT = MVT::v8i16;
28405 break;
28406 case MVT::i32:
28407 IntermediateVT = RegisterVT = MVT::v4i32;
28408 break;
28409 case MVT::i64:
28410 IntermediateVT = RegisterVT = MVT::v2i64;
28411 break;
28412 case MVT::f16:
28413 IntermediateVT = RegisterVT = MVT::v8f16;
28414 break;
28415 case MVT::f32:
28416 IntermediateVT = RegisterVT = MVT::v4f32;
28417 break;
28418 case MVT::f64:
28419 IntermediateVT = RegisterVT = MVT::v2f64;
28420 break;
28421 case MVT::bf16:
28422 IntermediateVT = RegisterVT = MVT::v8bf16;
28423 break;
28424 }
28425
28426 return NumRegs;
28427}
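// Illustrative example: with 512-bit SVE fixed-length vectors enabled, a
// v16i32 argument initially breaks down to a single 512-bit register; the
// code above re-splits it into NumSubRegs = 4 NEON-sized pieces, returning
// IntermediateVT = RegisterVT = v4i32 and NumRegs = 4 so the calling
// convention still operates on 128-bit units.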
28428
28429 bool AArch64TargetLowering::hasInlineStackProbe(
28430 const MachineFunction &MF) const {
28431 return !Subtarget->isTargetWindows() &&
28432 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
28433}
28434
28435#ifndef NDEBUG
28436 void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
28437 switch (N->getOpcode()) {
28438 default:
28439 break;
28440 case AArch64ISD::SUNPKLO:
28441 case AArch64ISD::SUNPKHI:
28442 case AArch64ISD::UUNPKLO:
28443 case AArch64ISD::UUNPKHI: {
28444 assert(N->getNumValues() == 1 && "Expected one result!");
28445 assert(N->getNumOperands() == 1 && "Expected one operand!");
28446 EVT VT = N->getValueType(0);
28447 EVT OpVT = N->getOperand(0).getValueType();
28448 assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
28449 VT.isInteger() && "Expected integer vectors!");
28450 assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
28451 "Expected vectors of equal size!");
28452 // TODO: Enable assert once bogus creations have been fixed.
28453 // assert(OpVT.getVectorElementCount() == VT.getVectorElementCount()*2 &&
28454 // "Expected result vector with half the lanes of its input!");
28455 break;
28456 }
28457 case AArch64ISD::TRN1:
28458 case AArch64ISD::TRN2:
28459 case AArch64ISD::UZP1:
28460 case AArch64ISD::UZP2:
28461 case AArch64ISD::ZIP1:
28462 case AArch64ISD::ZIP2: {
28463 assert(N->getNumValues() == 1 && "Expected one result!");
28464 assert(N->getNumOperands() == 2 && "Expected two operands!");
28465 EVT VT = N->getValueType(0);
28466 EVT Op0VT = N->getOperand(0).getValueType();
28467 EVT Op1VT = N->getOperand(1).getValueType();
28468 assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
28469 "Expected vectors!");
28470 // TODO: Enable assert once bogus creations have been fixed.
28471 // assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
28472 break;
28473 }
28474 }
28475}
28476#endif
unsigned const MachineRegisterInfo * MRI
static MCRegister MatchRegisterName(StringRef Name)
static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, uint64_t &Imm)
static bool isIntImmediate(const SDNode *N, uint64_t &Imm)
isIntImmediate - This method tests to see if the node is a constant operand.
static void CustomNonLegalBITCASTResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, EVT ExtendVT, EVT CastVT)
static bool isConcatMask(ArrayRef< int > Mask, EVT VT, bool SplitLHS)
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG)
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG)
can be transformed to: not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) (and (not (setCA (cmp A))...
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert)
changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC usable with the vector...
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isSingletonEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG)
static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex)
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG)
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.
static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG)
static SDValue performLastTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue GenerateTBL(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performDUPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< PredicateConstraint > parsePredicateConstraint(StringRef Constraint)
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static void analyzeCallOperands(const AArch64TargetLowering &TLI, const AArch64Subtarget *Subtarget, const TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo)
static std::optional< unsigned > IsSVECntIntrinsic(SDValue S)
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo)
Check whether or not Op is a SET_CC operation, either a generic or an AArch64 lowered one.
static bool isLegalArithImmed(uint64_t C)
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT)
static ScalableVectorType * getSVEContainerIRType(FixedVectorType *VTy)
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG)
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend)
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG)
static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG)
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static bool isZeroingInactiveLanes(SDValue Op)
static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG)
static SDValue tryCombineMULLWithUZP1(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG)
static bool isZerosVector(const SDNode *N)
isZerosVector - Check whether SDNode N is a zero-filled vector.
static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth=0)
static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG)
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG)
Get rid of unnecessary NVCASTs (that don't change the type).
static const TargetRegisterClass * getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT)
static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, bool Invert)
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, SDLoc DL, unsigned BitWidth)
static bool isPredicateCCSettingOp(SDValue N)
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG)
static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType)
static SDValue performSVEAndCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG)
static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, ArrayRef< int > ShuffleMask, EVT VT, EVT ContainerVT, SelectionDAG &DAG)
static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static MVT getSVEContainerType(EVT ContentTy)
static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG)
static bool isMergePassthruOpcode(unsigned Opc)
static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG, SDLoc DL, bool &IsMLA)
static SDValue performFADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Target-specific DAG combine function for NEON load/store intrinsics to merge base address updates.
static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG)
static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG)
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget, const AArch64TargetLowering &TLI)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with the compare-mask instruct...
static bool isCheapToExtend(const SDValue &N)
static cl::opt< bool > EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true))
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG)
static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, unsigned ScalarSizeInBytes)
Check if the value of OffsetInBytes can be used as an immediate for the gather load/prefetch and scat...
static bool isUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of "vector_shuffle v,...
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
#define LCALLNAME4(A, B)
static unsigned getDUPLANEOp(EVT EltType)
static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM)
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT, SelectionDAG &DAG, unsigned &ShiftValue, SDValue &RShOperand)
static bool isExtendOrShiftOperand(SDValue N)
static bool isLanes1toNKnownZero(SDValue Op)
static bool setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI)
Set the IntrinsicInfo for the aarch64_sve_st<N> intrinsics.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG)
static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, SelectionDAG &DAG)
static std::tuple< SDValue, SDValue > extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG)
static EVT getPackedSVEVectorVT(EVT VT)
static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performFlagSettingCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned GenericOpcode)
static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performCSELCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static void ReplaceReductionResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp)
static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, int AddConstant, int CompConstant)
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static bool isCMP(SDValue Op)
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static Function * getStructuredLoadFunction(Module *M, unsigned Factor, bool Scalable, Type *LDVTy, Type *PtrTy)
static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG)
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, SelectionDAG &DAG, bool UnpredOp=false, bool SwapOperands=false)
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad)
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate)
Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain of CCMP/CFCMP ops.
static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated)
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian)
static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, unsigned Opcode, SelectionDAG &DAG)
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG)
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static bool isINSMask(ArrayRef< int > M, int NumInputElements, bool &DstIsLeft, int &Anomaly)
static const MCPhysReg GPRArgRegs[]
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits)
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG)
static SDValue performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPassedInFPR(EVT VT)
static unsigned getIntrinsicID(const SDNode *N)
static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert)
static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG)
static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG)
static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool canEmitConjunction(const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, unsigned Depth=0)
Returns true if Val is a tree of AND/OR/SETCC operations that can be expressed as a conjunction.
static bool isWideDUPMask(ArrayRef< int > M, EVT VT, unsigned BlockSize, unsigned &DupLaneOp)
Check if a vector shuffle corresponds to a DUP instructions with a larger element width than the vect...
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static cl::opt< bool > EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden, cl::desc("Combine ext and trunc to TBL"), cl::init(true))
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts)
static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue removeRedundantInsertVectorElt(SDNode *N)
static std::optional< AArch64CC::CondCode > getCSETCondCode(SDValue Op)
static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsEqual)
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG)
Legalize the gather prefetch (scalar + vector addressing mode) when the offset vector is an unpacked ...
static bool isNegatedInteger(SDValue Op)
static SDValue performFirstTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static bool isLoadOrMultipleLoads(SDValue B, SmallVector< LoadSDNode * > &Loads)
static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc)
static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16)
static Function * getStructuredStoreFunction(Module *M, unsigned Factor, bool Scalable, Type *STVTy, Type *PtrTy)
static SDValue performVectorShiftCombine(SDNode *N, const AArch64TargetLowering &TLI, TargetLowering::DAGCombinerInfo &DCI)
Optimize a vector shift instruction and its operand if shifted out bits are not used.
static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG)
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, unsigned ScalarSizeInBytes)
Combines a node carrying the intrinsic aarch64_sve_prf<T>_gather_scalar_offset into a node that uses ...
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of a scalar to a vector store by scalar stores of the scalar value.
unsigned getSignExtendedGatherOpcode(unsigned Opcode)
static bool isOrXorChain(SDValue N, unsigned &Num, SmallVector< std::pair< SDValue, SDValue >, 16 > &WorkList)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC)
changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 CC
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd)
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG)
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc)
static unsigned getCmpOperandFoldingProfit(SDValue Op)
Returns how profitable it is to fold a comparison's operand's shift and/or extension operations.
static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG)
#define MAKE_CASE(V)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG)
Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern into sext/zext(buildvecto...
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
static Value * createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op, FixedVectorType *ZExtTy, FixedVectorType *DstTy, bool IsLittleEndian)
static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG)
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG)
static const MCPhysReg FPRArgRegs[]
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static void replaceBoolVectorBitcast(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG)
static SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern)
static bool isEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseEXT, unsigned &Imm)
static std::optional< ReducedGprConstraint > parseReducedGprConstraint(StringRef Constraint)
static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue performSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG)
Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) making use of the vector SExt/ZE...
static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point multiply by power of two into floating-point to fixed-point conversion.
static EVT calculatePreExtendType(SDValue Extend)
Calculates what the pre-extend type is, based on the extension operation node provided by Extend.
static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG)
static EVT getPromotedVTForPredicate(EVT VT)
static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
Convert a DAG fp condition code to an AArch64 CC.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG)
static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal)
static SDValue performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG)
static Value * UseTlsOffset(IRBuilderBase &IRB, unsigned Offset)
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG)
WidenVector - Given a value in the V64 register class, produce the equivalent value in the V128 regis...
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, AArch64CC::CondCode Cond)
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info)
cl::opt< bool > EnableAArch64ELFLocalDynamicTLSGeneration("aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false))
static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG)
static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG)
static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG)
static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG)
static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG)
static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode, AtomicOrdering Ordering)
static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
cl::opt< bool > EnableSVEGISel("aarch64-enable-gisel-sve", cl::Hidden, cl::desc("Enable / disable SVE scalable vectors in Global ISel"), cl::init(false))
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp)
Target-specific DAG combine function for post-increment LD1 (lane) and post-increment LD1R.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
std::pair< SDValue, uint64_t > lookThroughSignExtension(SDValue Val)
bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL)
static SDValue performMSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG)
static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, SDLoc DL, SelectionDAG &DAG)
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG)
static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG)
static bool isSplatShuffle(Value *V)
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT)
static SDValue performAddCombineForShiftedOperands(SDNode *N, SelectionDAG &DAG)
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V)
static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, bool IsSigned)
static bool isPackedVectorType(EVT VT, SelectionDAG &DAG)
Returns true if VT's elements occupy the lowest bit positions of its associated register class withou...
static bool isTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of "vector_shuffle v,...
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static SDValue tryCombineWhileLo(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMaskedGatherScatterCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG)
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, bool IsSignaling)
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performBuildVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64TargetLowering &TLI)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG)
static unsigned getSMCondition(const SMEAttrs &CallerAttrs, const SMEAttrs &CalleeAttrs)
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static bool isCMN(SDValue Op, ISD::CondCode CC)
static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps)
static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG)
static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG)
Perform the scalar expression combine in the form of: CSEL(c, 1, cc) + b => CSINC(b+c,...
static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static std::optional< uint64_t > getConstantLaneNumOfExtractHalfOperand(SDValue &Op)
static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, SelectionDAG &DAG, unsigned &NumSubLoads)
static SDValue performLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, StoreSDNode *Store)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static cl::opt< unsigned > MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, cl::desc("Maximum of xors"))
#define LCALLNAME5(A, B)
static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performMULLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static const MVT MVT_CC
Value type used for condition codes.
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performReinterpretCastCombine(SDNode *N)
SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N)
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &dl)
static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Simplify Addr given that the top byte of it is ignored by HW during address translation.
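For context, TBI lets software keep a tag in bits 56-63 of a pointer while the hardware ignores that byte during translation, so an explicit mask that only clears the top byte of an address is redundant. A minimal sketch of such a tagged pointer, using a hypothetical tagPointer helper (illustrative, not part of this file):
  #include <cstdint>
  // Store a small tag in the ignored top byte. With TBI enabled, dereferencing
  // the tagged pointer works without first masking with 0x00ffffffffffffff,
  // which is exactly the redundant masking this simplification removes.
  inline void *tagPointer(void *P, uint8_t Tag) {
    uint64_t Bits = reinterpret_cast<uint64_t>(P);
    Bits = (Bits & 0x00ffffffffffffffULL) | (static_cast<uint64_t>(Tag) << 56);
    return reinterpret_cast<void *>(Bits);
  }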
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG)
static bool isAllInactivePredicate(SDValue N)
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDLoc DL, SelectionDAG &DAG)
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth, unsigned NumElts, bool IsLittleEndian, SmallVectorImpl< int > &Mask)
static bool isWideTypeMask(ArrayRef< int > M, EVT VT, SmallVectorImpl< int > &NewMask)
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z, SelectionDAG &DAG)
static SDValue performANDSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static const TargetRegisterClass * getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, SDNode *AndNode, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex, unsigned CC)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
@ Generic
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static bool isConstant(const MachineInstr &MI)
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
@ Scaled
@ OP_VEXT3
@ OP_VTRNR
@ OP_VDUP1
@ OP_VZIPR
@ OP_VUZPR
@ OP_VREV
@ OP_VZIPL
@ OP_VTRNL
@ OP_COPY
@ OP_VEXT1
@ OP_VDUP0
@ OP_VEXT2
@ OP_VUZPL
@ OP_VDUP3
@ OP_VDUP2
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
return RetTy
Returns the sub type a function will return at a given Idx; this should correspond to the result type of an ExtractValue instruction executed with just that one Idx.
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
uint64_t Size
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
unsigned const TargetRegisterInfo * TRI
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
uint64_t IntrinsicInst * II
This file defines ARC utility functions which are used by various parts of the compiler.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallSet class.
This file defines the SmallVector class.
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
This defines the Use class.
static constexpr int Concat[]
Value * RHS
Value * LHS
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
void setVarArgsStackOffset(unsigned Offset)
void setTailCallReservedStack(unsigned bytes)
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setHasSwiftAsyncContext(bool HasContext)
void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym)
void setArgumentStackToRestore(unsigned bytes)
void setHasStreamingModeChanges(bool HasChanges)
bool isLegalAddressingMode(unsigned NumBytes, int64_t Offset, unsigned Scale) const
void UpdateCustomCalleeSavedRegs(MachineFunction &MF) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getMinimumJumpTableEntries() const
const AArch64InstrInfo * getInstrInfo() const override
const char * getSecurityCheckCookieName() const
unsigned getMaximumJumpTableSize() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned classifyGlobalFunctionReference(const GlobalValue *GV, const TargetMachine &TM) const
Align getPrefLoopAlignment() const
Align getPrefFunctionAlignment() const
unsigned getMaxBytesForLoopAlignment() const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
const Triple & getTargetTriple() const
bool isCallingConvWin64(CallingConv::ID CC) const
const char * getChkStkName() const
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
bool useSVEForFixedLengthVectors() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isStreaming() const
Returns true if the function has a streaming body.
bool isXRegisterReserved(size_t i) const
unsigned getMaxSVEVectorSizeInBits() const
unsigned getMinSVEVectorSizeInBits() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
bool hasCustomCallingConv() const
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
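As a rough illustration of what "free" means here (a sketch, not from this file): on AArch64 an i64-to-i32 truncation usually costs nothing, because the low 32 bits of an X register are directly readable as the corresponding W register.
  #include <cstdint>
  // Truncating a 64-bit value to 32 bits typically needs no extra instruction:
  // the consumer simply uses w0 where the producer defined x0.
  uint32_t truncateToWord(uint64_t X) { return static_cast<uint32_t>(X); }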
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
MachineBasicBlock * EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const override
Return the preferred common base offset.
bool shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert a trailing fence without reducing the ordering f...
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
MachineBasicBlock * EmitInitTPIDR2Object(MachineInstr &MI, MachineBasicBlock *BB) const
MachineBasicBlock * EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool shouldRemoveRedundantExtend(SDValue Op) const override
Return true (the default) if it is profitable to remove a sext_inreg(x) where the sext is redundant,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
bool optimizeExtendOrTruncateConversion(Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override
Try to optimize extending or truncating conversion instructions (like zext, trunc,...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool hasInlineStackProbe(const MachineFunction &MF) const override
True if stack clash protection is enabled for this function.
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is a legal icmp immediate, that is, the target has icmp instructions which can compare a register against the immediate without having to materialize the immediate into a register.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) and add (add x, 1), y. The variant with two adds is IR-canonical; some targets may prefer one form over the other.
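The equivalence follows from the two's-complement identity ~x = -x - 1, so y - (x ^ -1) = y + x + 1 = (x + 1) + y. A minimal self-check of that identity (illustrative only):
  #include <cassert>
  #include <cstdint>
  void checkIncAddIdentity(uint32_t X, uint32_t Y) {
    // sub y, (xor x, -1)  ==  add (add x, 1), y   (modulo 2^32)
    assert(static_cast<uint32_t>(Y - (X ^ UINT32_MAX)) ==
           static_cast<uint32_t>((X + 1) + Y));
  }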
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool isOpSuitableForLSE128(const Instruction *I) const
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower an interleaved load into a ldN intrinsic.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isLegalAddScalableImmediate(int64_t) const override
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
MachineBasicBlock * EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain of the form (and %val, #imm-with-one-bit-set) followed by (icmp eq %andResult, 0) into a single machine instruction that tests the register against the immediate.
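A typical source pattern that benefits (illustrative sketch, not from this file): masking with a single-bit constant and comparing against zero, which AArch64 can usually fold into one tst-style instruction.
  #include <cstdint>
  // (x & mask) == 0: an and feeding a compare-with-zero, the chain this hook
  // reports as profitable to fold into a single test of the register.
  bool bitClear(uint64_t X) { return (X & (1ULL << 4)) == 0; }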
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool isOpSuitableForRCPC3(const Instruction *I) const
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower an interleaved store into a stN intrinsic.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
MachineBasicBlock * EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const
MachineBasicBlock * EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
Control the following reassociation of operands: (op (op x, c1), y) -> (op (op x, y), c1).
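For an associative and commutative integer op the two shapes compute the same value; keeping the constant outermost simply exposes it to later folding. A trivial check (illustrative only):
  #include <cassert>
  #include <cstdint>
  void checkReassoc(uint64_t X, uint64_t Y) {
    const uint64_t C1 = 42;
    // (op (op x, c1), y) == (op (op x, y), c1) for op = integer add.
    assert((X + C1) + Y == (X + Y) + C1);
  }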
void verifyTargetSDNode(const SDNode *N) const override
Check the given SDNode. Aborts if it is invalid.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0, return true if it should be transformed into ((X <</l>> Y) & C) ==/!= 0.
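The rewrite is sound for equality-with-zero tests because shifting the constant right by Y and masking X selects exactly the same bit pairs as shifting X left by Y and masking with the original constant. A small brute-force check of the identity (standalone sketch, not from this file):
  #include <cassert>
  #include <cstdint>
  void checkShiftHoistIdentity() {
    for (uint32_t C : {0x80u, 0xF0u, 0x10101010u})
      for (uint32_t X = 0; X < 64; ++X)
        for (uint32_t Y = 0; Y < 32; ++Y)
          // (X & (C >> Y)) ==/!= 0  is equivalent to  ((X << Y) & C) ==/!= 0.
          assert(((X & (C >> Y)) == 0) == (((X << Y) & C) == 0));
  }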
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
MachineBasicBlock * EmitAllocateZABuffer(MachineInstr &MI, MachineBasicBlock *BB) const
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI, LoadInst *LI) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Value * getIRStackGuard(IRBuilderBase &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is a legal add immediate, that is, the target has add instructions which can add a register with the immediate without having to materialize the immediate into a register.
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II, StoreInst *SI) const override
Lower an interleave intrinsic to a target specific store intrinsic.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, SDValue PStateSM=SDValue()) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
Class for arbitrary precision integers.
Definition: APInt.h:77
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:213
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:428
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:208
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1499
static void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition: APInt.cpp:1860
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1371
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:350
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1447
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:188
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:308
APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1898
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1145
APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1905
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1597
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:198
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1718
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:806
bool isMask(unsigned numBits) const
Definition: APInt.h:467
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:313
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1236
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:419
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:285
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:275
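A hedged usage sketch of the two mask constructors listed above, assuming only llvm/ADT/APInt.h (illustrative, not taken from this file):
  #include "llvm/ADT/APInt.h"
  using llvm::APInt;
  // Build a 64-bit mask with the low 8 bits set, one with the high 16 bits set,
  // and combine them into a single edge mask.
  APInt makeEdgeMask() {
    APInt Low = APInt::getLowBitsSet(64, 8);    // 0x00000000000000ff
    APInt High = APInt::getHighBitsSet(64, 16); // 0xffff000000000000
    return Low | High;
  }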
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1216
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:368
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1521
an instruction to allocate memory on the stack
Definition: Instructions.h:60
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:494
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:695
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:725
@ Or
*p = old | v
Definition: Instructions.h:719
@ And
*p = old & v
Definition: Instructions.h:715
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:723
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:729
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:727
@ Nand
*p = ~(old & v)
Definition: Instructions.h:717
bool isFloatingPointOperation() const
Definition: Instructions.h:863
BinOp getOperation() const
Definition: Instructions.h:786
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:94
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:209
A "pseudo-class" with methods for operating on BUILD_VECTORs.
ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
unsigned arg_size() const
Definition: InstrTypes.h:1408
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1594
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:206
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:146
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1399
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:110
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:238
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:311
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:308
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:319
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * getInteger(FixedVectorType *VTy)
Definition: DerivedTypes.h:551
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:168
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:698
bool empty() const
Definition: Function.h:822
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:207
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:695
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:274
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1934
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:350
arg_iterator arg_end()
Definition: Function.h:840
arg_iterator arg_begin()
Definition: Function.h:831
size_t size() const
Definition: Function.h:821
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:690
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
Definition: GlobalValue.h:263
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:529
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition: Globals.cpp:124
Type * getValueType() const
Definition: GlobalValue.h:296
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:92
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2135
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1035
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2470
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1877
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2521
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1043
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:537
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2168
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Definition: IRBuilder.cpp:1192
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2514
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:464
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2065
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2120
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1435
ConstantInt * getInt8(uint8_t C)
Get a constant 8-bit value.
Definition: IRBuilder.h:474
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2079
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:172
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1864
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:489
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2125
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1414
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2019
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2492
LLVMContext & getContext() const
Definition: IRBuilder.h:174
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2115
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2005
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1495
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:567
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2410
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:514
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2664
This instruction inserts a single (scalar) element into a VectorType value.
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not belong to a module.
Definition: Instruction.cpp:66
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:74
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:55
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e. the number of whole bytes needed to represent the size in bits.
Definition: LowLevelType.h:203
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:173
Value * getPointerOperand()
Definition: Instructions.h:252
Type * getPointerOperandType() const
Definition: Instructions.h:255
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
@ INVALID_SIMPLE_VALUE_TYPE
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:230
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
MachineInstr * remove_instr(MachineInstr *I)
Remove the possibly bundled instruction from the instruction list without deleting it.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
int getStackProtectorIndex() const
Return the index for the stack protector object.
int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
void RemoveStackObject(int ObjectIdx)
Remove or mark dead a statically sized stack object.
int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
MachineOperand class - Representation of each machine instruction operand.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition: MapVector.h:60
This class is used to represent an MGATHER node.
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getIndex() const
const SDValue & getScale() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
const SDValue & getInc() const
const SDValue & getScale() const
const SDValue & getMask() const
const SDValue & getIntID() const
const SDValue & getIndex() const
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
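The MemSDNode accessors above are the usual guards consulted before touching a memory node. A minimal sketch of such a predicate (isSimpleMemNodeSketch is hypothetical, not a helper in this file):

static bool isSimpleMemNodeSketch(const MemSDNode *MemN) {
  // Volatile, atomic and non-temporal accesses are typically left alone.
  if (MemN->isVolatile() || MemN->isAtomic() || MemN->isNonTemporal())
    return false;
  // Require at least natural alignment for the in-memory type.
  return MemN->getAlign().value() >=
         MemN->getMemoryVT().getStoreSize().getKnownMinValue();
}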
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition: Module.cpp:701
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
void dump() const
Definition: Pass.cpp:136
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1814
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
void dump() const
Dump this node, for debugging.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Return true if this node is an UNDEF node.
void setFlags(SDNodeFlags NewFlags)
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
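The SDNode/SDValue accessors above are the building blocks of every pattern match in the combines and lowering routines. A minimal sketch of the idiom (isAddOfConstantSketch is a hypothetical helper):

static bool isAddOfConstantSketch(SDValue V, uint64_t &ImmOut) {
  // Match (add x, C) that has a single user.
  if (V.getOpcode() != ISD::ADD || !V.hasOneUse())
    return false;
  if (auto *C = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
    ImmOut = C->getZExtValue();
    return true;
  }
  return false;
}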
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasZAState() const
Class to represent scalable SIMD vectors.
Definition: DerivedTypes.h:586
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:713
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:227
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:736
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:488
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
SDNode * isConstantIntBuildVectorOrConstantInt(SDValue N) const
Test whether the given value is a constant int or similar node.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:492
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:746
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:842
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:486
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:494
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:673
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSplatVector(EVT VT, const SDLoc &DL, SDValue Op)
Definition: SelectionDAG.h:876
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:487
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:787
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:690
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:782
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:481
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:813
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:859
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
Definition: SelectionDAG.h:499
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:753
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:568
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
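The SelectionDAG builder methods above are how lowered sequences are emitted. As a small, hypothetical sketch (emitClampToZeroSketch is not a function in this file) combining getConstant, getSetCC and getSelect:

static SDValue emitClampToZeroSketch(SelectionDAG &DAG, const SDLoc &DL,
                                     SDValue X, EVT VT) {
  EVT CCVT = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  // max(X, 0) as: select (setlt X, 0), 0, X
  SDValue IsNeg = DAG.getSetCC(DL, CCVT, X, Zero, ISD::SETLT);
  return DAG.getSelect(DL, VT, IsNeg, Zero, X);
}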
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
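The shuffle-mask helpers above classify masks without building new IR. A minimal sketch (isVectorReverseShuffleSketch is hypothetical) of detecting a whole-vector reverse:

static bool isVectorReverseShuffleSketch(ShuffleVectorSDNode *SVN) {
  ArrayRef<int> Mask = SVN->getMask();
  return ShuffleVectorInst::isReverseMask(Mask,
                                          static_cast<int>(Mask.size()));
}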
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void push_back(const T &Elt)
Definition: SmallVector.h:426
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
An instruction for storing to memory.
Definition: Instructions.h:289
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:463
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition: StringRef.h:677
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
StringRef save(const char *S)
Definition: StringSaver.h:30
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
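StringSwitch is the usual way textual names (for example register or constraint suffixes) are mapped to values in lowering code. A small sketch with made-up names and values:

static unsigned parseExampleSuffixSketch(StringRef Name) {
  return StringSwitch<unsigned>(Name)
      .Case("b", 8)
      .Case("h", 16)
      .Case("s", 32)
      .Case("d", 64)
      .Default(0);
}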
Class to represent struct types.
Definition: DerivedTypes.h:216
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool shouldExpandBuildVectorWithShuffles(EVT, unsigned DefinedValues) const
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
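The TargetLoweringBase hooks above are normally invoked from a target's TargetLowering constructor to describe which types and operations are legal. A hypothetical sketch of that shape (ExampleTargetLowering, ExampleSubtarget and Example::GPR64RegClass are placeholders; the real configuration lives in AArch64TargetLowering's constructor):

ExampleTargetLowering::ExampleTargetLowering(const TargetMachine &TM,
                                             const ExampleSubtarget &STI)
    : TargetLowering(TM) {
  // Register a legal scalar register class and mark one operation Expand.
  addRegisterClass(MVT::i64, &Example::GPR64RegClass);
  setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  setBooleanContents(ZeroOrOneBooleanContent);
  setMaxAtomicSizeInBitsSupported(128);
  // Derive register properties once all register classes are added.
  computeRegisterProperties(STI.getRegisterInfo());
}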
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
virtual bool useLoadStackGuardNode() const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:667
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition: Triple.h:634
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static Type * getHalfTy(LLVMContext &C)
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:252
static Type * getBFloatTy(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
@ FloatTyID
32-bit floating point type
Definition: Type.h:58
@ DoubleTyID
64-bit floating point type
Definition: Type.h:59
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
static IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isScalableTy() const
Return true if this is a type whose size is a known multiple of vscale.
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition: Type.h:377
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1795
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:182
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:507
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:454
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:676
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
Definition: DerivedTypes.h:472
Type * getElementType() const
Definition: DerivedTypes.h:436
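The Type and VectorType factory methods above construct the IR types that the lowering code queries. A minimal sketch (buildExampleVectorTypesSketch is hypothetical):

static void buildExampleVectorTypesSketch(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  VectorType *V4I32 =
      VectorType::get(I32, ElementCount::getFixed(4));      // <4 x i32>
  VectorType *NxV4I32 =
      VectorType::get(I32, ElementCount::getScalable(4));   // <vscale x 4 x i32>
  VectorType *V2I32 = VectorType::getHalfElementsVectorType(V4I32); // <2 x i32>
  (void)NxV4I32;
  (void)V2I32;
}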
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:199
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:251
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define UINT64_MAX
Definition: DataTypes.h:77
#define INT64_MAX
Definition: DataTypes.h:71
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ NVCAST
Natural vector cast.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
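The AArch64 immediate helpers above decide whether a constant can be encoded directly in an instruction. A minimal sketch of the usual logical-immediate check for a 64-bit AND/ORR/EOR mask (tryEncodeLogicalImmSketch is a hypothetical helper):

static bool tryEncodeLogicalImmSketch(uint64_t Imm, uint64_t &Encoding) {
  if (!AArch64_AM::isLogicalImmediate(Imm, 64))
    return false;
  Encoding = AArch64_AM::encodeLogicalImmediate(Imm, 64);
  return true;
}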
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
const uint64_t ReservedFPControlBits
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
Key
PAL metadata keys.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ Entry
Definition: COFF.h:811
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
Definition: CallingConv.h:265
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
Definition: CallingConv.h:221
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
Definition: CallingConv.h:224
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserve most registers.
Definition: CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:241
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:238
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ PreserveNone
Used for runtime calls that preserve no general registers.
Definition: CallingConv.h:90
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
Definition: CallingConv.h:159
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
Definition: CallingConv.h:255
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
Definition: CallingConv.h:260
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
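Lowering code frequently branches on the CallingConv enumerators above. A simplified, hypothetical sketch of such a check (the real tail-call logic in this file is considerably more involved):

static bool mayGuaranteeTailCallSketch(CallingConv::ID CC) {
  switch (CC) {
  case CallingConv::Tail:
  case CallingConv::SwiftTail:
    return true;
  default:
    return false;
  }
}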
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:764
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1147
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1143
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:484
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
Definition: ISDOpcodes.h:1360
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1391
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:567
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:728
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1176
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1262
@ STRICT_FCEIL
Definition: ISDOpcodes.h:434
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1052
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1042
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:797
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:491
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:444
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:804
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:551
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1376
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
Definition: ISDOpcodes.h:1380
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:702
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
Definition: ISDOpcodes.h:1046
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1390
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:485
@ STRICT_FLOG2
Definition: ISDOpcodes.h:429
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1288
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:917
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1289
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:954
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:418
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1431
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:899
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:788
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:670
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:458
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:628
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:107
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1373
@ WRITE_REGISTER
Definition: ISDOpcodes.h:125
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1242
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1377
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1009
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:772
@ STRICT_LROUND
Definition: ISDOpcodes.h:439
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:944
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1098
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1073
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1077
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the same...
Definition: ISDOpcodes.h:594
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition: ISDOpcodes.h:654
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:741
@ STRICT_FPOWI
Definition: ISDOpcodes.h:420
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1258
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1392
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:635
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1172
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:438
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1385
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:894
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:659
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1037
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:719
@ ATOMIC_LOAD_CLR
Definition: ISDOpcodes.h:1287
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:608
@ PtrAuthGlobalAddress
A ptrauth constant.
Definition: ISDOpcodes.h:90
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1286
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:581
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:443
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:432
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:124
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:543
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:794
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1232
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:870
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:433
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:756
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
Definition: ISDOpcodes.h:1350
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1269
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:986
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1236
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1062
@ STRICT_LRINT
Definition: ISDOpcodes.h:441
@ ConstantPool
Definition: ISDOpcodes.h:82
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:812
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:682
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition: ISDOpcodes.h:599
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:902
@ STRICT_FROUND
Definition: ISDOpcodes.h:436
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:750
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:457
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1393
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:435
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:437
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:100
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1284
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:451
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:473
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:450
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1005
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1285
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:850
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1203
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:164
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:478
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:694
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1229
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:665
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:407
@ STRICT_FLOG10
Definition: ISDOpcodes.h:428
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:532
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_LLRINT
Definition: ISDOpcodes.h:442
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition: ISDOpcodes.h:620
@ STRICT_FEXP2
Definition: ISDOpcodes.h:426
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1283
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:959
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:883
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:112
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ STRICT_LLROUND
Definition: ISDOpcodes.h:440
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:845
@ EXPERIMENTAL_VECTOR_HISTOGRAM
Definition: ISDOpcodes.h:1422
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:431
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:869
@ VECREDUCE_FMINIMUM
Definition: ISDOpcodes.h:1381
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:800
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1167
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1091
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:777
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:501
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:430
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the sa...
Definition: ISDOpcodes.h:588
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:320
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:523
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1625
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
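The two condition-code helpers above are easier to read with a concrete case. The sketch below is illustrative only (not code from this file); it assumes the declarations are reachable through llvm/CodeGen/SelectionDAGNodes.h, which may vary between releases.

#include "llvm/CodeGen/SelectionDAGNodes.h"
#include <cassert>

static void setCCHelpersSketch() {
  llvm::EVT VT = llvm::MVT::i32;
  // !(X < Y) is X >= Y for a signed integer compare.
  assert(llvm::ISD::getSetCCInverse(llvm::ISD::SETLT, VT) == llvm::ISD::SETGE);
  // X < Y written with its operands swapped is Y > X.
  assert(llvm::ISD::getSetCCSwappedOperands(llvm::ISD::SETLT) == llvm::ISD::SETGT);
}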
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
Definition: ISDOpcodes.h:1516
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1503
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1554
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1534
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1505
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1484
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:816
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:875
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
VScaleVal_match m_VScale()
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
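As an illustration of the PatternMatch helpers listed above, here is a small sketch (not from this file; V and Mask are assumed to come from surrounding IR) that matches an 'and' of an extension with a specific mask value, accepting the operands in either order.

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

static bool isMaskedExtOf(llvm::Value *V, llvm::Value *Mask) {
  using namespace llvm::PatternMatch;
  llvm::Value *Src = nullptr;
  // m_c_And accepts its operands in either order; m_ZExtOrSExt matches either
  // a zext or a sext, and m_Value captures the extended source value.
  return match(V, m_c_And(m_ZExtOrSExt(m_Value(Src)), m_Specific(Mask)));
}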
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
@ Define
Register definition.
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition: ObjCARCUtil.h:43
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition: ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool isPackedVectorType(EVT SomeVT)
Definition: VECustomDAG.cpp:22
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:853
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:295
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:239
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:343
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResult)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> or <1,...
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:280
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1522
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:330
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Preserve_None(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:269
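A short, self-contained sketch of the bit-manipulation helpers referenced in this index (llvm/Support/MathExtras.h and llvm/ADT/bit.h); the values are illustrative only.

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void bitHelpersSketch() {
  uint64_t V = 0x0FF0; // bits 4..11 set
  assert(llvm::isShiftedMask_64(V)); // one contiguous run of ones
  assert(!llvm::isPowerOf2_64(V));   // more than one bit set
  assert(llvm::countr_zero(V) == 4); // the run starts at bit 4
  assert(llvm::Log2_64(V) == 11);    // index of the highest set bit
  assert(llvm::isMask_64(0x00FF));   // ones starting at the least significant bit
}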
unsigned M1(unsigned Val)
Definition: VE.h:376
bool isReleaseOrStronger(AtomicOrdering AO)
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:257
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
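isTRNMask, isUZPMask, and isZIPMask above all classify fixed shuffle-mask shapes. The following is an illustrative, self-contained re-implementation of just the zip check for fully specified masks; it is not the backend's own routine, which also tolerates undef (-1) lanes. For example, passing the mask {0, 8, 1, 9, 2, 10, 3, 11} with NumElts == 8 returns true with WhichResult == 0.

#include <vector>

static bool looksLikeZipMask(const std::vector<int> &M, unsigned NumElts,
                             unsigned &WhichResult) {
  if (M.size() != NumElts || (NumElts % 2) != 0)
    return false;
  for (unsigned Which = 0; Which < 2; ++Which) {
    unsigned Base = Which * (NumElts / 2); // zip1 starts at 0, zip2 at NumElts/2
    bool Matches = true;
    for (unsigned I = 0; I != NumElts / 2 && Matches; ++I)
      Matches = M[2 * I] == int(Base + I) &&
                M[2 * I + 1] == int(Base + I + NumElts);
    if (Matches) {
      WhichResult = Which; // 0 selects zip1, 1 selects zip2 in this sketch
      return true;
    }
  }
  return false;
}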
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:244
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
gep_type_iterator gep_type_begin(const User *GEP)
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2051
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
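A minimal sketch of the alignment helpers above (llvm/Support/Alignment.h); the numbers are illustrative.

#include "llvm/Support/Alignment.h"
#include <cassert>

static void alignmentSketch() {
  llvm::Align A16(16);
  assert(A16.value() == 16);
  // Round a byte size up to the next multiple of the alignment.
  assert(llvm::alignTo(40, A16) == 48);
  // The strongest alignment known to hold at an offset from an aligned base.
  assert(llvm::commonAlignment(A16, 8) == llvm::Align(8));
}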
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2039
static const MachineMemOperand::Flags MOStridedAccess
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Default
The result values are uniform if and only if all operands are uniform.
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
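For createSequentialMask (llvm/Analysis/VectorUtils.h), a small illustrative call: Start is the first lane index, NumInts the count of consecutive indices, and NumUndefs the number of trailing -1 (undef) entries.

#include "llvm/Analysis/VectorUtils.h"
#include <cassert>

static void sequentialMaskSketch() {
  auto Mask = llvm::createSequentialMask(/*Start=*/2, /*NumInts=*/3, /*NumUndefs=*/1);
  // Expected contents: {2, 3, 4, -1}.
  assert(Mask.size() == 4 && Mask[0] == 2 && Mask[2] == 4 && Mask[3] == -1);
}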
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
@ Enable
Enable colors.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
Helper structure to be able to read SetCC information.
static unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:317
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
uint64_t getScalarStoreSize() const
Definition: ValueTypes.h:387
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:274
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:340
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:112
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:349
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:415
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:203
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition: ValueTypes.h:429
bool isScalableVT() const
Return true if the type is a scalable type.
Definition: ValueTypes.h:183
bool isFixedLengthVector() const
Definition: ValueTypes.h:177
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:282
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:208
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:203
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:173
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:438
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:198
Describes a register that needs to be forwarded from the prologue to a musttail call.
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:290
static KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
Definition: KnownBits.cpp:428
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:150
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
static KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
Definition: KnownBits.cpp:370
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:285
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:300
static KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
Definition: KnownBits.cpp:285
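A brief sketch of the KnownBits entries above: start from fully known constants, shift, and query what remains known (values are illustrative).

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>

static void knownBitsSketch() {
  llvm::KnownBits Val = llvm::KnownBits::makeConstant(llvm::APInt(8, 0xF0));
  llvm::KnownBits Amt = llvm::KnownBits::makeConstant(llvm::APInt(8, 4));
  llvm::KnownBits Res = llvm::KnownBits::lshr(Val, Amt);
  assert(Res.getBitWidth() == 8);
  // 0xF0 >> 4 == 0x0F, so at most 4 bits are needed for the result.
  assert(Res.countMaxActiveBits() == 4);
}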
Structure used to represent pair of argument number after call lowering and register used to transfer...
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
Definition: PredicateInfo.h:74
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::OutputArg, 32 > Outs
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64