AArch64ISelLowering.cpp
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in the future when both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// XOR, OR and CMP all use ALU ports, and the data dependency becomes the
143// bottleneck after this transform on high-end CPUs. This maximum leaf-node
144// limit therefore guards that cmp+ccmp remains profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148/// Value type used for condition codes.
149static const MVT MVT_CC = MVT::i32;
150
151static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
152 AArch64::X3, AArch64::X4, AArch64::X5,
153 AArch64::X6, AArch64::X7};
154static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
155 AArch64::Q3, AArch64::Q4, AArch64::Q5,
156 AArch64::Q6, AArch64::Q7};
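// Illustrative note (not part of the original source): these arrays mirror the
// AAPCS64 argument registers, where the first eight integer arguments are
// passed in X0-X7 and the first eight floating-point/vector arguments in
// Q0-Q7 (accessed through the overlapping D/S/H views for narrower types).
// A rough sketch of the mapping for a simple C prototype:
//
//   int f(int a, long b, double c);
//   //  a -> W0, b -> X1, c -> D0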
157
159
161
162static inline EVT getPackedSVEVectorVT(EVT VT) {
163 switch (VT.getSimpleVT().SimpleTy) {
164 default:
165 llvm_unreachable("unexpected element type for vector");
166 case MVT::i8:
167 return MVT::nxv16i8;
168 case MVT::i16:
169 return MVT::nxv8i16;
170 case MVT::i32:
171 return MVT::nxv4i32;
172 case MVT::i64:
173 return MVT::nxv2i64;
174 case MVT::f16:
175 return MVT::nxv8f16;
176 case MVT::f32:
177 return MVT::nxv4f32;
178 case MVT::f64:
179 return MVT::nxv2f64;
180 case MVT::bf16:
181 return MVT::nxv8bf16;
182 }
183}
184
185// NOTE: Currently there's only a need to return integer vector types. If this
186// changes then just add an extra "type" parameter.
188 switch (EC.getKnownMinValue()) {
189 default:
190 llvm_unreachable("unexpected element count for vector");
191 case 16:
192 return MVT::nxv16i8;
193 case 8:
194 return MVT::nxv8i16;
195 case 4:
196 return MVT::nxv4i32;
197 case 2:
198 return MVT::nxv2i64;
199 }
200}
201
203 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
204 "Expected scalable predicate vector type!");
205 switch (VT.getVectorMinNumElements()) {
206 default:
207 llvm_unreachable("unexpected element count for vector");
208 case 2:
209 return MVT::nxv2i64;
210 case 4:
211 return MVT::nxv4i32;
212 case 8:
213 return MVT::nxv8i16;
214 case 16:
215 return MVT::nxv16i8;
216 }
217}
218
219/// Returns true if VT's elements occupy the lowest bit positions of its
220/// associated register class without any intervening space.
221///
222/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
223/// same register class, but only nxv8f16 can be treated as a packed vector.
224static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
226 "Expected legal vector type!");
227 return VT.isFixedLengthVector() ||
229}
230
231// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
232// predicate and end with a passthru value matching the result type.
233static bool isMergePassthruOpcode(unsigned Opc) {
234 switch (Opc) {
235 default:
236 return false;
266 return true;
267 }
268}
269
270// Returns true if inactive lanes are known to be zeroed by construction.
272 switch (Op.getOpcode()) {
273 default:
274 return false;
275 // We guarantee i1 splat_vectors to zero the other lanes
279 return true;
281 switch (Op.getConstantOperandVal(0)) {
282 default:
283 return false;
284 case Intrinsic::aarch64_sve_ptrue:
285 case Intrinsic::aarch64_sve_pnext:
286 case Intrinsic::aarch64_sve_cmpeq:
287 case Intrinsic::aarch64_sve_cmpne:
288 case Intrinsic::aarch64_sve_cmpge:
289 case Intrinsic::aarch64_sve_cmpgt:
290 case Intrinsic::aarch64_sve_cmphs:
291 case Intrinsic::aarch64_sve_cmphi:
292 case Intrinsic::aarch64_sve_cmpeq_wide:
293 case Intrinsic::aarch64_sve_cmpne_wide:
294 case Intrinsic::aarch64_sve_cmpge_wide:
295 case Intrinsic::aarch64_sve_cmpgt_wide:
296 case Intrinsic::aarch64_sve_cmplt_wide:
297 case Intrinsic::aarch64_sve_cmple_wide:
298 case Intrinsic::aarch64_sve_cmphs_wide:
299 case Intrinsic::aarch64_sve_cmphi_wide:
300 case Intrinsic::aarch64_sve_cmplo_wide:
301 case Intrinsic::aarch64_sve_cmpls_wide:
302 case Intrinsic::aarch64_sve_fcmpeq:
303 case Intrinsic::aarch64_sve_fcmpne:
304 case Intrinsic::aarch64_sve_fcmpge:
305 case Intrinsic::aarch64_sve_fcmpgt:
306 case Intrinsic::aarch64_sve_fcmpuo:
307 case Intrinsic::aarch64_sve_facgt:
308 case Intrinsic::aarch64_sve_facge:
309 case Intrinsic::aarch64_sve_whilege:
310 case Intrinsic::aarch64_sve_whilegt:
311 case Intrinsic::aarch64_sve_whilehi:
312 case Intrinsic::aarch64_sve_whilehs:
313 case Intrinsic::aarch64_sve_whilele:
314 case Intrinsic::aarch64_sve_whilelo:
315 case Intrinsic::aarch64_sve_whilels:
316 case Intrinsic::aarch64_sve_whilelt:
317 case Intrinsic::aarch64_sve_match:
318 case Intrinsic::aarch64_sve_nmatch:
319 case Intrinsic::aarch64_sve_whilege_x2:
320 case Intrinsic::aarch64_sve_whilegt_x2:
321 case Intrinsic::aarch64_sve_whilehi_x2:
322 case Intrinsic::aarch64_sve_whilehs_x2:
323 case Intrinsic::aarch64_sve_whilele_x2:
324 case Intrinsic::aarch64_sve_whilelo_x2:
325 case Intrinsic::aarch64_sve_whilels_x2:
326 case Intrinsic::aarch64_sve_whilelt_x2:
327 return true;
328 }
329 }
330}
331
333 const AArch64Subtarget &STI)
334 : TargetLowering(TM), Subtarget(&STI) {
335 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
336 // we have to make something up. Arbitrarily, choose ZeroOrOne.
338 // When comparing vectors the result sets the different elements in the
339 // vector to all-one or all-zero.
341
342 // Set up the register classes.
343 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
344 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
345
346 if (Subtarget->hasLS64()) {
347 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
348 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
350 }
351
352 if (Subtarget->hasFPARMv8()) {
353 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
354 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
355 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
356 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
357 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
358 }
359
360 if (Subtarget->hasNEON()) {
361 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
362 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
363 // Someone set us up the NEON.
364 addDRTypeForNEON(MVT::v2f32);
365 addDRTypeForNEON(MVT::v8i8);
366 addDRTypeForNEON(MVT::v4i16);
367 addDRTypeForNEON(MVT::v2i32);
368 addDRTypeForNEON(MVT::v1i64);
369 addDRTypeForNEON(MVT::v1f64);
370 addDRTypeForNEON(MVT::v4f16);
371 addDRTypeForNEON(MVT::v4bf16);
372
373 addQRTypeForNEON(MVT::v4f32);
374 addQRTypeForNEON(MVT::v2f64);
375 addQRTypeForNEON(MVT::v16i8);
376 addQRTypeForNEON(MVT::v8i16);
377 addQRTypeForNEON(MVT::v4i32);
378 addQRTypeForNEON(MVT::v2i64);
379 addQRTypeForNEON(MVT::v8f16);
380 addQRTypeForNEON(MVT::v8bf16);
381 }
382
383 if (Subtarget->hasSVEorSME()) {
384 // Add legal sve predicate types
385 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
386 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
387 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
388 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
389 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
390
391 // Add legal sve data types
392 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
393 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
394 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
395 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
396
397 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
398 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
399 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
400 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
401 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
402 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
403
404 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
405 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
406 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
407
408 if (Subtarget->useSVEForFixedLengthVectors()) {
411 addRegisterClass(VT, &AArch64::ZPRRegClass);
412
415 addRegisterClass(VT, &AArch64::ZPRRegClass);
416 }
417 }
418
419 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
420 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
421 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
422 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
423
424 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
425 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
426 }
427
428 // Compute derived properties from the register classes
430
431 // Provide all sorts of operation actions
470
474
478
480
481 // Custom lowering hooks are needed for XOR
482 // to fold it into CSINC/CSINV.
485
486 // Virtually no operation on f128 is legal, but LLVM can't expand them when
487 // there's a valid register class, so we need custom operations in most cases.
511 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
512 // aren't handled.
513
514 // Lowering for many of the conversions is actually specified by the non-f128
515 // type. The LowerXXX function will be trivial when f128 isn't involved.
540 if (Subtarget->hasFPARMv8()) {
543 }
546 if (Subtarget->hasFPARMv8()) {
549 }
552
557
558 // Variable arguments.
563
564 // Variable-sized objects.
567
568 // Lowering Funnel Shifts to EXTR
573
575
576 // Constant pool entries
578
579 // BlockAddress
581
582 // AArch64 lacks both left-rotate and popcount instructions.
588 }
589
590 // AArch64 doesn't have i32 MULH{S|U}.
593
594 // AArch64 doesn't have {U|S}MUL_LOHI.
599
600 if (Subtarget->hasCSSC()) {
604
606
610
613
618
623 } else {
627
630
633 }
634
640 }
647
648 // Custom lower Add/Sub/Mul with overflow.
661
670
679 if (Subtarget->hasFullFP16()) {
682 } else {
685 }
686
687 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
695 setOperationAction(Op, MVT::f16, Promote);
696 setOperationAction(Op, MVT::v4f16, Expand);
697 setOperationAction(Op, MVT::v8f16, Expand);
698 setOperationAction(Op, MVT::bf16, Promote);
699 setOperationAction(Op, MVT::v4bf16, Expand);
700 setOperationAction(Op, MVT::v8bf16, Expand);
701 }
702
703 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
704 for (auto Op : {
708 ISD::FADD,
709 ISD::FSUB,
710 ISD::FMUL,
711 ISD::FDIV,
712 ISD::FMA,
742 })
743 setOperationAction(Op, ScalarVT, Promote);
744
745 for (auto Op : {ISD::FNEG, ISD::FABS})
746 setOperationAction(Op, ScalarVT, Legal);
747
748 // Round-to-integer needs custom lowering for fp16, as Promote doesn't work
749 // because the result type is integer.
753 setOperationAction(Op, ScalarVT, Custom);
754
755 // promote v4f16 to v4f32 when that is known to be safe.
756 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
757 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
758 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
759 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
760 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
761 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
762 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
763 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
764 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
765 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
766 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
767 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
768
778
779 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
801 };
802
803 if (!Subtarget->hasFullFP16()) {
804 LegalizeNarrowFP(MVT::f16);
805 }
806 LegalizeNarrowFP(MVT::bf16);
809
810 // AArch64 has implementations of a lot of rounding-like FP operations.
811 for (auto Op :
822 for (MVT Ty : {MVT::f32, MVT::f64})
824 if (Subtarget->hasFullFP16())
825 setOperationAction(Op, MVT::f16, Legal);
826 }
827
828 // Basic strict FP operations are legal
831 for (MVT Ty : {MVT::f32, MVT::f64})
833 if (Subtarget->hasFullFP16())
834 setOperationAction(Op, MVT::f16, Legal);
835 }
836
837 // Strict conversion to a larger type is legal
838 for (auto VT : {MVT::f32, MVT::f64})
840
842
848
850 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
853 } else {
856 }
859
860 // Generate outline atomics library calls only if LSE was not specified for
861 // the subtarget.
862 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
888#define LCALLNAMES(A, B, N) \
889 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
890 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
891 setLibcallName(A##N##_REL, #B #N "_rel"); \
892 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
893#define LCALLNAME4(A, B) \
894 LCALLNAMES(A, B, 1) \
895 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
896#define LCALLNAME5(A, B) \
897 LCALLNAMES(A, B, 1) \
898 LCALLNAMES(A, B, 2) \
899 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
900 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
901 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
902 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
903 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
904 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
905 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
906#undef LCALLNAMES
907#undef LCALLNAME4
908#undef LCALLNAME5
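// Illustrative note (not part of the original source): each LCALLNAME4/5 use
// above expands into one setLibcallName call per size and memory ordering.
// LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp), for instance, produces
// among others:
//
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_SWP4_ACQ, "__aarch64_swp4_acq");
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_SWP8_REL, "__aarch64_swp8_rel");
//
// i.e. the 4-byte acquire and 8-byte release entry points of the
// outline-atomics helper library.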
909 }
910
911 if (Subtarget->hasLSE128()) {
912 // Custom lowering because i128 is not legal. Must be replaced by 2x64
913 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
917 }
918
919 // 128-bit loads and stores can be done without expanding
922
923 // Aligned 128-bit loads and stores are single-copy atomic according to the
924 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
925 if (Subtarget->hasLSE2()) {
928 }
929
930 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
931 // custom lowering, as there are no un-paired non-temporal stores and
932 // legalization will break up 256 bit inputs.
934 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
935 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
936 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
941
942 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
943 // custom lowering, as there are no un-paired non-temporal loads and
944 // legalization will break up 256 bit inputs.
945 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
946 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
947 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
948 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
949 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
950 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
951 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
952 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
953
954 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
956
957 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
958 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
959 // Issue __sincos_stret if available.
962 } else {
965 }
966
967 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
968 // MSVCRT doesn't have powi; fall back to pow
969 setLibcallName(RTLIB::POWI_F32, nullptr);
970 setLibcallName(RTLIB::POWI_F64, nullptr);
971 }
972
973 // Make floating-point constants legal for the large code model, so they don't
974 // become loads from the constant pool.
975 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
978 }
979
980 // AArch64 does not have floating-point extending loads, i1 sign-extending
981 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
982 for (MVT VT : MVT::fp_valuetypes()) {
983 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
984 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
985 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
986 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
987 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
988 }
989 for (MVT VT : MVT::integer_valuetypes())
991
992 for (MVT WideVT : MVT::fp_valuetypes()) {
993 for (MVT NarrowVT : MVT::fp_valuetypes()) {
994 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
995 setTruncStoreAction(WideVT, NarrowVT, Expand);
996 }
997 }
998 }
999
1000 if (Subtarget->hasFPARMv8()) {
1004 }
1005
1006 // Indexed loads and stores are supported.
1007 for (unsigned im = (unsigned)ISD::PRE_INC;
1009 setIndexedLoadAction(im, MVT::i8, Legal);
1010 setIndexedLoadAction(im, MVT::i16, Legal);
1011 setIndexedLoadAction(im, MVT::i32, Legal);
1012 setIndexedLoadAction(im, MVT::i64, Legal);
1013 setIndexedLoadAction(im, MVT::f64, Legal);
1014 setIndexedLoadAction(im, MVT::f32, Legal);
1015 setIndexedLoadAction(im, MVT::f16, Legal);
1016 setIndexedLoadAction(im, MVT::bf16, Legal);
1017 setIndexedStoreAction(im, MVT::i8, Legal);
1018 setIndexedStoreAction(im, MVT::i16, Legal);
1019 setIndexedStoreAction(im, MVT::i32, Legal);
1020 setIndexedStoreAction(im, MVT::i64, Legal);
1021 setIndexedStoreAction(im, MVT::f64, Legal);
1022 setIndexedStoreAction(im, MVT::f32, Legal);
1023 setIndexedStoreAction(im, MVT::f16, Legal);
1024 setIndexedStoreAction(im, MVT::bf16, Legal);
1025 }
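// Illustrative note (not part of the original source): marking these indexed
// modes Legal lets instruction selection fold the address update into the
// memory access itself, roughly:
//
//   ldr x0, [x1], #8    // post-indexed: load, then x1 += 8
//   ldr x0, [x1, #8]!   // pre-indexed:  x1 += 8, then load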
1026
1027 // Trap.
1028 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1031
1032 // We combine OR nodes for bitfield operations.
1034 // Try to create BICs for vector ANDs.
1036
1037 // Vector add and sub nodes may conceal a high-half opportunity.
1038 // Also, try to fold ADD into CSINC/CSINV..
1041
1044
1045 // Try and combine setcc with csel
1047
1049
1056
1058
1060
1062
1066
1068
1070
1072
1074
1078
1080
1081 // In case of strict alignment, avoid an excessive number of byte wide stores.
1084 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1085
1089 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1090
1093
1096 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1097
1099
1101
1102 EnableExtLdPromotion = true;
1103
1104 // Set required alignment.
1106 // Set preferred alignments.
1107
1108 // Don't align loops on Windows. The SEH unwind info generation needs to
1109 // know the exact length of functions before the alignments have been
1110 // expanded.
1111 if (!Subtarget->isTargetWindows())
1115
1116 // Only change the limit for entries in a jump table if specified by
1117 // the subtarget, but not at the command line.
1118 unsigned MaxJT = STI.getMaximumJumpTableSize();
1119 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1121
1123
1125
1127
1128 if (Subtarget->hasNEON()) {
1129 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1130 // silliness like this:
1131 for (auto Op :
1149 setOperationAction(Op, MVT::v1f64, Expand);
1150
1151 for (auto Op :
1156 setOperationAction(Op, MVT::v1i64, Expand);
1157
1158 // AArch64 doesn't have direct vector->f32 conversion instructions for
1159 // elements smaller than i32, so promote the input to i32 first.
1160 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1161 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1162
1163 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1164 // Nor is there a direct i32 -> f16 vector conversion. Set it to Custom, so the
1165 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1168 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1170
1171 if (Subtarget->hasFullFP16()) {
1174
1183 } else {
1184 // When AArch64 doesn't have fullfp16 support, promote the input
1185 // to i32 first.
1186 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1187 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1188 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1189 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1190 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1191 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1192 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1193 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1194 }
1195
1196 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1197 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1204 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1209 }
1210
1211 // Custom handling for some quad-vector types to detect MULL.
1212 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1213 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1214 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1215 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1216 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1217 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1218
1219 // Saturates
1220 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1221 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1226 }
1227
1228 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1229 MVT::v4i32}) {
1236 }
1237
1238 // Vector reductions
1239 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1240 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1241 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1246
1248 }
1249 }
1250 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1251 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1260 }
1265
1267 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1268 // Likewise, narrowing and extending vector loads/stores aren't handled
1269 // directly.
1272
1273 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1276 } else {
1279 }
1282
1285
1286 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1287 setTruncStoreAction(VT, InnerVT, Expand);
1288 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1289 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1290 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1291 }
1292 }
1293
1294 // AArch64 has implementations of a lot of rounding-like FP operations.
1295 for (auto Op :
1300 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1302 if (Subtarget->hasFullFP16())
1303 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1305 }
1306
1307 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1308
1313
1317
1318 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1319 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1320 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1321 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1322 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1323 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1324
1325 // ADDP custom lowering
1326 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1328 // FADDP custom lowering
1329 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1331 }
1332
1333 if (Subtarget->hasSME()) {
1335 }
1336
1337 // FIXME: Move lowering for more nodes here if those are common between
1338 // SVE and SME.
1339 if (Subtarget->hasSVEorSME()) {
1340 for (auto VT :
1341 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1346 }
1347 }
1348
1349 if (Subtarget->hasSVEorSME()) {
1350 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1393
1399
1408
1413
1414 if (!Subtarget->isLittleEndian())
1416
1417 if (Subtarget->hasSVE2orSME())
1418 // For SLI/SRI.
1420 }
1421
1422 // Illegal unpacked integer vector types.
1423 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1426 }
1427
1428 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1429 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1430 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1432
1433 for (auto VT :
1434 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1435 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1437
1438 for (auto VT :
1439 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1447
1451
1452 // There are no legal MVT::nxv16f## based types.
1453 if (VT != MVT::nxv16i1) {
1456 }
1457 }
1458
1459 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1460 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1461 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1462 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1467 }
1468
1469 // Firstly, exclude all scalable vector extending loads/truncating stores,
1470 // including both integer and floating-point scalable vectors.
1472 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1473 setTruncStoreAction(VT, InnerVT, Expand);
1474 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1475 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1476 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1477 }
1478 }
1479
1480 // Then, selectively enable those which we directly support.
1481 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1482 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1483 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1484 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1485 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1486 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1487 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1488 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1489 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1490 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1491 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1492 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1493 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1494 }
1495
1496 // SVE supports truncating stores of 64 and 128-bit vectors
1497 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1498 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1499 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1500 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1501 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1502
1503 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1504 MVT::nxv4f32, MVT::nxv2f64}) {
1540 if (Subtarget->isSVEAvailable())
1545
1559
1571
1572 if (!Subtarget->isLittleEndian())
1574 }
1575
1576 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1583
1584 if (!Subtarget->isLittleEndian())
1586 }
1587
1590
1591 // NEON doesn't support integer divides, but SVE does
1592 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1593 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1596 }
1597
1598 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1599 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1600 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1601
1602 if (Subtarget->isSVEAvailable()) {
1603 // NEON doesn't support across-vector reductions, but SVE does.
1604 for (auto VT :
1605 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1607 }
1608
1609 // NOTE: Currently this has to happen after computeRegisterProperties rather
1610 // than the preferred option of combining it with the addRegisterClass call.
1611 if (Subtarget->useSVEForFixedLengthVectors()) {
1614 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1615 addTypeForFixedLengthSVE(VT);
1616 }
1619 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1620 addTypeForFixedLengthSVE(VT);
1621 }
1622
1623 // 64-bit results can mean a wider-than-NEON input.
1624 for (auto VT : {MVT::v8i8, MVT::v4i16})
1627
1628 // 128-bit results imply a wider-than-NEON input.
1629 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1631 for (auto VT : {MVT::v8f16, MVT::v4f32})
1633
1634 // These operations are not supported on NEON but SVE can do them.
1636 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1637 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1638 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1639 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1640 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1641 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1642 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1643 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1644 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1645 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1646 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1647 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1648 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1649 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1650 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1655
1656 // Int operations with no NEON support.
1657 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1658 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1666 }
1667
1668
1669 // Use SVE for vectors with more than 2 elements.
1670 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1672 }
1673
1674 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1675 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1676 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1677 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1678
1680 }
1681
1682 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1683 // Only required for llvm.aarch64.mops.memset.tag
1685 }
1686
1688
1689 if (Subtarget->hasSVE()) {
1694 }
1695
1696 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1697
1698 IsStrictFPEnabled = true;
1700
1701 if (Subtarget->isWindowsArm64EC()) {
1702 // FIXME: are there intrinsics we need to exclude from this?
1703 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1704 auto code = static_cast<RTLIB::Libcall>(i);
1705 auto libcallName = getLibcallName(code);
1706 if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1707 setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1708 }
1709 }
1710 }
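// Illustrative note (not part of the original source): on Arm64EC the '#'
// prefix denotes the EC-mangled symbol, so a libcall that would normally
// resolve to, say, "__divti3" is re-registered above as "#__divti3".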
1711}
1712
1713void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1714 assert(VT.isVector() && "VT should be a vector type");
1715
1716 if (VT.isFloatingPoint()) {
1718 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1719 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1720 }
1721
1722 // Mark vector float intrinsics as expand.
1723 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1733 }
1734
1735 // But we do support custom-lowering for FCOPYSIGN.
1736 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1737 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1738 VT == MVT::v8f16) &&
1739 Subtarget->hasFullFP16()))
1741
1754
1758 for (MVT InnerVT : MVT::all_valuetypes())
1759 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1760
1761 // CNT supports only B element sizes, so use UADDLP afterwards to widen.
1762 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1764
1770
1771 for (unsigned Opcode :
1774 setOperationAction(Opcode, VT, Custom);
1775
1776 if (!VT.isFloatingPoint())
1778
1779 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1780 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1781 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1782 setOperationAction(Opcode, VT, Legal);
1783
1784 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1785 // NEON types.
1786 if (VT.isFloatingPoint() &&
1787 VT.getVectorElementType() != MVT::bf16 &&
1788 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1789 for (unsigned Opcode :
1795 setOperationAction(Opcode, VT, Legal);
1796
1797 // Strict fp extend and trunc are legal
1798 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1800 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1802
1803 // FIXME: We could potentially make use of the vector comparison instructions
1804 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1805 // complications:
1806 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1807 // so we would need to expand when the condition code doesn't match the
1808 // kind of comparison.
1809 // * Some kinds of comparison require more than one FCMXY instruction so
1810 // would need to be expanded instead.
1811 // * The lowering of the non-strict versions involves target-specific ISD
1812 // nodes so we would likely need to add strict versions of all of them and
1813 // handle them appropriately.
1816
1817 if (Subtarget->isLittleEndian()) {
1818 for (unsigned im = (unsigned)ISD::PRE_INC;
1822 }
1823 }
1824
1825 if (Subtarget->hasD128()) {
1828 }
1829}
1830
1832 EVT OpVT) const {
1833 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1834 if (!Subtarget->hasSVE())
1835 return true;
1836
1837 // We can only support legal predicate result types. We can use the SVE
1838 // whilelo instruction for generating fixed-width predicates too.
1839 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1840 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1841 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1842 return true;
1843
1844 // The whilelo instruction only works with i32 or i64 scalar inputs.
1845 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1846 return true;
1847
1848 return false;
1849}
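// Illustrative note (not part of the original source), assuming this hook
// decides whether @llvm.get.active.lane.mask must be expanded generically:
// with the checks above, a mask producing nxv4i1 from i64 bounds is kept and
// selected as an SVE whilelo, while one with e.g. i16 operands or an
// unsupported result type is reported as needing generic expansion.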
1850
1852 return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
1853}
1854
1855void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1856 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1857
1858 // By default everything must be expanded.
1859 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1861
1862 if (VT.isFloatingPoint()) {
1872 }
1873
1875 VT == MVT::v1f64 ? Expand : Custom;
1876
1877 // Mark integer truncating stores/extending loads as having custom lowering
1878 if (VT.isInteger()) {
1879 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1880 while (InnerVT != VT) {
1881 setTruncStoreAction(VT, InnerVT, Default);
1882 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
1883 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
1884 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
1885 InnerVT = InnerVT.changeVectorElementType(
1886 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1887 }
1888 }
1889
1890 // Mark floating-point truncating stores/extending loads as having custom
1891 // lowering
1892 if (VT.isFloatingPoint()) {
1893 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1894 while (InnerVT != VT) {
1895 setTruncStoreAction(VT, InnerVT, Custom);
1896 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
1897 InnerVT = InnerVT.changeVectorElementType(
1899 }
1900 }
1901
1902 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
1903 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
1904
1905 // Lower fixed length vector operations to scalable equivalents.
1910 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
1945 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
1946 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
1948 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
1967 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
1993}
1994
1995void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1996 addRegisterClass(VT, &AArch64::FPR64RegClass);
1997 addTypeForNEON(VT);
1998}
1999
2000void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
2001 addRegisterClass(VT, &AArch64::FPR128RegClass);
2002 addTypeForNEON(VT);
2003}
2004
2006 LLVMContext &C, EVT VT) const {
2007 if (!VT.isVector())
2008 return MVT::i32;
2009 if (VT.isScalableVector())
2010 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2012}
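// Illustrative note (not part of the original source): per the hook above, a
// scalar comparison such as setcc(f64, f64) produces an i32 result, while a
// comparison of scalable vectors, e.g. nxv4i32, produces a predicate with the
// same element count (nxv4i1 here).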
2013
2014// isIntImmediate - This method tests to see if the node is a constant
2015// operand. If so, Imm will receive the value.
2016static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2017 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2018 Imm = C->getZExtValue();
2019 return true;
2020 }
2021 return false;
2022}
2023
2024// isOpcWithIntImmediate - This method tests to see if the node is a specific
2025// opcode and that it has an immediate integer right operand.
2026// If so, Imm will receive the value.
2027static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2028 uint64_t &Imm) {
2029 return N->getOpcode() == Opc &&
2030 isIntImmediate(N->getOperand(1).getNode(), Imm);
2031}
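// Illustrative note (not part of the original source): a typical use of the
// two helpers above, for a hypothetical node N, is matching (and x, 0xff):
//
//   uint64_t Imm;
//   if (isOpcWithIntImmediate(N, ISD::AND, Imm) && Imm == 0xff) {
//     // N is an AND whose right-hand operand is the constant 0xff.
//   }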
2032
2033static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2034 const APInt &Demanded,
2036 unsigned NewOpc) {
2037 uint64_t OldImm = Imm, NewImm, Enc;
2038 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2039
2040 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2041 // bimm64.
2042 if (Imm == 0 || Imm == Mask ||
2044 return false;
2045
2046 unsigned EltSize = Size;
2047 uint64_t DemandedBits = Demanded.getZExtValue();
2048
2049 // Clear bits that are not demanded.
2050 Imm &= DemandedBits;
2051
2052 while (true) {
2053 // The goal here is to set the non-demanded bits in a way that minimizes
2054 // the number of switching between 0 and 1. In order to achieve this goal,
2055 // we set the non-demanded bits to the value of the preceding demanded bits.
2056 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2057 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2058 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2059 // The final result is 0b11000011.
2060 uint64_t NonDemandedBits = ~DemandedBits;
2061 uint64_t InvertedImm = ~Imm & DemandedBits;
2062 uint64_t RotatedImm =
2063 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2064 NonDemandedBits;
2065 uint64_t Sum = RotatedImm + NonDemandedBits;
2066 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2067 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2068 NewImm = (Imm | Ones) & Mask;
2069
2070 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2071 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2072 // we halve the element size and continue the search.
2073 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2074 break;
2075
2077 // We cannot shrink the element size any further if it is 2 bits.
2077 if (EltSize == 2)
2078 return false;
2079
2080 EltSize /= 2;
2081 Mask >>= EltSize;
2082 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2083
2084 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2085 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2086 return false;
2087
2088 // Merge the upper and lower halves of Imm and DemandedBits.
2089 Imm |= Hi;
2090 DemandedBits |= DemandedBitsHi;
2091 }
2092
2093 ++NumOptimizedImms;
2094
2095 // Replicate the element across the register width.
2096 while (EltSize < Size) {
2097 NewImm |= NewImm << EltSize;
2098 EltSize *= 2;
2099 }
2100
2101 (void)OldImm;
2102 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2103 "demanded bits should never be altered");
2104 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2105
2106 // Create the new constant immediate node.
2107 EVT VT = Op.getValueType();
2108 SDLoc DL(Op);
2109 SDValue New;
2110
2111 // If the new constant immediate is all-zeros or all-ones, let the target
2112 // independent DAG combine optimize this node.
2113 if (NewImm == 0 || NewImm == OrigMask) {
2114 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2115 TLO.DAG.getConstant(NewImm, DL, VT));
2116 // Otherwise, create a machine node so that target independent DAG combine
2117 // doesn't undo this optimization.
2118 } else {
2120 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2121 New = SDValue(
2122 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2123 }
2124
2125 return TLO.CombineTo(Op, New);
2126}
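// Illustrative note (not part of the original source): a worked instance of
// the search above, restricted to a single 8-bit element and reusing the
// example from the comment in the loop. With demanded-bit mask 0b01100101 and
// demanded value 0b01000001 (so the immediate reads as 0bx10xx0x1), each run
// of non-demanded bits is filled from the preceding demanded bit, giving
// 0b11000011, a rotated run of ones and therefore encodable as a logical
// immediate for AND/ORR/EOR.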
2127
2129 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2130 TargetLoweringOpt &TLO) const {
2131 // Delay this optimization to as late as possible.
2132 if (!TLO.LegalOps)
2133 return false;
2134
2136 return false;
2137
2138 EVT VT = Op.getValueType();
2139 if (VT.isVector())
2140 return false;
2141
2142 unsigned Size = VT.getSizeInBits();
2143 assert((Size == 32 || Size == 64) &&
2144 "i32 or i64 is expected after legalization.");
2145
2146 // Exit early if we demand all bits.
2147 if (DemandedBits.popcount() == Size)
2148 return false;
2149
2150 unsigned NewOpc;
2151 switch (Op.getOpcode()) {
2152 default:
2153 return false;
2154 case ISD::AND:
2155 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2156 break;
2157 case ISD::OR:
2158 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2159 break;
2160 case ISD::XOR:
2161 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2162 break;
2163 }
2164 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2165 if (!C)
2166 return false;
2167 uint64_t Imm = C->getZExtValue();
2168 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2169}
2170
2171/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2172/// Mask are known to be either zero or one and return them in Known.
2174 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2175 const SelectionDAG &DAG, unsigned Depth) const {
2176 switch (Op.getOpcode()) {
2177 default:
2178 break;
2179 case AArch64ISD::DUP: {
2180 SDValue SrcOp = Op.getOperand(0);
2181 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2182 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2183 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2184 "Expected DUP implicit truncation");
2185 Known = Known.trunc(Op.getScalarValueSizeInBits());
2186 }
2187 break;
2188 }
2189 case AArch64ISD::CSEL: {
2190 KnownBits Known2;
2191 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2192 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2193 Known = Known.intersectWith(Known2);
2194 break;
2195 }
2196 case AArch64ISD::BICi: {
2197 // Compute the bit cleared value.
2198 uint64_t Mask =
2199 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
2200 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2201 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2202 break;
2203 }
2204 case AArch64ISD::VLSHR: {
2205 KnownBits Known2;
2206 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2207 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2208 Known = KnownBits::lshr(Known, Known2);
2209 break;
2210 }
2211 case AArch64ISD::VASHR: {
2212 KnownBits Known2;
2213 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2214 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2215 Known = KnownBits::ashr(Known, Known2);
2216 break;
2217 }
2218 case AArch64ISD::VSHL: {
2219 KnownBits Known2;
2220 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2221 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2222 Known = KnownBits::shl(Known, Known2);
2223 break;
2224 }
2225 case AArch64ISD::MOVI: {
2227 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2228 break;
2229 }
2231 case AArch64ISD::ADDlow: {
2232 if (!Subtarget->isTargetILP32())
2233 break;
2234 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2235 Known.Zero = APInt::getHighBitsSet(64, 32);
2236 break;
2237 }
2239 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2240 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2241 break;
2242 }
2244 Intrinsic::ID IntID =
2245 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2246 switch (IntID) {
2247 default: return;
2248 case Intrinsic::aarch64_ldaxr:
2249 case Intrinsic::aarch64_ldxr: {
2250 unsigned BitWidth = Known.getBitWidth();
2251 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2252 unsigned MemBits = VT.getScalarSizeInBits();
2253 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2254 return;
2255 }
2256 }
2257 break;
2258 }
2260 case ISD::INTRINSIC_VOID: {
2261 unsigned IntNo = Op.getConstantOperandVal(0);
2262 switch (IntNo) {
2263 default:
2264 break;
2265 case Intrinsic::aarch64_neon_uaddlv: {
2266 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2267 unsigned BitWidth = Known.getBitWidth();
2268 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2269 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2270 assert(BitWidth >= Bound && "Unexpected width!");
2272 Known.Zero |= Mask;
2273 }
2274 break;
2275 }
2276 case Intrinsic::aarch64_neon_umaxv:
2277 case Intrinsic::aarch64_neon_uminv: {
2278 // Figure out the datatype of the vector operand. The UMINV instruction
2279 // will zero extend the result, so we can mark as known zero all the
2280 // bits larger than the element datatype. 32-bit or larger doesn't need
2281 // this as those are legal types and will be handled by isel directly.
2282 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2283 unsigned BitWidth = Known.getBitWidth();
2284 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2285 assert(BitWidth >= 8 && "Unexpected width!");
2287 Known.Zero |= Mask;
2288 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2289 assert(BitWidth >= 16 && "Unexpected width!");
2291 Known.Zero |= Mask;
2292 }
2293 break;
2294 } break;
2295 }
2296 }
2297 }
2298}
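// Illustrative note (not part of the original source): as an example of the
// ldaxr/ldxr handling above, an i64-typed aarch64_ldxr whose memory type is i8
// zero-extends the loaded byte, so the top 56 bits are added to Known.Zero.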
2299
2301 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2302 unsigned Depth) const {
2303 EVT VT = Op.getValueType();
2304 unsigned VTBits = VT.getScalarSizeInBits();
2305 unsigned Opcode = Op.getOpcode();
2306 switch (Opcode) {
2307 case AArch64ISD::CMEQ:
2308 case AArch64ISD::CMGE:
2309 case AArch64ISD::CMGT:
2310 case AArch64ISD::CMHI:
2311 case AArch64ISD::CMHS:
2312 case AArch64ISD::FCMEQ:
2313 case AArch64ISD::FCMGE:
2314 case AArch64ISD::FCMGT:
2315 case AArch64ISD::CMEQz:
2316 case AArch64ISD::CMGEz:
2317 case AArch64ISD::CMGTz:
2318 case AArch64ISD::CMLEz:
2319 case AArch64ISD::CMLTz:
2320 case AArch64ISD::FCMEQz:
2321 case AArch64ISD::FCMGEz:
2322 case AArch64ISD::FCMGTz:
2323 case AArch64ISD::FCMLEz:
2324 case AArch64ISD::FCMLTz:
2325 // Compares return either 0 or all-ones
2326 return VTBits;
2327 }
2328
2329 return 1;
2330}
2331
2333 EVT) const {
2334 return MVT::i64;
2335}
2336
2338 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2339 unsigned *Fast) const {
2340 if (Subtarget->requiresStrictAlign())
2341 return false;
2342
2343 if (Fast) {
2344 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2345 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2346 // See comments in performSTORECombine() for more details about
2347 // these conditions.
2348
2349 // Code that uses clang vector extensions can mark that it
2350 // wants unaligned accesses to be treated as fast by
2351 // underspecifying alignment to be 1 or 2.
2352 Alignment <= 2 ||
2353
2354 // Disregard v2i64. Memcpy lowering produces those and splitting
2355 // them regresses performance on micro-benchmarks and olden/bh.
2356 VT == MVT::v2i64;
2357 }
2358 return true;
2359}
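// Illustrative note (not part of the original source): given the logic above,
// an unaligned 16-byte store on a core that reports misaligned 128-bit stores
// as slow is still permitted (the function returns true) but *Fast is set to
// false, unless the access merely underspecified its alignment as 1 or 2 or is
// a v2i64 produced by memcpy lowering.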
2360
2361// Same as above but handling LLTs instead.
2363 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2364 unsigned *Fast) const {
2365 if (Subtarget->requiresStrictAlign())
2366 return false;
2367
2368 if (Fast) {
2369 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2370 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2371 Ty.getSizeInBytes() != 16 ||
2372 // See comments in performSTORECombine() for more details about
2373 // these conditions.
2374
2375 // Code that uses clang vector extensions can mark that it
2376 // wants unaligned accesses to be treated as fast by
2377 // underspecifying alignment to be 1 or 2.
2378 Alignment <= 2 ||
2379
2380 // Disregard v2i64. Memcpy lowering produces those and splitting
2381 // them regresses performance on micro-benchmarks and olden/bh.
2382 Ty == LLT::fixed_vector(2, 64);
2383 }
2384 return true;
2385}
2386
2387FastISel *
2389 const TargetLibraryInfo *libInfo) const {
2390 return AArch64::createFastISel(funcInfo, libInfo);
2391}
2392
2393const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2394#define MAKE_CASE(V) \
2395 case V: \
2396 return #V;
2397 switch ((AArch64ISD::NodeType)Opcode) {
2399 break;
2716 }
2717#undef MAKE_CASE
2718 return nullptr;
2719}
2720
2723 MachineBasicBlock *MBB) const {
2724 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2725 // phi node:
2726
2727 // OrigBB:
2728 // [... previous instrs leading to comparison ...]
2729 // b.ne TrueBB
2730 // b EndBB
2731 // TrueBB:
2732 // ; Fallthrough
2733 // EndBB:
2734 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2735
2736 MachineFunction *MF = MBB->getParent();
2737 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2738 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2739 DebugLoc DL = MI.getDebugLoc();
2741
2742 Register DestReg = MI.getOperand(0).getReg();
2743 Register IfTrueReg = MI.getOperand(1).getReg();
2744 Register IfFalseReg = MI.getOperand(2).getReg();
2745 unsigned CondCode = MI.getOperand(3).getImm();
2746 bool NZCVKilled = MI.getOperand(4).isKill();
2747
2748 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2749 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2750 MF->insert(It, TrueBB);
2751 MF->insert(It, EndBB);
2752
2753 // Transfer rest of current basic-block to EndBB
2754 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2755 MBB->end());
2757
2758 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2759 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2760 MBB->addSuccessor(TrueBB);
2761 MBB->addSuccessor(EndBB);
2762
2763 // TrueBB falls through to the end.
2764 TrueBB->addSuccessor(EndBB);
2765
2766 if (!NZCVKilled) {
2767 TrueBB->addLiveIn(AArch64::NZCV);
2768 EndBB->addLiveIn(AArch64::NZCV);
2769 }
2770
2771 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2772 .addReg(IfTrueReg)
2773 .addMBB(TrueBB)
2774 .addReg(IfFalseReg)
2775 .addMBB(MBB);
2776
2777 MI.eraseFromParent();
2778 return EndBB;
2779}
2780
2782 MachineInstr &MI, MachineBasicBlock *BB) const {
2784 BB->getParent()->getFunction().getPersonalityFn())) &&
2785 "SEH does not use catchret!");
2786 return BB;
2787}
2788
2791 MachineBasicBlock *MBB) const {
2792 MachineFunction &MF = *MBB->getParent();
2793 MachineBasicBlock::iterator MBBI = MI.getIterator();
2795 const AArch64InstrInfo &TII =
2796 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2797 Register TargetReg = MI.getOperand(0).getReg();
2799 TII.probedStackAlloc(MBBI, TargetReg, false);
2800
2801 MI.eraseFromParent();
2802 return NextInst->getParent();
2803}
2804
2806AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2808 MachineBasicBlock *BB) const {
2809 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2810 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2811
2812 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2813 MIB.add(MI.getOperand(1)); // slice index register
2814 MIB.add(MI.getOperand(2)); // slice index offset
2815 MIB.add(MI.getOperand(3)); // pg
2816 MIB.add(MI.getOperand(4)); // base
2817 MIB.add(MI.getOperand(5)); // offset
2818
2819 MI.eraseFromParent(); // The pseudo is gone now.
2820 return BB;
2821}
2822
2825 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2827 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2828
2829 MIB.addReg(AArch64::ZA, RegState::Define);
2830 MIB.add(MI.getOperand(0)); // Vector select register
2831 MIB.add(MI.getOperand(1)); // Vector select offset
2832 MIB.add(MI.getOperand(2)); // Base
2833 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2834
2835 MI.eraseFromParent(); // The pseudo is gone now.
2836 return BB;
2837}
2838
2841 unsigned Opcode,
2842 bool Op0IsDef) const {
2843 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2845
2846 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
2847 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
2848 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2849 MIB.add(MI.getOperand(I));
2850
2851 MI.eraseFromParent(); // The pseudo is gone now.
2852 return BB;
2853}
2854
2856AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2858 MachineBasicBlock *BB, bool HasTile) const {
2859 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2860 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2861 unsigned StartIdx = 0;
2862
2863 if (HasTile) {
2864 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2865 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2866 StartIdx = 1;
2867 } else
2868 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
2869
2870 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2871 MIB.add(MI.getOperand(I));
2872
2873 MI.eraseFromParent(); // The pseudo is gone now.
2874 return BB;
2875}
2876
2879 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2881 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2882 MIB.add(MI.getOperand(0)); // Mask
2883
2884 unsigned Mask = MI.getOperand(0).getImm();
2885 for (unsigned I = 0; I < 8; I++) {
2886 if (Mask & (1 << I))
2887 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2888 }
2889
2890 MI.eraseFromParent(); // The pseudo is gone now.
2891 return BB;
2892}
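// Illustrative note (not part of the original source): for ZERO_M_PSEUDO the
// mask operand selects 64-bit ZA tiles, so a mask of 0b00000101, for example,
// makes the loop above add implicit defs of ZAD0 and ZAD2 to the emitted
// ZERO_M.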
2893
2895 MachineInstr &MI, MachineBasicBlock *BB) const {
2896
2897 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
2898 if (SMEOrigInstr != -1) {
2899 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2900 uint64_t SMEMatrixType =
2901 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
2902 switch (SMEMatrixType) {
2903 case (AArch64::SMEMatrixArray):
2904 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
2905 case (AArch64::SMEMatrixTileB):
2906 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
2907 case (AArch64::SMEMatrixTileH):
2908 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
2909 case (AArch64::SMEMatrixTileS):
2910 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
2911 case (AArch64::SMEMatrixTileD):
2912 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
2913 case (AArch64::SMEMatrixTileQ):
2914 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
2915 }
2916 }
2917
2918 switch (MI.getOpcode()) {
2919 default:
2920#ifndef NDEBUG
2921 MI.dump();
2922#endif
2923 llvm_unreachable("Unexpected instruction for custom inserter!");
2924
2925 case AArch64::F128CSEL:
2926 return EmitF128CSEL(MI, BB);
2927 case TargetOpcode::STATEPOINT:
2928 // STATEPOINT is a pseudo instruction which has no implicit defs/uses,
2929 // while the bl call instruction (to which the statepoint is lowered at the
2930 // end) has an implicit def. This def is early-clobber as it will be set at
2931 // the moment of the call and earlier than any use is read.
2932 // Add this implicit dead def here as a workaround.
2933 MI.addOperand(*MI.getMF(),
2934 MachineOperand::CreateReg(
2935 AArch64::LR, /*isDef*/ true,
2936 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2937 /*isUndef*/ false, /*isEarlyClobber*/ true));
2938 [[fallthrough]];
2939 case TargetOpcode::STACKMAP:
2940 case TargetOpcode::PATCHPOINT:
2941 return emitPatchPoint(MI, BB);
2942
2943 case TargetOpcode::PATCHABLE_EVENT_CALL:
2944 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
2945 return BB;
2946
2947 case AArch64::CATCHRET:
2948 return EmitLoweredCatchRet(MI, BB);
2949
2950 case AArch64::PROBED_STACKALLOC_DYN:
2951 return EmitDynamicProbedAlloc(MI, BB);
2952
2953 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2954 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2955 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2956 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2957 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2958 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2959 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2960 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2961 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2962 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2963 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2964 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2965 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2966 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2967 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2968 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2969 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2970 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2971 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
2972 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
2973 case AArch64::LDR_ZA_PSEUDO:
2974 return EmitFill(MI, BB);
2975 case AArch64::LDR_TX_PSEUDO:
2976 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
2977 case AArch64::STR_TX_PSEUDO:
2978 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
2979 case AArch64::ZERO_M_PSEUDO:
2980 return EmitZero(MI, BB);
2981 case AArch64::ZERO_T_PSEUDO:
2982 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
2983 }
2984}
2985
2986//===----------------------------------------------------------------------===//
2987// AArch64 Lowering private implementation.
2988//===----------------------------------------------------------------------===//
2989
2990//===----------------------------------------------------------------------===//
2991// Lowering Code
2992//===----------------------------------------------------------------------===//
2993
2994// Forward declarations of SVE fixed length lowering helpers
2999 SelectionDAG &DAG);
3002 EVT VT);
3003
3004/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3005static bool isZerosVector(const SDNode *N) {
3006 // Look through a bit convert.
3007 while (N->getOpcode() == ISD::BITCAST)
3008 N = N->getOperand(0).getNode();
3009
3010 if (ISD::isConstantSplatVectorAllZeros(N))
3011 return true;
3012
3013 if (N->getOpcode() != AArch64ISD::DUP)
3014 return false;
3015
3016 auto Opnd0 = N->getOperand(0);
3017 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3018}
3019
3020/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3021/// CC
3022static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3023 switch (CC) {
3024 default:
3025 llvm_unreachable("Unknown condition code!");
3026 case ISD::SETNE:
3027 return AArch64CC::NE;
3028 case ISD::SETEQ:
3029 return AArch64CC::EQ;
3030 case ISD::SETGT:
3031 return AArch64CC::GT;
3032 case ISD::SETGE:
3033 return AArch64CC::GE;
3034 case ISD::SETLT:
3035 return AArch64CC::LT;
3036 case ISD::SETLE:
3037 return AArch64CC::LE;
3038 case ISD::SETUGT:
3039 return AArch64CC::HI;
3040 case ISD::SETUGE:
3041 return AArch64CC::HS;
3042 case ISD::SETULT:
3043 return AArch64CC::LO;
3044 case ISD::SETULE:
3045 return AArch64CC::LS;
3046 }
3047}
3048
3049/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3050static void changeFPCCToAArch64CC(ISD::CondCode CC,
3051 AArch64CC::CondCode &CondCode,
3052 AArch64CC::CondCode &CondCode2) {
3053 CondCode2 = AArch64CC::AL;
3054 switch (CC) {
3055 default:
3056 llvm_unreachable("Unknown FP condition!");
3057 case ISD::SETEQ:
3058 case ISD::SETOEQ:
3059 CondCode = AArch64CC::EQ;
3060 break;
3061 case ISD::SETGT:
3062 case ISD::SETOGT:
3063 CondCode = AArch64CC::GT;
3064 break;
3065 case ISD::SETGE:
3066 case ISD::SETOGE:
3067 CondCode = AArch64CC::GE;
3068 break;
3069 case ISD::SETOLT:
3070 CondCode = AArch64CC::MI;
3071 break;
3072 case ISD::SETOLE:
3073 CondCode = AArch64CC::LS;
3074 break;
3075 case ISD::SETONE:
3076 CondCode = AArch64CC::MI;
3077 CondCode2 = AArch64CC::GT;
3078 break;
3079 case ISD::SETO:
3080 CondCode = AArch64CC::VC;
3081 break;
3082 case ISD::SETUO:
3083 CondCode = AArch64CC::VS;
3084 break;
3085 case ISD::SETUEQ:
3086 CondCode = AArch64CC::EQ;
3087 CondCode2 = AArch64CC::VS;
3088 break;
3089 case ISD::SETUGT:
3090 CondCode = AArch64CC::HI;
3091 break;
3092 case ISD::SETUGE:
3093 CondCode = AArch64CC::PL;
3094 break;
3095 case ISD::SETLT:
3096 case ISD::SETULT:
3097 CondCode = AArch64CC::LT;
3098 break;
3099 case ISD::SETLE:
3100 case ISD::SETULE:
3101 CondCode = AArch64CC::LE;
3102 break;
3103 case ISD::SETNE:
3104 case ISD::SETUNE:
3105 CondCode = AArch64CC::NE;
3106 break;
3107 }
3108}
3109
3110/// Convert a DAG fp condition code to an AArch64 CC.
3111/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3112/// should be AND'ed instead of OR'ed.
3113static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3114 AArch64CC::CondCode &CondCode,
3115 AArch64CC::CondCode &CondCode2) {
3116 CondCode2 = AArch64CC::AL;
3117 switch (CC) {
3118 default:
3119 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3120 assert(CondCode2 == AArch64CC::AL);
3121 break;
3122 case ISD::SETONE:
3123 // (a one b)
3124 // == ((a olt b) || (a ogt b))
3125 // == ((a ord b) && (a une b))
3126 CondCode = AArch64CC::VC;
3127 CondCode2 = AArch64CC::NE;
3128 break;
3129 case ISD::SETUEQ:
3130 // (a ueq b)
3131 // == ((a uno b) || (a oeq b))
3132 // == ((a ule b) && (a uge b))
3133 CondCode = AArch64CC::PL;
3134 CondCode2 = AArch64CC::LE;
3135 break;
3136 }
3137}
3138
3139/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3140/// CC usable with the vector instructions. Fewer operations are available
3141/// without a real NZCV register, so we have to use less efficient combinations
3142/// to get the same effect.
3143static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3144 AArch64CC::CondCode &CondCode,
3145 AArch64CC::CondCode &CondCode2,
3146 bool &Invert) {
3147 Invert = false;
3148 switch (CC) {
3149 default:
3150 // Mostly the scalar mappings work fine.
3151 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3152 break;
3153 case ISD::SETUO:
3154 Invert = true;
3155 [[fallthrough]];
3156 case ISD::SETO:
3157 CondCode = AArch64CC::MI;
3158 CondCode2 = AArch64CC::GE;
3159 break;
3160 case ISD::SETUEQ:
3161 case ISD::SETULT:
3162 case ISD::SETULE:
3163 case ISD::SETUGT:
3164 case ISD::SETUGE:
3165 // All of the compare-mask comparisons are ordered, but we can switch
3166 // between the two by a double inversion. E.g. ULE == !OGT.
3167 Invert = true;
3168 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3169 CondCode, CondCode2);
3170 break;
3171 }
3172}
3173
3174static bool isLegalArithImmed(uint64_t C) {
3175 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3176 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3177 LLVM_DEBUG(dbgs() << "Is imm " << C
3178 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3179 return IsLegal;
3180}
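
// Worked example (illustrative sketch only; the helper below is hypothetical
// and not used by the lowering code): legal add/sub immediates are 12-bit
// values, optionally shifted left by 12, which is exactly what the predicate
// above checks.
namespace {
constexpr bool isArithImmedSketch(uint64_t C) {
  return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
}
static_assert(isArithImmedSketch(0xFFFull), "4095 fits in the low 12 bits");
static_assert(isArithImmedSketch(0x123000ull), "a 12-bit value shifted by 12");
static_assert(!isArithImmedSketch(0x1001ull), "needs 13 bits and is not shifted");
} // namespace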
3181
3182// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3183// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3184// can be set differently by this operation. It comes down to whether
3185// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3186// everything is fine. If not then the optimization is wrong. Thus general
3187// comparisons are only valid if op2 != 0.
3188//
3189// So, finally, the only LLVM-native comparisons that don't mention C and V
3190// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
3191// the absence of information about op2.
3192static bool isCMN(SDValue Op, ISD::CondCode CC) {
3193 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3194 (CC == ISD::SETEQ || CC == ISD::SETNE);
3195}
3196
3197static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3198 SelectionDAG &DAG, SDValue Chain,
3199 bool IsSignaling) {
3200 EVT VT = LHS.getValueType();
3201 assert(VT != MVT::f128);
3202
3203 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3204
3205 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3206 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3207 {Chain, LHS});
3208 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3209 {LHS.getValue(1), RHS});
3210 Chain = RHS.getValue(1);
3211 VT = MVT::f32;
3212 }
3213 unsigned Opcode =
3214 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3215 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
3216}
3217
3218static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3219 const SDLoc &dl, SelectionDAG &DAG) {
3220 EVT VT = LHS.getValueType();
3221 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3222
3223 if (VT.isFloatingPoint()) {
3224 assert(VT != MVT::f128);
3225 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3226 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3227 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3228 VT = MVT::f32;
3229 }
3230 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
3231 }
3232
3233 // The CMP instruction is just an alias for SUBS, and representing it as
3234 // SUBS means that it's possible to get CSE with subtract operations.
3235 // A later phase can perform the optimization of setting the destination
3236 // register to WZR/XZR if it ends up being unused.
3237 unsigned Opcode = AArch64ISD::SUBS;
3238
3239 if (isCMN(RHS, CC)) {
3240 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3241 Opcode = AArch64ISD::ADDS;
3242 RHS = RHS.getOperand(1);
3243 } else if (isCMN(LHS, CC)) {
3244 // As we are looking for EQ/NE compares, the operands can be commuted; can
3245 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3246 Opcode = AArch64ISD::ADDS;
3247 LHS = LHS.getOperand(1);
3248 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3249 if (LHS.getOpcode() == ISD::AND) {
3250 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3251 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3252 // of the signed comparisons.
3253 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3254 DAG.getVTList(VT, MVT_CC),
3255 LHS.getOperand(0),
3256 LHS.getOperand(1));
3257 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3258 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3259 return ANDSNode.getValue(1);
3260 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3261 // Use result of ANDS
3262 return LHS.getValue(1);
3263 }
3264 }
3265
3266 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3267 .getValue(1);
3268}
3269
3270/// \defgroup AArch64CCMP CMP;CCMP matching
3271///
3272/// These functions deal with the formation of CMP;CCMP;... sequences.
3273/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3274/// a comparison. They set the NZCV flags to a predefined value if their
3275/// predicate is false. This allows us to express arbitrary conjunctions, for
3276/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3277/// expressed as:
3278/// cmp A
3279/// ccmp B, inv(CB), CA
3280/// check for CB flags
3281///
3282/// This naturally lets us implement chains of AND operations with SETCC
3283/// operands. And we can even implement some other situations by transforming
3284/// them:
3285/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3286/// negating the flags used in a CCMP/FCCMP operation.
3287/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3288/// by negating the flags we test for afterwards. i.e.
3289/// NEG (CMP CCMP CCCMP ...) can be implemented.
3290/// - Note that we can only ever negate all previously processed results.
3291/// What we can not implement by flipping the flags to test is a negation
3292/// of two sub-trees (because the negation affects all sub-trees emitted so
3293/// far, so the 2nd sub-tree we emit would also affect the first).
3294/// With those tools we can implement some OR operations:
3295/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3296/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3297/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3298/// elimination rules from earlier to implement the whole thing as a
3299/// CCMP/FCCMP chain.
3300///
3301/// As complete example:
3302/// or (or (setCA (cmp A)) (setCB (cmp B)))
3303/// (and (setCC (cmp C)) (setCD (cmp D)))"
3304/// can be reassociated to:
3305/// or (and (setCC (cmp C)) (setCD (cmp D)))
3306/// (or (setCA (cmp A)) (setCB (cmp B)))
3307/// can be transformed to:
3308/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3309/// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
3310/// which can be implemented as:
3311/// cmp C
3312/// ccmp D, inv(CD), CC
3313/// ccmp A, CA, inv(CD)
3314/// ccmp B, CB, inv(CA)
3315/// check for CB flags
3316///
3317/// A counterexample is "or (and A B) (and C D)" which translates to
3318/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3319/// can only implement 1 of the inner (not) operations, but not both!
3320/// @{
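/// As a minimal concrete sketch (register choice and exact scheduling are
/// illustrative, not taken from this file), the C-level condition
/// "a == 0 && b > 7" on 32-bit values can be emitted as:
///   cmp  w0, #0
///   ccmp w1, #7, #4, eq   // if eq holds, compare b with 7; else force Z=1
///   b.gt <taken>          // GT is false on the forced flags, as required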
3321
3322/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3323static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3324 ISD::CondCode CC, SDValue CCOp,
3325 AArch64CC::CondCode Predicate,
3326 AArch64CC::CondCode OutCC,
3327 const SDLoc &DL, SelectionDAG &DAG) {
3328 unsigned Opcode = 0;
3329 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3330
3331 if (LHS.getValueType().isFloatingPoint()) {
3332 assert(LHS.getValueType() != MVT::f128);
3333 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3334 LHS.getValueType() == MVT::bf16) {
3335 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3336 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3337 }
3338 Opcode = AArch64ISD::FCCMP;
3339 } else if (RHS.getOpcode() == ISD::SUB) {
3340 SDValue SubOp0 = RHS.getOperand(0);
3341 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3342 // See emitComparison() on why we can only do this for SETEQ and SETNE.
3343 Opcode = AArch64ISD::CCMN;
3344 RHS = RHS.getOperand(1);
3345 }
3346 }
3347 if (Opcode == 0)
3348 Opcode = AArch64ISD::CCMP;
3349
3350 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3351 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3352 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3353 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3354 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3355}
3356
3357/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3358/// expressed as a conjunction. See \ref AArch64CCMP.
3359/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3360/// changing the conditions on the SETCC tests.
3361/// (this means we can call emitConjunctionRec() with
3362/// Negate==true on this sub-tree)
3363/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3364/// cannot do the negation naturally. We are required to
3365/// emit the subtree first in this case.
3366/// \param WillNegate Is true if we are called when the result of this
3367/// subexpression must be negated. This happens when the
3368/// outer expression is an OR. We can use this fact to know
3369/// that we have a double negation (or (or ...) ...) that
3370/// can be implemented for free.
3371static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3372 bool &MustBeFirst, bool WillNegate,
3373 unsigned Depth = 0) {
3374 if (!Val.hasOneUse())
3375 return false;
3376 unsigned Opcode = Val->getOpcode();
3377 if (Opcode == ISD::SETCC) {
3378 if (Val->getOperand(0).getValueType() == MVT::f128)
3379 return false;
3380 CanNegate = true;
3381 MustBeFirst = false;
3382 return true;
3383 }
3384 // Protect against exponential runtime and stack overflow.
3385 if (Depth > 6)
3386 return false;
3387 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3388 bool IsOR = Opcode == ISD::OR;
3389 SDValue O0 = Val->getOperand(0);
3390 SDValue O1 = Val->getOperand(1);
3391 bool CanNegateL;
3392 bool MustBeFirstL;
3393 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3394 return false;
3395 bool CanNegateR;
3396 bool MustBeFirstR;
3397 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3398 return false;
3399
3400 if (MustBeFirstL && MustBeFirstR)
3401 return false;
3402
3403 if (IsOR) {
3404 // For an OR expression we need to be able to naturally negate at least
3405 // one side or we cannot do the transformation at all.
3406 if (!CanNegateL && !CanNegateR)
3407 return false;
3408 // If the result of the OR will be negated and we can naturally negate
3409 // the leaves, then this sub-tree as a whole negates naturally.
3410 CanNegate = WillNegate && CanNegateL && CanNegateR;
3411 // If we cannot naturally negate the whole sub-tree, then this must be
3412 // emitted first.
3413 MustBeFirst = !CanNegate;
3414 } else {
3415 assert(Opcode == ISD::AND && "Must be OR or AND");
3416 // We cannot naturally negate an AND operation.
3417 CanNegate = false;
3418 MustBeFirst = MustBeFirstL || MustBeFirstR;
3419 }
3420 return true;
3421 }
3422 return false;
3423}
3424
3425/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3426/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
3427/// Tries to transform the given i1 producing node @p Val to a series of compare
3428/// and conditional compare operations. @returns an NZCV flags producing node
3429/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
3430/// the transformation was not possible.
3431/// \p Negate is true if we want this sub-tree to be negated just by changing
3432/// SETCC conditions.
3433static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3434 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3435 AArch64CC::CondCode Predicate) {
3436 // We're at a tree leaf, produce a conditional comparison operation.
3437 unsigned Opcode = Val->getOpcode();
3438 if (Opcode == ISD::SETCC) {
3439 SDValue LHS = Val->getOperand(0);
3440 SDValue RHS = Val->getOperand(1);
3441 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3442 bool isInteger = LHS.getValueType().isInteger();
3443 if (Negate)
3444 CC = getSetCCInverse(CC, LHS.getValueType());
3445 SDLoc DL(Val);
3446 // Determine OutCC and handle FP special case.
3447 if (isInteger) {
3448 OutCC = changeIntCCToAArch64CC(CC);
3449 } else {
3450 assert(LHS.getValueType().isFloatingPoint());
3451 AArch64CC::CondCode ExtraCC;
3452 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3453 // Some floating point conditions can't be tested with a single condition
3454 // code. Construct an additional comparison in this case.
3455 if (ExtraCC != AArch64CC::AL) {
3456 SDValue ExtraCmp;
3457 if (!CCOp.getNode())
3458 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3459 else
3460 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3461 ExtraCC, DL, DAG);
3462 CCOp = ExtraCmp;
3463 Predicate = ExtraCC;
3464 }
3465 }
3466
3467 // Produce a normal comparison if we are first in the chain
3468 if (!CCOp)
3469 return emitComparison(LHS, RHS, CC, DL, DAG);
3470 // Otherwise produce a ccmp.
3471 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3472 DAG);
3473 }
3474 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3475
3476 bool IsOR = Opcode == ISD::OR;
3477
3478 SDValue LHS = Val->getOperand(0);
3479 bool CanNegateL;
3480 bool MustBeFirstL;
3481 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3482 assert(ValidL && "Valid conjunction/disjunction tree");
3483 (void)ValidL;
3484
3485 SDValue RHS = Val->getOperand(1);
3486 bool CanNegateR;
3487 bool MustBeFirstR;
3488 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3489 assert(ValidR && "Valid conjunction/disjunction tree");
3490 (void)ValidR;
3491
3492 // Swap sub-tree that must come first to the right side.
3493 if (MustBeFirstL) {
3494 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3495 std::swap(LHS, RHS);
3496 std::swap(CanNegateL, CanNegateR);
3497 std::swap(MustBeFirstL, MustBeFirstR);
3498 }
3499
3500 bool NegateR;
3501 bool NegateAfterR;
3502 bool NegateL;
3503 bool NegateAfterAll;
3504 if (Opcode == ISD::OR) {
3505 // Swap the sub-tree that we can negate naturally to the left.
3506 if (!CanNegateL) {
3507 assert(CanNegateR && "at least one side must be negatable");
3508 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3509 assert(!Negate);
3510 std::swap(LHS, RHS);
3511 NegateR = false;
3512 NegateAfterR = true;
3513 } else {
3514 // Negate the left sub-tree if possible, otherwise negate the result.
3515 NegateR = CanNegateR;
3516 NegateAfterR = !CanNegateR;
3517 }
3518 NegateL = true;
3519 NegateAfterAll = !Negate;
3520 } else {
3521 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3522 assert(!Negate && "Valid conjunction/disjunction tree");
3523
3524 NegateL = false;
3525 NegateR = false;
3526 NegateAfterR = false;
3527 NegateAfterAll = false;
3528 }
3529
3530 // Emit sub-trees.
3531 AArch64CC::CondCode RHSCC;
3532 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3533 if (NegateAfterR)
3534 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3535 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3536 if (NegateAfterAll)
3537 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3538 return CmpL;
3539}
3540
3541/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
3542/// In some cases this is even possible with OR operations in the expression.
3543/// See \ref AArch64CCMP.
3544/// \see emitConjunctionRec().
3545static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3546 AArch64CC::CondCode &OutCC) {
3547 bool DummyCanNegate;
3548 bool DummyMustBeFirst;
3549 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3550 return SDValue();
3551
3552 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3553}
3554
3555/// @}
3556
3557/// Returns how profitable it is to fold a comparison's operand's shift and/or
3558/// extension operations.
3559static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3560 auto isSupportedExtend = [&](SDValue V) {
3561 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3562 return true;
3563
3564 if (V.getOpcode() == ISD::AND)
3565 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3566 uint64_t Mask = MaskCst->getZExtValue();
3567 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3568 }
3569
3570 return false;
3571 };
3572
3573 if (!Op.hasOneUse())
3574 return 0;
3575
3576 if (isSupportedExtend(Op))
3577 return 1;
3578
3579 unsigned Opc = Op.getOpcode();
3580 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3581 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3582 uint64_t Shift = ShiftCst->getZExtValue();
3583 if (isSupportedExtend(Op.getOperand(0)))
3584 return (Shift <= 4) ? 2 : 1;
3585 EVT VT = Op.getValueType();
3586 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3587 return 1;
3588 }
3589
3590 return 0;
3591}
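
// For example (sketch; registers are illustrative): with Op =
// (shl (and X, 0xFFFF), 2) both the zero extension and the small shift can be
// folded into the compare operand ("cmp w1, w0, uxth #2"), so the function
// above reports a profit of 2; a bare supported extend reports 1 and anything
// else 0.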
3592
3593static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3594 SDValue &AArch64cc, SelectionDAG &DAG,
3595 const SDLoc &dl) {
3596 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3597 EVT VT = RHS.getValueType();
3598 uint64_t C = RHSC->getZExtValue();
3599 if (!isLegalArithImmed(C)) {
3600 // Constant does not fit, try adjusting it by one?
3601 switch (CC) {
3602 default:
3603 break;
3604 case ISD::SETLT:
3605 case ISD::SETGE:
3606 if ((VT == MVT::i32 && C != 0x80000000 &&
3607 isLegalArithImmed((uint32_t)(C - 1))) ||
3608 (VT == MVT::i64 && C != 0x80000000ULL &&
3609 isLegalArithImmed(C - 1ULL))) {
3610 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3611 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3612 RHS = DAG.getConstant(C, dl, VT);
3613 }
3614 break;
3615 case ISD::SETULT:
3616 case ISD::SETUGE:
3617 if ((VT == MVT::i32 && C != 0 &&
3618 isLegalArithImmed((uint32_t)(C - 1))) ||
3619 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3620 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3621 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3622 RHS = DAG.getConstant(C, dl, VT);
3623 }
3624 break;
3625 case ISD::SETLE:
3626 case ISD::SETGT:
3627 if ((VT == MVT::i32 && C != INT32_MAX &&
3628 isLegalArithImmed((uint32_t)(C + 1))) ||
3629 (VT == MVT::i64 && C != INT64_MAX &&
3630 isLegalArithImmed(C + 1ULL))) {
3631 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3632 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3633 RHS = DAG.getConstant(C, dl, VT);
3634 }
3635 break;
3636 case ISD::SETULE:
3637 case ISD::SETUGT:
3638 if ((VT == MVT::i32 && C != UINT32_MAX &&
3639 isLegalArithImmed((uint32_t)(C + 1))) ||
3640 (VT == MVT::i64 && C != UINT64_MAX &&
3641 isLegalArithImmed(C + 1ULL))) {
3642 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3643 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3644 RHS = DAG.getConstant(C, dl, VT);
3645 }
3646 break;
3647 }
3648 }
3649 }
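  // Worked example of the adjustment above (sketch): for "x s< 0x1001" the
  // immediate 0x1001 is not encodable, but rewriting it as "x s<= 0x1000"
  // (SETLT -> SETLE, C -> C - 1) gives "cmp x0, #1, lsl #12", which is.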
3650
3651 // Comparisons are canonicalized so that the RHS operand is simpler than the
3652 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3653 // can fold some shift+extend operations on the RHS operand, so swap the
3654 // operands if that can be done.
3655 //
3656 // For example:
3657 // lsl w13, w11, #1
3658 // cmp w13, w12
3659 // can be turned into:
3660 // cmp w12, w11, lsl #1
3661 if (!isa<ConstantSDNode>(RHS) || !isLegalArithImmed(RHS->getAsZExtVal())) {
3662 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3663
3664 if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
3665 std::swap(LHS, RHS);
3666 CC = ISD::getSetCCSwappedOperands(CC);
3667 }
3668 }
3669
3670 SDValue Cmp;
3671 AArch64CC::CondCode AArch64CC;
3672 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3673 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3674
3675 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3676 // For the i8 operand, the largest immediate is 255, so this can be easily
3677 // encoded in the compare instruction. For the i16 operand, however, the
3678 // largest immediate cannot be encoded in the compare.
3679 // Therefore, use a sign extending load and cmn to avoid materializing the
3680 // -1 constant. For example,
3681 // movz w1, #65535
3682 // ldrh w0, [x0, #0]
3683 // cmp w0, w1
3684 // >
3685 // ldrsh w0, [x0, #0]
3686 // cmn w0, #1
3687 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3688 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3689 // ensure both the LHS and RHS are truly zero extended and to make sure the
3690 // transformation is profitable.
3691 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3692 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3693 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3694 LHS.getNode()->hasNUsesOfValue(1, 0)) {
3695 int16_t ValueofRHS = RHS->getAsZExtVal();
3696 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3697 SDValue SExt =
3698 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3699 DAG.getValueType(MVT::i16));
3700 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3701 RHS.getValueType()),
3702 CC, dl, DAG);
3703 AArch64CC = changeIntCCToAArch64CC(CC);
3704 }
3705 }
3706
3707 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3708 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3709 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3710 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3711 }
3712 }
3713 }
3714
3715 if (!Cmp) {
3716 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3717 AArch64CC = changeIntCCToAArch64CC(CC);
3718 }
3719 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3720 return Cmp;
3721}
3722
3723static std::pair<SDValue, SDValue>
3724getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3725 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3726 "Unsupported value type");
3727 SDValue Value, Overflow;
3728 SDLoc DL(Op);
3729 SDValue LHS = Op.getOperand(0);
3730 SDValue RHS = Op.getOperand(1);
3731 unsigned Opc = 0;
3732 switch (Op.getOpcode()) {
3733 default:
3734 llvm_unreachable("Unknown overflow instruction!");
3735 case ISD::SADDO:
3736 Opc = AArch64ISD::ADDS;
3737 CC = AArch64CC::VS;
3738 break;
3739 case ISD::UADDO:
3740 Opc = AArch64ISD::ADDS;
3741 CC = AArch64CC::HS;
3742 break;
3743 case ISD::SSUBO:
3744 Opc = AArch64ISD::SUBS;
3745 CC = AArch64CC::VS;
3746 break;
3747 case ISD::USUBO:
3748 Opc = AArch64ISD::SUBS;
3749 CC = AArch64CC::LO;
3750 break;
3751 // Multiply needs a little bit of extra work.
3752 case ISD::SMULO:
3753 case ISD::UMULO: {
3754 CC = AArch64CC::NE;
3755 bool IsSigned = Op.getOpcode() == ISD::SMULO;
3756 if (Op.getValueType() == MVT::i32) {
3757 // Extend to 64-bits, then perform a 64-bit multiply.
3758 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3759 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3760 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3761 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3762 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3763
3764 // Check that the result fits into a 32-bit integer.
3765 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3766 if (IsSigned) {
3767 // cmp xreg, wreg, sxtw
3768 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3769 Overflow =
3770 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3771 } else {
3772 // tst xreg, #0xffffffff00000000
3773 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3774 Overflow =
3775 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3776 }
3777 break;
3778 }
3779 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3780 // For the 64-bit multiply, compute the full product and check the high bits.
3781 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3782 if (IsSigned) {
3783 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3784 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3785 DAG.getConstant(63, DL, MVT::i64));
3786 // It is important that LowerBits is last, otherwise the arithmetic
3787 // shift will not be folded into the compare (SUBS).
3788 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3789 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3790 .getValue(1);
3791 } else {
3792 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3793 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3794 Overflow =
3795 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3796 DAG.getConstant(0, DL, MVT::i64),
3797 UpperBits).getValue(1);
3798 }
3799 break;
3800 }
3801 } // switch (...)
3802
3803 if (Opc) {
3804 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3805
3806 // Emit the AArch64 operation with overflow check.
3807 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3808 Overflow = Value.getValue(1);
3809 }
3810 return std::make_pair(Value, Overflow);
3811}
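
// Scalar model of the i32 smul.with.overflow check above (illustrative sketch;
// the helper is hypothetical and unused): the product is formed in 64 bits and
// overflow means it does not fit back into 32 bits, which is what the SUBS
// against the sign-extended low half tests.
namespace {
constexpr bool smulo32Sketch(int32_t A, int32_t B) {
  int64_t Mul = int64_t(A) * int64_t(B);
  // Equivalent to "Mul != sext(trunc(Mul))".
  return Mul < INT32_MIN || Mul > INT32_MAX;
}
static_assert(!smulo32Sketch(46341, 46340), "2147441940 still fits in i32");
static_assert(smulo32Sketch(46341, 46341), "2147488281 overflows i32");
} // namespace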
3812
3813SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3814 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
3815 !Subtarget->isNeonAvailable()))
3816 return LowerToScalableOp(Op, DAG);
3817
3818 SDValue Sel = Op.getOperand(0);
3819 SDValue Other = Op.getOperand(1);
3820 SDLoc dl(Sel);
3821
3822 // If the operand is an overflow checking operation, invert the condition
3823 // code and kill the Not operation. I.e., transform:
3824 // (xor (overflow_op_bool, 1))
3825 // -->
3826 // (csel 1, 0, invert(cc), overflow_op_bool)
3827 // ... which later gets transformed to just a cset instruction with an
3828 // inverted condition code, rather than a cset + eor sequence.
3829 if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
3830 // Only lower legal XALUO ops.
3831 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3832 return SDValue();
3833
3834 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3835 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3836 AArch64CC::CondCode CC;
3837 SDValue Value, Overflow;
3838 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3839 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3840 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3841 CCVal, Overflow);
3842 }
3843 // If neither operand is a SELECT_CC, give up.
3844 if (Sel.getOpcode() != ISD::SELECT_CC)
3845 std::swap(Sel, Other);
3846 if (Sel.getOpcode() != ISD::SELECT_CC)
3847 return Op;
3848
3849 // The folding we want to perform is:
3850 // (xor x, (select_cc a, b, cc, 0, -1) )
3851 // -->
3852 // (csel x, (xor x, -1), cc ...)
3853 //
3854 // The latter will get matched to a CSINV instruction.
3855
3856 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3857 SDValue LHS = Sel.getOperand(0);
3858 SDValue RHS = Sel.getOperand(1);
3859 SDValue TVal = Sel.getOperand(2);
3860 SDValue FVal = Sel.getOperand(3);
3861
3862 // FIXME: This could be generalized to non-integer comparisons.
3863 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3864 return Op;
3865
3866 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3867 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3868
3869 // The values aren't constants, this isn't the pattern we're looking for.
3870 if (!CFVal || !CTVal)
3871 return Op;
3872
3873 // We can commute the SELECT_CC by inverting the condition. This
3874 // might be needed to make this fit into a CSINV pattern.
3875 if (CTVal->isAllOnes() && CFVal->isZero()) {
3876 std::swap(TVal, FVal);
3877 std::swap(CTVal, CFVal);
3878 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3879 }
3880
3881 // If the constants line up, perform the transform!
3882 if (CTVal->isZero() && CFVal->isAllOnes()) {
3883 SDValue CCVal;
3884 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3885
3886 FVal = Other;
3887 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3888 DAG.getConstant(-1ULL, dl, Other.getValueType()));
3889
3890 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3891 CCVal, Cmp);
3892 }
3893
3894 return Op;
3895}
3896
3897// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
3898// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
3899// sets 'C' bit to 0.
3900static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
3901 SDLoc DL(Value);
3902 EVT VT = Value.getValueType();
3903 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
3904 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
3905 SDValue Cmp =
3906 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
3907 return Cmp.getValue(1);
3908}
3909
3910// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
3911// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
3912static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
3913 bool Invert) {
3914 assert(Glue.getResNo() == 1);
3915 SDLoc DL(Glue);
3916 SDValue Zero = DAG.getConstant(0, DL, VT);
3917 SDValue One = DAG.getConstant(1, DL, VT);
3918 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
3919 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
3920 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3921}
3922
3923// Value is 1 if 'V' bit of NZCV is 1, else 0
3924static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
3925 assert(Glue.getResNo() == 1);
3926 SDLoc DL(Glue);
3927 SDValue Zero = DAG.getConstant(0, DL, VT);
3928 SDValue One = DAG.getConstant(1, DL, VT);
3929 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
3930 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3931}
3932
3933// This lowering is inefficient, but it will get cleaned up by
3934// `foldOverflowCheck`
3935static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
3936 unsigned Opcode, bool IsSigned) {
3937 EVT VT0 = Op.getValue(0).getValueType();
3938 EVT VT1 = Op.getValue(1).getValueType();
3939
3940 if (VT0 != MVT::i32 && VT0 != MVT::i64)
3941 return SDValue();
3942
3943 bool InvertCarry = Opcode == AArch64ISD::SBCS;
3944 SDValue OpLHS = Op.getOperand(0);
3945 SDValue OpRHS = Op.getOperand(1);
3946 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
3947
3948 SDLoc DL(Op);
3949 SDVTList VTs = DAG.getVTList(VT0, VT1);
3950
3951 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
3952 OpRHS, OpCarryIn);
3953
3954 SDValue OutFlag =
3955 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
3956 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
3957
3958 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
3959}
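
// Scalar model of the AArch64 borrow convention handled above (illustrative
// sketch; hypothetical helper): for subtraction the carry flag is an inverted
// borrow, so SBCS computes A - B - (1 - C). That is why the incoming IR carry
// bit is inverted for SBCS (InvertCarry), both when it is consumed and when
// the resulting carry flag is converted back to a value in the unsigned case.
namespace {
constexpr uint32_t sbcsSketch(uint32_t A, uint32_t B, bool CarryFlag) {
  return A - B - (CarryFlag ? 0u : 1u);
}
static_assert(sbcsSketch(5, 3, /*CarryFlag=*/true) == 2, "C set: no borrow");
static_assert(sbcsSketch(5, 3, /*CarryFlag=*/false) == 1, "C clear: borrow taken");
} // namespace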
3960
3961static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
3962 // Let legalize expand this if it isn't a legal type yet.
3963 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3964 return SDValue();
3965
3966 SDLoc dl(Op);
3967 AArch64CC::CondCode CC;
3968 // The actual operation that sets the overflow or carry flag.
3969 SDValue Value, Overflow;
3970 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3971
3972 // We use 0 and 1 as false and true values.
3973 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3974 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3975
3976 // We use an inverted condition, because the conditional select is inverted
3977 // too. This will allow it to be selected to a single instruction:
3978 // CSINC Wd, WZR, WZR, invert(cond).
3979 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3980 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3981 CCVal, Overflow);
3982
3983 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3984 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3985}
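
// For example (illustrative sketch; register names are arbitrary): an
// @llvm.sadd.with.overflow.i32 lowered through the code above becomes roughly:
//   adds w8, w0, w1      // the value, plus NZCV
//   cset w9, vs          // the overflow bit (a CSINC with the inverted cond,
//                        // matching the comment above)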
3986
3987// Prefetch operands are:
3988// 1: Address to prefetch
3989// 2: bool isWrite
3990// 3: int locality (0 = no locality ... 3 = extreme locality)
3991// 4: bool isDataCache
3992static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
3993 SDLoc DL(Op);
3994 unsigned IsWrite = Op.getConstantOperandVal(2);
3995 unsigned Locality = Op.getConstantOperandVal(3);
3996 unsigned IsData = Op.getConstantOperandVal(4);
3997
3998 bool IsStream = !Locality;
3999 // When the locality number is set
4000 if (Locality) {
4001 // The front-end should have filtered out the out-of-range values
4002 assert(Locality <= 3 && "Prefetch locality out-of-range");
4003 // The locality degree is the opposite of the cache speed.
4004 // Put the number the other way around.
4005 // The encoding starts at 0 for level 1
4006 Locality = 3 - Locality;
4007 }
4008
4009 // Build the mask value encoding the expected behavior.
4010 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4011 (!IsData << 3) | // IsDataCache bit
4012 (Locality << 1) | // Cache level bits
4013 (unsigned)IsStream; // Stream bit
4014 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4015 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4016 Op.getOperand(1));
4017}
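
// Worked encoding example (illustrative sketch; the helper below is
// hypothetical and simply mirrors the bit layout built above):
// __builtin_prefetch(p, /*rw=*/1, /*locality=*/3) has IsWrite=1, Locality=3,
// IsData=1, so IsStream=0 and Locality becomes 0, giving PrfOp = 16
// (PSTL1KEEP); a streaming read, __builtin_prefetch(p, 0, 0), gives PrfOp = 1
// (PLDL1STRM).
namespace {
constexpr unsigned prfOpSketch(bool IsWrite, unsigned Locality, bool IsData) {
  bool IsStream = Locality == 0;
  unsigned Level = Locality ? 3 - Locality : 0;
  return (unsigned(IsWrite) << 4) | (unsigned(!IsData) << 3) | (Level << 1) |
         unsigned(IsStream);
}
static_assert(prfOpSketch(true, 3, true) == 16, "PSTL1KEEP");
static_assert(prfOpSketch(false, 0, true) == 1, "PLDL1STRM");
} // namespace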
4018
4019SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4020 SelectionDAG &DAG) const {
4021 EVT VT = Op.getValueType();
4022 if (VT.isScalableVector())
4023 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4024
4025 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4026 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4027
4028 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4029 return SDValue();
4030}
4031
4032SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4033 SelectionDAG &DAG) const {
4034 EVT VT = Op.getValueType();
4035 if (VT.isScalableVector())
4036 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4037
4038 bool IsStrict = Op->isStrictFPOpcode();
4039 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4040 EVT SrcVT = SrcVal.getValueType();
4041 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4042
4043 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4044 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4045
4046 // Expand cases where the result type is BF16 but we don't have hardware
4047 // instructions to lower it.
4048 if (VT.getScalarType() == MVT::bf16 &&
4049 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4050 Subtarget->hasBF16())) {
4051 SDLoc dl(Op);
4052 SDValue Narrow = SrcVal;
4053 SDValue NaN;
4054 EVT I32 = SrcVT.changeElementType(MVT::i32);
4055 EVT F32 = SrcVT.changeElementType(MVT::f32);
4056 if (SrcVT.getScalarType() == MVT::f32) {
4057 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4058 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4059 if (!NeverSNaN) {
4060 // Set the quiet bit.
4061 NaN = DAG.getNode(ISD::OR, dl, I32, Narrow,
4062 DAG.getConstant(0x400000, dl, I32));
4063 }
4064 } else if (SrcVT.getScalarType() == MVT::f64) {
4065 Narrow = DAG.getNode(AArch64ISD::FCVTXN, dl, F32, Narrow);
4066 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4067 } else {
4068 return SDValue();
4069 }
4070 if (!Trunc) {
4071 SDValue One = DAG.getConstant(1, dl, I32);
4072 SDValue Lsb = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4073 DAG.getShiftAmountConstant(16, I32, dl));
4074 Lsb = DAG.getNode(ISD::AND, dl, I32, Lsb, One);
4075 SDValue RoundingBias =
4076 DAG.getNode(ISD::ADD, dl, I32, DAG.getConstant(0x7fff, dl, I32), Lsb);
4077 Narrow = DAG.getNode(ISD::ADD, dl, I32, Narrow, RoundingBias);
4078 }
4079
4080 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4081 // 0x80000000.
4082 if (NaN) {
4083 SDValue IsNaN = DAG.getSetCC(
4084 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4085 SrcVal, SrcVal, ISD::SETUO);
4086 Narrow = DAG.getSelect(dl, I32, IsNaN, NaN, Narrow);
4087 }
4088
4089 // Now that we have rounded, shift the bits into position.
4090 Narrow = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4091 DAG.getShiftAmountConstant(16, I32, dl));
4092 if (VT.isVector()) {
4093 EVT I16 = I32.changeVectorElementType(MVT::i16);
4094 Narrow = DAG.getNode(ISD::TRUNCATE, dl, I16, Narrow);
4095 return DAG.getNode(ISD::BITCAST, dl, VT, Narrow);
4096 }
4097 Narrow = DAG.getNode(ISD::BITCAST, dl, F32, Narrow);
4098 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
4099 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, dl)
4100 : Result;
4101 }
4102
4103 if (SrcVT != MVT::f128) {
4104 // Expand cases where the input is a vector bigger than NEON.
4105 if (useSVEForFixedLengthVectorVT(SrcVT))
4106 return SDValue();
4107
4108 // It's legal except when f128 is involved
4109 return Op;
4110 }
4111
4112 return SDValue();
4113}
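
// Scalar model of the f32 -> bf16 path emitted above (illustrative sketch;
// hypothetical helper): round to nearest even by adding 0x7FFF plus the lowest
// bit that will be kept, then shift out the low 16 bits; NaNs are quieted
// instead of rounded so the rounding add cannot corrupt the NaN payload.
namespace {
constexpr uint16_t fp32ToBF16Sketch(uint32_t Bits, bool IsNaN) {
  if (IsNaN)
    return uint16_t((Bits | 0x400000u) >> 16); // just set the quiet bit
  uint32_t Lsb = (Bits >> 16) & 1u;
  return uint16_t((Bits + 0x7FFFu + Lsb) >> 16);
}
static_assert(fp32ToBF16Sketch(0x3F800000u, false) == 0x3F80, "1.0f -> 1.0bf16");
static_assert(fp32ToBF16Sketch(0x3F808000u, false) == 0x3F80,
              "ties round to even");
} // namespace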
4114
4115SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4116 SelectionDAG &DAG) const {
4117 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4118 // Any additional optimization in this function should be recorded
4119 // in the cost tables.
4120 bool IsStrict = Op->isStrictFPOpcode();
4121 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4122 EVT VT = Op.getValueType();
4123
4124 if (VT.isScalableVector()) {
4125 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4126 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4127 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4128 return LowerToPredicatedOp(Op, DAG, Opcode);
4129 }
4130
4131 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4132 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4133 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4134
4135 unsigned NumElts = InVT.getVectorNumElements();
4136
4137 // f16 conversions are promoted to f32 when full fp16 is not supported.
4138 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4139 InVT.getVectorElementType() == MVT::bf16) {
4140 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4141 SDLoc dl(Op);
4142 if (IsStrict) {
4143 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4144 {Op.getOperand(0), Op.getOperand(1)});
4145 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4146 {Ext.getValue(1), Ext.getValue(0)});
4147 }
4148 return DAG.getNode(
4149 Op.getOpcode(), dl, Op.getValueType(),
4150 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
4151 }
4152
4153 uint64_t VTSize = VT.getFixedSizeInBits();
4154 uint64_t InVTSize = InVT.getFixedSizeInBits();
4155 if (VTSize < InVTSize) {
4156 SDLoc dl(Op);
4157 if (IsStrict) {
4158 InVT = InVT.changeVectorElementTypeToInteger();
4159 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4160 {Op.getOperand(0), Op.getOperand(1)});
4161 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4162 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
4163 }
4164 SDValue Cv =
4165 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
4166 Op.getOperand(0));
4167 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4168 }
4169
4170 if (VTSize > InVTSize) {
4171 SDLoc dl(Op);
4172 MVT ExtVT =
4173 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
4174 VT.getVectorNumElements());
4175 if (IsStrict) {
4176 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4177 {Op.getOperand(0), Op.getOperand(1)});
4178 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4179 {Ext.getValue(1), Ext.getValue(0)});
4180 }
4181 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
4182 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
4183 }
4184
4185 // Use a scalar operation for conversions between single-element vectors of
4186 // the same size.
4187 if (NumElts == 1) {
4188 SDLoc dl(Op);
4189 SDValue Extract = DAG.getNode(
4190 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4191 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4192 EVT ScalarVT = VT.getScalarType();
4193 if (IsStrict)
4194 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4195 {Op.getOperand(0), Extract});
4196 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4197 }
4198
4199 // Type changing conversions are illegal.
4200 return Op;
4201}
4202
4203SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4204 SelectionDAG &DAG) const {
4205 bool IsStrict = Op->isStrictFPOpcode();
4206 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4207
4208 if (SrcVal.getValueType().isVector())
4209 return LowerVectorFP_TO_INT(Op, DAG);
4210
4211 // f16 conversions are promoted to f32 when full fp16 is not supported.
4212 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4213 SrcVal.getValueType() == MVT::bf16) {
4214 SDLoc dl(Op);
4215 if (IsStrict) {
4216 SDValue Ext =
4217 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4218 {Op.getOperand(0), SrcVal});
4219 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4220 {Ext.getValue(1), Ext.getValue(0)});
4221 }
4222 return DAG.getNode(
4223 Op.getOpcode(), dl, Op.getValueType(),
4224 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4225 }
4226
4227 if (SrcVal.getValueType() != MVT::f128) {
4228 // It's legal except when f128 is involved
4229 return Op;
4230 }
4231
4232 return SDValue();
4233}
4234
4235SDValue
4236AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4237 SelectionDAG &DAG) const {
4238 // AArch64 FP-to-int conversions saturate to the destination element size, so
4239 // we can lower common saturating conversions to simple instructions.
4240 SDValue SrcVal = Op.getOperand(0);
4241 EVT SrcVT = SrcVal.getValueType();
4242 EVT DstVT = Op.getValueType();
4243 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4244
4245 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4246 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4247 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4248 assert(SatWidth <= DstElementWidth &&
4249 "Saturation width cannot exceed result width");
4250
4251 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4252 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4253 // types, so this is hard to reach.
4254 if (DstVT.isScalableVector())
4255 return SDValue();
4256
4257 EVT SrcElementVT = SrcVT.getVectorElementType();
4258
4259 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4260 if ((SrcElementVT == MVT::f16 &&
4261 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4262 SrcElementVT == MVT::bf16) {
4263 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4264 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
4265 SrcVT = F32VT;
4266 SrcElementVT = MVT::f32;
4267 SrcElementWidth = 32;
4268 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4269 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4270 return SDValue();
4271
4272 SDLoc DL(Op);
4273 // Cases that we can emit directly.
4274 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
4275 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4276 DAG.getValueType(DstVT.getScalarType()));
4277
4278 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4279 // result. This is only valid if the legal cvt is larger than the saturate
4280 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4281 // (at least until sqxtn is selected).
4282 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4283 return SDValue();
4284
4285 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4286 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4287 DAG.getValueType(IntVT.getScalarType()));
4288 SDValue Sat;
4289 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4290 SDValue MinC = DAG.getConstant(
4291 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4292 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4293 SDValue MaxC = DAG.getConstant(
4294 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4295 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4296 } else {
4297 SDValue MinC = DAG.getConstant(
4298 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4299 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4300 }
4301
4302 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4303}
4304
4305SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4306 SelectionDAG &DAG) const {
4307 // AArch64 FP-to-int conversions saturate to the destination register size, so
4308 // we can lower common saturating conversions to simple instructions.
4309 SDValue SrcVal = Op.getOperand(0);
4310 EVT SrcVT = SrcVal.getValueType();
4311
4312 if (SrcVT.isVector())
4313 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4314
4315 EVT DstVT = Op.getValueType();
4316 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4317 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4318 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4319 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4320
4321 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4322 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4323 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4324 SrcVT = MVT::f32;
4325 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4326 SrcVT != MVT::bf16)
4327 return SDValue();
4328
4329 SDLoc DL(Op);
4330 // Cases that we can emit directly.
4331 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4332 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4333 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4334 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4335 DAG.getValueType(DstVT));
4336
4337 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4338 // result. This is only valid if the legal cvt is larger than the saturate
4339 // width.
4340 if (DstWidth < SatWidth)
4341 return SDValue();
4342
4343 SDValue NativeCvt =
4344 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4345 SDValue Sat;
4346 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4347 SDValue MinC = DAG.getConstant(
4348 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4349 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4350 SDValue MaxC = DAG.getConstant(
4351 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4352 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4353 } else {
4354 SDValue MinC = DAG.getConstant(
4355 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4356 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4357 }
4358
4359 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4360}
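
// Worked example (illustrative sketch; hypothetical helper): an
// @llvm.fptosi.sat.i8.f32 is emitted as a native FCVTZS to i32 followed by the
// SMIN/SMAX clamp built above, i.e. clamping to [-128, 127]:
namespace {
constexpr int32_t clampToI8Sketch(int32_t NativeCvt) {
  int32_t Min = NativeCvt < 127 ? NativeCvt : 127; // SMIN with the signed max
  return Min > -128 ? Min : -128;                  // SMAX with the signed min
}
static_assert(clampToI8Sketch(300) == 127, "saturates on the high side");
static_assert(clampToI8Sketch(-300) == -128, "saturates on the low side");
} // namespace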
4361
4362SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4363 SelectionDAG &DAG) const {
4364 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4365 // Any additional optimization in this function should be recorded
4366 // in the cost tables.
4367 bool IsStrict = Op->isStrictFPOpcode();
4368 EVT VT = Op.getValueType();
4369 SDLoc dl(Op);
4370 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
4371 EVT InVT = In.getValueType();
4372 unsigned Opc = Op.getOpcode();
4373 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4374
4375 if (VT.isScalableVector()) {
4376 if (InVT.getVectorElementType() == MVT::i1) {
4377 // We can't directly extend an SVE predicate; extend it first.
4378 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4379 EVT CastVT = getPromotedVTForPredicate(InVT);
4380 In = DAG.getNode(CastOpc, dl, CastVT, In);
4381 return DAG.getNode(Opc, dl, VT, In);
4382 }
4383
4384 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4385 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
4386 return LowerToPredicatedOp(Op, DAG, Opcode);
4387 }
4388
4389 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4390 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4391 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4392
4393 // Promote bf16 conversions to f32.
4394 if (VT.getVectorElementType() == MVT::bf16) {
4395 EVT F32 = VT.changeElementType(MVT::f32);
4396 if (IsStrict) {
4397 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {F32, MVT::Other},
4398 {Op.getOperand(0), In});
4399 return DAG.getNode(
4400 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4401 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4402 }
4403 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4404 DAG.getNode(Op.getOpcode(), dl, F32, In),
4405 DAG.getIntPtrConstant(0, dl));
4406 }
4407
4408 uint64_t VTSize = VT.getFixedSizeInBits();
4409 uint64_t InVTSize = InVT.getFixedSizeInBits();
4410 if (VTSize < InVTSize) {
4411 MVT CastVT =
4412 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
4413 InVT.getVectorNumElements());
4414 if (IsStrict) {
4415 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4416 {Op.getOperand(0), In});
4417 return DAG.getNode(
4418 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4419 {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4420 }
4421 In = DAG.getNode(Opc, dl, CastVT, In);
4422 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
4423 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
4424 }
4425
4426 if (VTSize > InVTSize) {
4427 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4428 EVT CastVT = VT.changeVectorElementTypeToInteger();
4429 In = DAG.getNode(CastOpc, dl, CastVT, In);
4430 if (IsStrict)
4431 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4432 return DAG.getNode(Opc, dl, VT, In);
4433 }
4434
4435 // Use a scalar operation for conversions between single-element vectors of
4436 // the same size.
4437 if (VT.getVectorNumElements() == 1) {
4438 SDValue Extract = DAG.getNode(
4439 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4440 In, DAG.getConstant(0, dl, MVT::i64));
4441 EVT ScalarVT = VT.getScalarType();
4442 if (IsStrict)
4443 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4444 {Op.getOperand(0), Extract});
4445 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4446 }
4447
4448 return Op;
4449}
4450
4451SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4452 SelectionDAG &DAG) const {
4453 if (Op.getValueType().isVector())
4454 return LowerVectorINT_TO_FP(Op, DAG);
4455
4456 bool IsStrict = Op->isStrictFPOpcode();
4457 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4458
4459 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
4460 Op->getOpcode() == ISD::SINT_TO_FP;
4461
4462 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
4463 SDLoc dl(Op);
4464 if (IsStrict) {
4465 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
4466 {Op.getOperand(0), SrcVal});
4467 return DAG.getNode(
4468 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4469 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4470 }
4471 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4472 DAG.getNode(Op.getOpcode(), dl, PromoteVT, SrcVal),
4473 DAG.getIntPtrConstant(0, dl));
4474 };
4475
4476 if (Op.getValueType() == MVT::bf16) {
4477 unsigned MaxWidth = IsSigned
4478 ? DAG.ComputeMaxSignificantBits(SrcVal)
4479 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
4480 // Promote to f32 when the source has at most 24 significant bits (e.g. i16),
4481 if (MaxWidth <= 24) {
4482 return IntToFpViaPromotion(MVT::f32);
4483 }
4484
4485 // Promote to f64 when the source has at most 53 significant bits (e.g. i32),
4486 if (MaxWidth <= 53) {
4487 return IntToFpViaPromotion(MVT::f64);
4488 }
4489
4490 // We need to be careful about i64 -> bf16.
4491 // Consider the i32 value 22216703.
4492 // This number cannot be represented exactly as an f32, so an itofp will
4493 // turn it into 22216704.0, and an fptrunc to bf16 then turns this into
4494 // 22282240.0. However, the correctly rounded bf16 result is 22151168.0.
4495 // We need to use sticky rounding to get this correct.
4496 if (SrcVal.getValueType() == MVT::i64) {
4497 SDLoc DL(Op);
4498 // This algorithm is equivalent to the following:
4499 // uint64_t SrcHi = SrcVal & ~0xfffull;
4500 // uint64_t SrcLo = SrcVal & 0xfffull;
4501 // uint64_t Highest = SrcVal >> 53;
4502 // bool HasHighest = Highest != 0;
4503 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
4504 // double Rounded = static_cast<double>(ToRound);
4505 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
4506 // uint64_t HasLo = SrcLo != 0;
4507 // bool NeedsAdjustment = HasHighest & HasLo;
4508 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
4509 // double Adjusted = std::bit_cast<double>(AdjustedBits);
4510 // return static_cast<__bf16>(Adjusted);
4511 //
4512 // Essentially, what happens is that SrcVal either fits perfectly in a
4513 // double-precision value or it is too big. If it is sufficiently small,
4514 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
4515 // ensure that u64 -> double has no rounding error by only using the 52
4516 // MSB of the input. The low order bits will get merged into a sticky bit
4517 // which will avoid issues incurred by double rounding.
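// Illustrative example (hand-checked, not part of the original comment):
// for SrcVal = 2^53 + 2^45 + 1,
// - a naive u64 -> double conversion ties-to-even down to 2^53 + 2^45
// (the +1 is lost), and double -> bf16 then sits exactly on a halfway
// point and ties-to-even down to 2^53, which is wrong;
// - the correctly rounded bf16 result is 2^53 + 2^46, because the input
// lies just above the halfway point 2^53 + 2^45;
// - with the scheme above, ToRound = SrcHi = 2^53 + 2^45 is exact in
// double, SrcLo != 0 ORs in the mantissa LSB (adding 2), and
// 2^53 + 2^45 + 2 correctly rounds up to the bf16 value 2^53 + 2^46.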
4518
4519 // Signed conversion is more or less like so:
4520 // copysign((__bf16)abs(SrcVal), SrcVal)
4521 SDValue SignBit;
4522 if (IsSigned) {
4523 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4524 DAG.getConstant(1ull << 63, DL, MVT::i64));
4525 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
4526 }
4527 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4528 DAG.getConstant(~0xfffull, DL, MVT::i64));
4529 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4530 DAG.getConstant(0xfffull, DL, MVT::i64));
4531 SDValue Highest =
4532 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
4533 DAG.getShiftAmountConstant(53, MVT::i64, DL));
4534 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
4535 SDValue ToRound =
4536 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
4537 SDValue Rounded =
4538 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
4539 {Op.getOperand(0), ToRound})
4540 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
4541
4542 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
4543 if (SignBit) {
4544 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
4545 }
4546
4547 SDValue HasHighest = DAG.getSetCC(
4548 DL,
4549 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4550 Highest, Zero64, ISD::SETNE);
4551
4552 SDValue HasLo = DAG.getSetCC(
4553 DL,
4554 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4555 SrcLo, Zero64, ISD::SETNE);
4556
4557 SDValue NeedsAdjustment =
4558 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
4559 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
4560
4561 SDValue AdjustedBits =
4562 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
4563 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
4564 return IsStrict
4565 ? DAG.getNode(ISD::STRICT_FP_ROUND, DL,
4566 {Op.getValueType(), MVT::Other},
4567 {Rounded.getValue(1), Adjusted,
4568 DAG.getIntPtrConstant(0, DL)})
4569 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
4570 DAG.getIntPtrConstant(0, DL, true));
4571 }
4572 }
4573
4574 // f16 conversions are promoted to f32 when full fp16 is not supported.
4575 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4576 return IntToFpViaPromotion(MVT::f32);
4577 }
4578
4579 // i128 conversions are libcalls.
4580 if (SrcVal.getValueType() == MVT::i128)
4581 return SDValue();
4582
4583 // Other conversions are legal, unless it's to the completely software-based
4584 // fp128.
4585 if (Op.getValueType() != MVT::f128)
4586 return Op;
4587 return SDValue();
4588}
4589
4590SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4591 SelectionDAG &DAG) const {
4592 // For iOS, we want to call an alternative entry point: __sincos_stret,
4593 // which returns the values in two S / D registers.
4594 SDLoc dl(Op);
4595 SDValue Arg = Op.getOperand(0);
4596 EVT ArgVT = Arg.getValueType();
4597 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
4598
4599 ArgListTy Args;
4600 ArgListEntry Entry;
4601
4602 Entry.Node = Arg;
4603 Entry.Ty = ArgTy;
4604 Entry.IsSExt = false;
4605 Entry.IsZExt = false;
4606 Args.push_back(Entry);
4607
4608 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4609 : RTLIB::SINCOS_STRET_F32;
4610 const char *LibcallName = getLibcallName(LC);
4611 SDValue Callee =
4612 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
4613
4614 StructType *RetTy = StructType::get(ArgTy, ArgTy);
4615 TargetLowering::CallLoweringInfo CLI(DAG);
4616 CLI.setDebugLoc(dl)
4617 .setChain(DAG.getEntryNode())
4618 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
4619
4620 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4621 return CallResult.first;
4622}
4623
4624static MVT getSVEContainerType(EVT ContentTy);
4625
4626SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4627 SelectionDAG &DAG) const {
4628 EVT OpVT = Op.getValueType();
4629 EVT ArgVT = Op.getOperand(0).getValueType();
4630
4631 if (useSVEForFixedLengthVectorVT(OpVT))
4632 return LowerFixedLengthBitcastToSVE(Op, DAG);
4633
4634 if (OpVT.isScalableVector()) {
4635 // Bitcasting between unpacked vector types of different element counts is
4636 // not a NOP because the live elements are laid out differently.
4637 // 01234567
4638 // e.g. nxv2i32 = XX??XX??
4639 // nxv4f16 = X?X?X?X?
4640 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4641 return SDValue();
4642
4643 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
4644 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4645 "Expected int->fp bitcast!");
4646 SDValue ExtResult =
4647 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
4648 Op.getOperand(0));
4649 return getSVESafeBitCast(OpVT, ExtResult, DAG);
4650 }
4651 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
4652 }
4653
4654 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4655 return SDValue();
4656
4657 // Bitcasts between f16 and bf16 are legal.
4658 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4659 return Op;
4660
4661 assert(ArgVT == MVT::i16);
4662 SDLoc DL(Op);
4663
4664 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4665 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
4666 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
4667}
4668
4669static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4670 if (OrigVT.getSizeInBits() >= 64)
4671 return OrigVT;
4672
4673 assert(OrigVT.isSimple() && "Expecting a simple value type");
4674
4675 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4676 switch (OrigSimpleTy) {
4677 default: llvm_unreachable("Unexpected Vector Type");
4678 case MVT::v2i8:
4679 case MVT::v2i16:
4680 return MVT::v2i32;
4681 case MVT::v4i8:
4682 return MVT::v4i16;
4683 }
4684}
4685
4686 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
4687 const EVT &OrigTy,
4688 const EVT &ExtTy,
4689 unsigned ExtOpcode) {
4690 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
4691 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
4692 // 64-bits we need to insert a new extension so that it will be 64-bits.
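// For example (illustrative): if a v4i8 operand was sign-extended to a
// v4i32 ExtTy, the v4i8 is re-extended to v4i16 (64 bits, per
// getExtensionTo64Bits above) so that an SMULL producing v4i32 can be used.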
4693 assert(ExtTy.is128BitVector() && "Unexpected extension size");
4694 if (OrigTy.getSizeInBits() >= 64)
4695 return N;
4696
4697 // Must extend size to at least 64 bits to be used as an operand for VMULL.
4698 EVT NewVT = getExtensionTo64Bits(OrigTy);
4699
4700 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
4701}
4702
4703// Returns lane if Op extracts from a two-element vector and lane is constant
4704// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
4705static std::optional<uint64_t>
4706 getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
4707 SDNode *OpNode = Op.getNode();
4708 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4709 return std::nullopt;
4710
4711 EVT VT = OpNode->getOperand(0).getValueType();
4712 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
4713 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4714 return std::nullopt;
4715
4716 return C->getZExtValue();
4717}
4718
4719 static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
4720 bool isSigned) {
4721 EVT VT = N.getValueType();
4722
4723 if (N.getOpcode() != ISD::BUILD_VECTOR)
4724 return false;
4725
4726 for (const SDValue &Elt : N->op_values()) {
4727 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
4728 unsigned EltSize = VT.getScalarSizeInBits();
4729 unsigned HalfSize = EltSize / 2;
4730 if (isSigned) {
4731 if (!isIntN(HalfSize, C->getSExtValue()))
4732 return false;
4733 } else {
4734 if (!isUIntN(HalfSize, C->getZExtValue()))
4735 return false;
4736 }
4737 continue;
4738 }
4739 return false;
4740 }
4741
4742 return true;
4743}
4744
4745 static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
4746 EVT VT = N.getValueType();
4747 assert(VT.is128BitVector() && "Unexpected vector MULL size");
4748
4749 unsigned NumElts = VT.getVectorNumElements();
4750 unsigned OrigEltSize = VT.getScalarSizeInBits();
4751 unsigned EltSize = OrigEltSize / 2;
4752 MVT TruncVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
4753
4754 APInt HiBits = APInt::getHighBitsSet(OrigEltSize, EltSize);
4755 if (DAG.MaskedValueIsZero(N, HiBits))
4756 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, N);
4757
4758 if (ISD::isExtOpcode(N.getOpcode()))
4759 return addRequiredExtensionForVectorMULL(N.getOperand(0), DAG,
4760 N.getOperand(0).getValueType(), VT,
4761 N.getOpcode());
4762
4763 assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
4764 SDLoc dl(N);
4765 SmallVector<SDValue, 8> Ops;
4766 for (unsigned i = 0; i != NumElts; ++i) {
4767 const APInt &CInt = N.getConstantOperandAPInt(i);
4768 // Element types smaller than 32 bits are not legal, so use i32 elements.
4769 // The values are implicitly truncated so sext vs. zext doesn't matter.
4770 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
4771 }
4772 return DAG.getBuildVector(TruncVT, dl, Ops);
4773}
4774
4775 static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
4776 return N.getOpcode() == ISD::SIGN_EXTEND ||
4777 N.getOpcode() == ISD::ANY_EXTEND ||
4778 isExtendedBUILD_VECTOR(N, DAG, true);
4779}
4780
4781 static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
4782 return N.getOpcode() == ISD::ZERO_EXTEND ||
4783 N.getOpcode() == ISD::ANY_EXTEND ||
4784 isExtendedBUILD_VECTOR(N, DAG, false);
4785}
4786
4787 static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
4788 unsigned Opcode = N.getOpcode();
4789 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4790 SDValue N0 = N.getOperand(0);
4791 SDValue N1 = N.getOperand(1);
4792 return N0->hasOneUse() && N1->hasOneUse() &&
4793 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
4794 }
4795 return false;
4796}
4797
4798 static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
4799 unsigned Opcode = N.getOpcode();
4800 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4801 SDValue N0 = N.getOperand(0);
4802 SDValue N1 = N.getOperand(1);
4803 return N0->hasOneUse() && N1->hasOneUse() &&
4804 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
4805 }
4806 return false;
4807}
4808
4809SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
4810 SelectionDAG &DAG) const {
4811 // The rounding mode is in bits 23:22 of the FPCR.
4812 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
4813 // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
4814 // so that the shift and the AND get folded into a bitfield extract.
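// Worked example (illustrative): FPCR.RMode = 0b00 (round to nearest) gives
// ((0 + 1) & 3) = 1 and FPCR.RMode = 0b11 (round toward zero) gives
// ((3 + 1) & 3) = 0, matching the llvm.get.rounding encoding of
// 0 = toward zero, 1 = to nearest, 2 = toward +inf, 3 = toward -inf.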
4815 SDLoc dl(Op);
4816
4817 SDValue Chain = Op.getOperand(0);
4818 SDValue FPCR_64 = DAG.getNode(
4819 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
4820 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
4821 Chain = FPCR_64.getValue(1);
4822 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
4823 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
4824 DAG.getConstant(1U << 22, dl, MVT::i32));
4825 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4826 DAG.getConstant(22, dl, MVT::i32));
4827 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4828 DAG.getConstant(3, dl, MVT::i32));
4829 return DAG.getMergeValues({AND, Chain}, dl);
4830}
4831
4832SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
4833 SelectionDAG &DAG) const {
4834 SDLoc DL(Op);
4835 SDValue Chain = Op->getOperand(0);
4836 SDValue RMValue = Op->getOperand(1);
4837
4838 // The rounding mode is in bits 23:22 of the FPCR.
4839 // The mapping from the llvm.set.rounding argument value to the rounding
4840 // mode in FPCR is 0->3, 1->0, 2->1, 3->2. The formula we use to implement
4841 // this is (((arg - 1) & 3) << 22).
4842 //
4843 // The argument of llvm.set.rounding must be within the segment [0, 3], so
4844 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
4845 // code that generates llvm.set.rounding to ensure this condition.
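// Worked example (illustrative): llvm.set.rounding(0) (toward zero) yields
// ((0 - 1) & 3) = 3, i.e. FPCR.RMode = 0b11 (RZ), while llvm.set.rounding(1)
// (to nearest) yields 0, i.e. FPCR.RMode = 0b00 (RN).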
4846
4847 // Calculate new value of FPCR[23:22].
4848 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
4849 DAG.getConstant(1, DL, MVT::i32));
4850 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
4851 DAG.getConstant(0x3, DL, MVT::i32));
4852 RMValue =
4853 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
4854 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
4855 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
4856
4857 // Get current value of FPCR.
4858 SDValue Ops[] = {
4859 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4860 SDValue FPCR =
4861 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4862 Chain = FPCR.getValue(1);
4863 FPCR = FPCR.getValue(0);
4864
4865 // Put the new rounding mode into FPCR[23:22].
4866 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
4867 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
4868 DAG.getConstant(RMMask, DL, MVT::i64));
4869 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
4870 SDValue Ops2[] = {
4871 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4872 FPCR};
4873 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4874}
4875
4876SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
4877 SelectionDAG &DAG) const {
4878 SDLoc DL(Op);
4879 SDValue Chain = Op->getOperand(0);
4880
4881 // Get current value of FPCR.
4882 SDValue Ops[] = {
4883 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4884 SDValue FPCR =
4885 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4886 Chain = FPCR.getValue(1);
4887 FPCR = FPCR.getValue(0);
4888
4889 // Truncate FPCR to 32 bits.
4890 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
4891
4892 return DAG.getMergeValues({Result, Chain}, DL);
4893}
4894
4895SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
4896 SelectionDAG &DAG) const {
4897 SDLoc DL(Op);
4898 SDValue Chain = Op->getOperand(0);
4899 SDValue Mode = Op->getOperand(1);
4900
4901 // Extend the specified value to 64 bits.
4902 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
4903
4904 // Set new value of FPCR.
4905 SDValue Ops2[] = {
4906 Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
4907 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4908}
4909
4910SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
4911 SelectionDAG &DAG) const {
4912 SDLoc DL(Op);
4913 SDValue Chain = Op->getOperand(0);
4914
4915 // Get current value of FPCR.
4916 SDValue Ops[] = {
4917 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4918 SDValue FPCR =
4919 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4920 Chain = FPCR.getValue(1);
4921 FPCR = FPCR.getValue(0);
4922
4923 // Clear bits that are not reserved.
4924 SDValue FPSCRMasked = DAG.getNode(
4925 ISD::AND, DL, MVT::i64, FPCR,
4926 DAG.getConstant(AArch64::ReservedFPControlBits, DL, MVT::i64));
4927
4928 // Set new value of FPCR.
4929 SDValue Ops2[] = {Chain,
4930 DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4931 FPSCRMasked};
4932 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4933}
4934
4935static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
4936 SDLoc DL, bool &IsMLA) {
4937 bool IsN0SExt = isSignExtended(N0, DAG);
4938 bool IsN1SExt = isSignExtended(N1, DAG);
4939 if (IsN0SExt && IsN1SExt)
4940 return AArch64ISD::SMULL;
4941
4942 bool IsN0ZExt = isZeroExtended(N0, DAG);
4943 bool IsN1ZExt = isZeroExtended(N1, DAG);
4944
4945 if (IsN0ZExt && IsN1ZExt)
4946 return AArch64ISD::UMULL;
4947
4948 // Select SMULL if we can replace zext with sext.
4949 if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
4950 !isExtendedBUILD_VECTOR(N0, DAG, false) &&
4951 !isExtendedBUILD_VECTOR(N1, DAG, false)) {
4952 SDValue ZextOperand;
4953 if (IsN0ZExt)
4954 ZextOperand = N0.getOperand(0);
4955 else
4956 ZextOperand = N1.getOperand(0);
4957 if (DAG.SignBitIsZero(ZextOperand)) {
4958 SDValue NewSext =
4959 DAG.getSExtOrTrunc(ZextOperand, DL, N0.getValueType());
4960 if (IsN0ZExt)
4961 N0 = NewSext;
4962 else
4963 N1 = NewSext;
4964 return AArch64ISD::SMULL;
4965 }
4966 }
4967
4968 // Select UMULL if we can replace the other operand with an extend.
4969 if (IsN0ZExt || IsN1ZExt) {
4970 EVT VT = N0.getValueType();
4971 APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(),
4972 VT.getScalarSizeInBits() / 2);
4973 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
4974 return AArch64ISD::UMULL;
4975 }
4976
4977 if (!IsN1SExt && !IsN1ZExt)
4978 return 0;
4979
4980 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
4981 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
4982 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
4983 IsMLA = true;
4984 return AArch64ISD::SMULL;
4985 }
4986 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
4987 IsMLA = true;
4988 return AArch64ISD::UMULL;
4989 }
4990 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
4991 std::swap(N0, N1);
4992 IsMLA = true;
4993 return AArch64ISD::UMULL;
4994 }
4995 return 0;
4996}
4997
4998SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
4999 EVT VT = Op.getValueType();
5000
5001 bool OverrideNEON = !Subtarget->isNeonAvailable();
5002 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5003 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5004
5005 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5006 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5007 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5008 "unexpected type for custom-lowering ISD::MUL");
5009 SDValue N0 = Op.getOperand(0);
5010 SDValue N1 = Op.getOperand(1);
5011 bool isMLA = false;
5012 EVT OVT = VT;
5013 if (VT.is64BitVector()) {
5014 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5015 isNullConstant(N0.getOperand(1)) &&
5016 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5017 isNullConstant(N1.getOperand(1))) {
5018 N0 = N0.getOperand(0);
5019 N1 = N1.getOperand(0);
5020 VT = N0.getValueType();
5021 } else {
5022 if (VT == MVT::v1i64) {
5023 if (Subtarget->hasSVE())
5024 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5025 // Fall through to expand this. It is not legal.
5026 return SDValue();
5027 } else
5028 // Other vector multiplications are legal.
5029 return Op;
5030 }
5031 }
5032
5033 SDLoc DL(Op);
5034 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5035
5036 if (!NewOpc) {
5037 if (VT.getVectorElementType() == MVT::i64) {
5038 // If SVE is available then i64 vector multiplications can also be made
5039 // legal.
5040 if (Subtarget->hasSVE())
5041 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5042 // Fall through to expand this. It is not legal.
5043 return SDValue();
5044 } else
5045 // Other vector multiplications are legal.
5046 return Op;
5047 }
5048
5049 // Legalize to an S/UMULL instruction
5050 SDValue Op0;
5051 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5052 if (!isMLA) {
5053 Op0 = skipExtensionForVectorMULL(N0, DAG);
5054 assert(Op0.getValueType().is64BitVector() &&
5055 Op1.getValueType().is64BitVector() &&
5056 "unexpected types for extended operands to VMULL");
5057 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5058 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5059 DAG.getConstant(0, DL, MVT::i64));
5060 }
5061 // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
5062 // isel lowering to take advantage of no-stall back-to-back s/umul + s/umla.
5063 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57.
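// Illustrative example: mul (add (zext <8 x i8> A), (zext <8 x i8> B)),
// (zext <8 x i8> C) : v8i16 becomes
// add (umull A, C), (umull B, C) : v8i16.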
5064 SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5065 SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5066 EVT Op1VT = Op1.getValueType();
5067 return DAG.getNode(
5068 ISD::EXTRACT_SUBVECTOR, DL, OVT,
5069 DAG.getNode(N0.getOpcode(), DL, VT,
5070 DAG.getNode(NewOpc, DL, VT,
5071 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5072 DAG.getNode(NewOpc, DL, VT,
5073 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5074 DAG.getConstant(0, DL, MVT::i64));
5075}
5076
5077static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5078 int Pattern) {
5079 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5080 return DAG.getConstant(1, DL, MVT::nxv1i1);
5081 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5082 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5083}
5084
5085 static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG,
5086 bool IsSigned, bool IsEqual) {
5087 if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
5088 !isa<ConstantSDNode>(Op.getOperand(2)))
5089 return SDValue();
5090
5091 SDLoc dl(Op);
5092 APInt X = Op.getConstantOperandAPInt(1);
5093 APInt Y = Op.getConstantOperandAPInt(2);
5094 bool Overflow;
5095 APInt NumActiveElems =
5096 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5097
5098 if (Overflow)
5099 return SDValue();
5100
5101 if (IsEqual) {
5102 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5103 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5104 : NumActiveElems.uadd_ov(One, Overflow);
5105 if (Overflow)
5106 return SDValue();
5107 }
5108
5109 std::optional<unsigned> PredPattern =
5110 getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5111 unsigned MinSVEVectorSize = std::max(
5112 DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
5113 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5114 if (PredPattern != std::nullopt &&
5115 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5116 return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
5117
5118 return SDValue();
5119}
5120
5121// Returns a safe bitcast between two scalable vector predicates, where
5122// any newly created lanes from a widening bitcast are defined as zero.
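// For example (illustrative): casting nxv2i1 -> nxv16i1 reinterprets the
// predicate register and then ANDs it with the reinterpretation of an
// all-true nxv2i1 (roughly one active bit per 64-bit granule), so the seven
// newly exposed lanes in each group of eight read as zero.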
5123 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5124 SDLoc DL(Op);
5125 EVT InVT = Op.getValueType();
5126
5127 assert(InVT.getVectorElementType() == MVT::i1 &&
5128 VT.getVectorElementType() == MVT::i1 &&
5129 "Expected a predicate-to-predicate bitcast");
5130 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5131 InVT.isScalableVector() &&
5132 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5133 "Only expect to cast between legal scalable predicate types!");
5134
5135 // Return the operand if the cast isn't changing type,
5136 // e.g. <n x 16 x i1> -> <n x 16 x i1>
5137 if (InVT == VT)
5138 return Op;
5139
5140 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5141
5142 // We only have to zero the lanes if new lanes are being defined, e.g. when
5143 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5144 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5145 // we can return here.
5146 if (InVT.bitsGT(VT))
5147 return Reinterpret;
5148
5149 // Check if the other lanes are already known to be zeroed by
5150 // construction.
5151 if (isZeroingInactiveLanes(Op))
5152 return Reinterpret;
5153
5154 // Zero the newly introduced lanes.
5155 SDValue Mask = DAG.getConstant(1, DL, InVT);
5156 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5157 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5158}
5159
5160SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5161 SDValue Chain, SDLoc DL,
5162 EVT VT) const {
5163 SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
5165 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5166 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5169 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5171 RetTy, Callee, std::move(Args));
5172 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5173 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5174 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5175 Mask);
5176}
5177
5178// Lower an SME LDR/STR ZA intrinsic
5179// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5180// folded into the instruction
5181// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5182// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5183// and tile slice registers
5184// ldr(%tileslice, %ptr, %vecnum)
5185// ->
5186// %svl = rdsvl
5187// %ptr2 = %ptr + %svl * %vecnum
5188// %tileslice2 = %tileslice + %vecnum
5189// ldr [%tileslice2, 0], [%ptr2, 0]
5190 // Case 3: If the vecnum is an immediate out of range, then the same is done as
5191 // case 2, but the base and slice registers are modified by the greatest
5192 // multiple of 16 not larger than the vecnum and the remainder is folded into
5193 // the instruction. This means that successive loads and stores that are offset
5194 // from each other can share the same base and slice register updates.
5195 // ldr(%tileslice, %ptr, 22)
5196 // ldr(%tileslice, %ptr, 23)
5197 // ->
5198 // %svl = rdsvl
5199 // %ptr2 = %ptr + %svl * 16
5200 // %tileslice2 = %tileslice + 16
5201 // ldr [%tileslice2, 6], [%ptr2, 6]
5202 // ldr [%tileslice2, 7], [%ptr2, 7]
5203// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5204// operand and the immediate can be folded into the instruction, like case 2.
5205// ldr(%tileslice, %ptr, %vecnum + 7)
5206// ldr(%tileslice, %ptr, %vecnum + 8)
5207// ->
5208// %svl = rdsvl
5209// %ptr2 = %ptr + %svl * %vecnum
5210// %tileslice2 = %tileslice + %vecnum
5211// ldr [%tileslice2, 7], [%ptr2, 7]
5212// ldr [%tileslice2, 8], [%ptr2, 8]
5213// Case 5: The vecnum being an add of an immediate out of range is also handled,
5214// in which case the same remainder logic as case 3 is used.
5215 static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5216 SDLoc DL(N);
5217
5218 SDValue TileSlice = N->getOperand(2);
5219 SDValue Base = N->getOperand(3);
5220 SDValue VecNum = N->getOperand(4);
5221 int32_t ConstAddend = 0;
5222 SDValue VarAddend = VecNum;
5223
5224 // If the vnum is an add of an immediate, we can fold it into the instruction
5225 if (VecNum.getOpcode() == ISD::ADD &&
5226 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5227 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5228 VarAddend = VecNum.getOperand(0);
5229 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5230 ConstAddend = ImmNode->getSExtValue();
5231 VarAddend = SDValue();
5232 }
5233
5234 int32_t ImmAddend = ConstAddend % 16;
5235 if (int32_t C = (ConstAddend - ImmAddend)) {
5236 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5237 VarAddend = VarAddend
5238 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5239 : CVal;
5240 }
5241
5242 if (VarAddend) {
5243 // Get the vector length that will be multiplied by vnum
5244 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5245 DAG.getConstant(1, DL, MVT::i32));
5246
5247 // Multiply SVL and vnum then add it to the base
5248 SDValue Mul = DAG.getNode(
5249 ISD::MUL, DL, MVT::i64,
5250 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5251 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5252 // Just add vnum to the tileslice
5253 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5254 }
5255
5256 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5257 DL, MVT::Other,
5258 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5259 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5260}
5261
5262SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5263 SelectionDAG &DAG) const {
5264 unsigned IntNo = Op.getConstantOperandVal(1);
5265 SDLoc DL(Op);
5266 switch (IntNo) {
5267 default:
5268 return SDValue(); // Don't custom lower most intrinsics.
5269 case Intrinsic::aarch64_prefetch: {
5270 SDValue Chain = Op.getOperand(0);
5271 SDValue Addr = Op.getOperand(2);
5272
5273 unsigned IsWrite = Op.getConstantOperandVal(3);
5274 unsigned Locality = Op.getConstantOperandVal(4);
5275 unsigned IsStream = Op.getConstantOperandVal(5);
5276 unsigned IsData = Op.getConstantOperandVal(6);
5277 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5278 (!IsData << 3) | // IsDataCache bit
5279 (Locality << 1) | // Cache level bits
5280 (unsigned)IsStream; // Stream bit
5281
5282 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5283 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5284 }
5285 case Intrinsic::aarch64_sme_str:
5286 case Intrinsic::aarch64_sme_ldr: {
5287 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5288 }
5289 case Intrinsic::aarch64_sme_za_enable:
5290 return DAG.getNode(
5291 AArch64ISD::SMSTART, DL, MVT::Other,
5292 Op->getOperand(0), // Chain
5293 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5294 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5295 case Intrinsic::aarch64_sme_za_disable:
5296 return DAG.getNode(
5297 AArch64ISD::SMSTOP, DL, MVT::Other,
5298 Op->getOperand(0), // Chain
5299 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5300 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5301 }
5302}
5303
5304SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5305 SelectionDAG &DAG) const {
5306 unsigned IntNo = Op.getConstantOperandVal(1);
5307 SDLoc DL(Op);
5308 switch (IntNo) {
5309 default:
5310 return SDValue(); // Don't custom lower most intrinsics.
5311 case Intrinsic::aarch64_mops_memset_tag: {
5312 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
5313 SDValue Chain = Node->getChain();
5314 SDValue Dst = Op.getOperand(2);
5315 SDValue Val = Op.getOperand(3);
5316 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5317 SDValue Size = Op.getOperand(4);
5318 auto Alignment = Node->getMemOperand()->getAlign();
5319 bool IsVol = Node->isVolatile();
5320 auto DstPtrInfo = Node->getPointerInfo();
5321
5322 const auto &SDI =
5323 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5324 SDValue MS =
5325 SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
5326 Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
5327
5328 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
5329 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
5330 // LowerOperationWrapper will complain that the number of results has
5331 // changed.
5332 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
5333 }
5334 }
5335}
5336
5337SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5338 SelectionDAG &DAG) const {
5339 unsigned IntNo = Op.getConstantOperandVal(0);
5340 SDLoc dl(Op);
5341 switch (IntNo) {
5342 default: return SDValue(); // Don't custom lower most intrinsics.
5343 case Intrinsic::thread_pointer: {
5344 EVT PtrVT = getPointerTy(DAG.getDataLayout());
5345 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
5346 }
5347 case Intrinsic::aarch64_neon_abs: {
5348 EVT Ty = Op.getValueType();
5349 if (Ty == MVT::i64) {
5350 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
5351 Op.getOperand(1));
5352 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
5353 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
5354 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
5355 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
5356 } else {
5357 report_fatal_error("Unexpected type for AArch64 NEON intrinic");
5358 }
5359 }
5360 case Intrinsic::aarch64_neon_pmull64: {
5361 SDValue LHS = Op.getOperand(1);
5362 SDValue RHS = Op.getOperand(2);
5363
5364 std::optional<uint64_t> LHSLane =
5365 getConstantLaneNumOfExtractHalfOperand(LHS);
5366 std::optional<uint64_t> RHSLane =
5367 getConstantLaneNumOfExtractHalfOperand(RHS);
5368
5369 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
5370 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
5371
5372 // 'aarch64_neon_pmull64' takes i64 parameters, while pmull/pmull2
5373 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
5374 // which ISel recognizes better. For example, generate a ldr into d*
5375 // registers as opposed to a GPR load followed by a fmov.
5376 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
5377 std::optional<uint64_t> OtherLane,
5378 const SDLoc &dl,
5379 SelectionDAG &DAG) -> SDValue {
5380 // If the operand is a higher half itself, rewrite it to
5381 // extract_high_v2i64; this way aarch64_neon_pmull64 could
5382 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
5383 if (NLane && *NLane == 1)
5384 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5385 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
5386
5387 // Operand N is not a higher half but the other operand is.
5388 if (OtherLane && *OtherLane == 1) {
5389 // If this operand is a lower half, rewrite it to
5390 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
5391 // align lanes of two operands. A roundtrip sequence (to move from lane
5392 // 1 to lane 0) is like this:
5393 // mov x8, v0.d[1]
5394 // fmov d0, x8
5395 if (NLane && *NLane == 0)
5396 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5397 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
5398 N.getOperand(0),
5399 DAG.getConstant(0, dl, MVT::i64)),
5400 DAG.getConstant(1, dl, MVT::i64));
5401
5402 // Otherwise just dup from main to all lanes.
5403 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
5404 }
5405
5406 // Neither operand is an extract of higher half, so codegen may just use
5407 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
5408 assert(N.getValueType() == MVT::i64 &&
5409 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
5410 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
5411 };
5412
5413 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
5414 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
5415
5416 return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
5417 }
5418 case Intrinsic::aarch64_neon_smax:
5419 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
5420 Op.getOperand(1), Op.getOperand(2));
5421 case Intrinsic::aarch64_neon_umax:
5422 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
5423 Op.getOperand(1), Op.getOperand(2));
5424 case Intrinsic::aarch64_neon_smin:
5425 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
5426 Op.getOperand(1), Op.getOperand(2));
5427 case Intrinsic::aarch64_neon_umin:
5428 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
5429 Op.getOperand(1), Op.getOperand(2));
5430 case Intrinsic::aarch64_neon_scalar_sqxtn:
5431 case Intrinsic::aarch64_neon_scalar_sqxtun:
5432 case Intrinsic::aarch64_neon_scalar_uqxtn: {
5433 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
5434 if (Op.getValueType() == MVT::i32)
5435 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
5436 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
5437 Op.getOperand(0),
5438 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
5439 Op.getOperand(1))));
5440 return SDValue();
5441 }
5442 case Intrinsic::aarch64_sve_whilelo:
5443 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5444 /*IsEqual=*/false);
5445 case Intrinsic::aarch64_sve_whilelt:
5446 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5447 /*IsEqual=*/false);
5448 case Intrinsic::aarch64_sve_whilels:
5449 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5450 /*IsEqual=*/true);
5451 case Intrinsic::aarch64_sve_whilele:
5452 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5453 /*IsEqual=*/true);
5454 case Intrinsic::aarch64_sve_sunpkhi:
5455 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
5456 Op.getOperand(1));
5457 case Intrinsic::aarch64_sve_sunpklo:
5458 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
5459 Op.getOperand(1));
5460 case Intrinsic::aarch64_sve_uunpkhi:
5461 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
5462 Op.getOperand(1));
5463 case Intrinsic::aarch64_sve_uunpklo:
5464 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
5465 Op.getOperand(1));
5466 case Intrinsic::aarch64_sve_clasta_n:
5467 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
5468 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5469 case Intrinsic::aarch64_sve_clastb_n:
5470 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
5471 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5472 case Intrinsic::aarch64_sve_lasta:
5473 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
5474 Op.getOperand(1), Op.getOperand(2));
5475 case Intrinsic::aarch64_sve_lastb:
5476 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
5477 Op.getOperand(1), Op.getOperand(2));
5478 case Intrinsic::aarch64_sve_rev:
5479 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
5480 Op.getOperand(1));
5481 case Intrinsic::aarch64_sve_tbl:
5482 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
5483 Op.getOperand(1), Op.getOperand(2));
5484 case Intrinsic::aarch64_sve_trn1:
5485 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
5486 Op.getOperand(1), Op.getOperand(2));
5487 case Intrinsic::aarch64_sve_trn2:
5488 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
5489 Op.getOperand(1), Op.getOperand(2));
5490 case Intrinsic::aarch64_sve_uzp1:
5491 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
5492 Op.getOperand(1), Op.getOperand(2));
5493 case Intrinsic::aarch64_sve_uzp2:
5494 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
5495 Op.getOperand(1), Op.getOperand(2));
5496 case Intrinsic::aarch64_sve_zip1:
5497 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
5498 Op.getOperand(1), Op.getOperand(2));
5499 case Intrinsic::aarch64_sve_zip2:
5500 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
5501 Op.getOperand(1), Op.getOperand(2));
5502 case Intrinsic::aarch64_sve_splice:
5503 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
5504 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5505 case Intrinsic::aarch64_sve_ptrue:
5506 return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
5507 case Intrinsic::aarch64_sve_clz:
5508 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
5509 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5510 case Intrinsic::aarch64_sme_cntsb:
5511 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5512 DAG.getConstant(1, dl, MVT::i32));
5513 case Intrinsic::aarch64_sme_cntsh: {
5514 SDValue One = DAG.getConstant(1, dl, MVT::i32);
5515 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
5516 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
5517 }
5518 case Intrinsic::aarch64_sme_cntsw: {
5519 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5520 DAG.getConstant(1, dl, MVT::i32));
5521 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5522 DAG.getConstant(2, dl, MVT::i32));
5523 }
5524 case Intrinsic::aarch64_sme_cntsd: {
5525 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5526 DAG.getConstant(1, dl, MVT::i32));
5527 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5528 DAG.getConstant(3, dl, MVT::i32));
5529 }
5530 case Intrinsic::aarch64_sve_cnt: {
5531 SDValue Data = Op.getOperand(3);
5532 // CTPOP only supports integer operands.
5533 if (Data.getValueType().isFloatingPoint())
5534 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
5535 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
5536 Op.getOperand(2), Data, Op.getOperand(1));
5537 }
5538 case Intrinsic::aarch64_sve_dupq_lane:
5539 return LowerDUPQLane(Op, DAG);
5540 case Intrinsic::aarch64_sve_convert_from_svbool:
5541 if (Op.getValueType() == MVT::aarch64svcount)
5542 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
5543 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
5544 case Intrinsic::aarch64_sve_convert_to_svbool:
5545 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
5546 return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
5547 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
5548 case Intrinsic::aarch64_sve_fneg:
5549 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5550 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5551 case Intrinsic::aarch64_sve_frintp:
5552 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
5553 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5554 case Intrinsic::aarch64_sve_frintm:
5555 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
5556 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5557 case Intrinsic::aarch64_sve_frinti:
5558 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5559 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5560 case Intrinsic::aarch64_sve_frintx:
5561 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5562 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5563 case Intrinsic::aarch64_sve_frinta:
5564 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
5565 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5566 case Intrinsic::aarch64_sve_frintn:
5567 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
5568 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5569 case Intrinsic::aarch64_sve_frintz:
5570 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
5571 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5572 case Intrinsic::aarch64_sve_ucvtf:
5573 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
5574 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5575 Op.getOperand(1));
5576 case Intrinsic::aarch64_sve_scvtf:
5577 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
5578 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5579 Op.getOperand(1));
5580 case Intrinsic::aarch64_sve_fcvtzu:
5581 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
5582 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5583 Op.getOperand(1));
5584 case Intrinsic::aarch64_sve_fcvtzs:
5585 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
5586 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5587 Op.getOperand(1));
5588 case Intrinsic::aarch64_sve_fsqrt:
5589 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
5590 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5591 case Intrinsic::aarch64_sve_frecpx:
5592 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
5593 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5594 case Intrinsic::aarch64_sve_frecpe_x:
5595 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
5596 Op.getOperand(1));
5597 case Intrinsic::aarch64_sve_frecps_x:
5598 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
5599 Op.getOperand(1), Op.getOperand(2));
5600 case Intrinsic::aarch64_sve_frsqrte_x:
5601 return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
5602 Op.getOperand(1));
5603 case Intrinsic::aarch64_sve_frsqrts_x:
5604 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
5605 Op.getOperand(1), Op.getOperand(2));
5606 case Intrinsic::aarch64_sve_fabs:
5607 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5608 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5609 case Intrinsic::aarch64_sve_abs:
5610 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5611 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5612 case Intrinsic::aarch64_sve_neg:
5613 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5614 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5615 case Intrinsic::aarch64_sve_insr: {
5616 SDValue Scalar = Op.getOperand(2);
5617 EVT ScalarTy = Scalar.getValueType();
5618 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
5619 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
5620
5621 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
5622 Op.getOperand(1), Scalar);
5623 }
5624 case Intrinsic::aarch64_sve_rbit:
5625 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
5626 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5627 Op.getOperand(1));
5628 case Intrinsic::aarch64_sve_revb:
5629 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
5630 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5631 case Intrinsic::aarch64_sve_revh:
5632 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
5633 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5634 case Intrinsic::aarch64_sve_revw:
5635 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
5636 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5637 case Intrinsic::aarch64_sve_revd:
5638 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
5639 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5640 case Intrinsic::aarch64_sve_sxtb:
5641 return DAG.getNode(
5642 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5643 Op.getOperand(2), Op.getOperand(3),
5644 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5645 Op.getOperand(1));
5646 case Intrinsic::aarch64_sve_sxth:
5647 return DAG.getNode(
5648 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5649 Op.getOperand(2), Op.getOperand(3),
5650 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5651 Op.getOperand(1));
5652 case Intrinsic::aarch64_sve_sxtw:
5653 return DAG.getNode(
5654 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5655 Op.getOperand(2), Op.getOperand(3),
5656 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5657 Op.getOperand(1));
5658 case Intrinsic::aarch64_sve_uxtb:
5659 return DAG.getNode(
5660 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5661 Op.getOperand(2), Op.getOperand(3),
5662 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5663 Op.getOperand(1));
5664 case Intrinsic::aarch64_sve_uxth:
5665 return DAG.getNode(
5666 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5667 Op.getOperand(2), Op.getOperand(3),
5668 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5669 Op.getOperand(1));
5670 case Intrinsic::aarch64_sve_uxtw:
5671 return DAG.getNode(
5672 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5673 Op.getOperand(2), Op.getOperand(3),
5674 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5675 Op.getOperand(1));
5676 case Intrinsic::localaddress: {
5677 const auto &MF = DAG.getMachineFunction();
5678 const auto *RegInfo = Subtarget->getRegisterInfo();
5679 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
5680 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
5681 Op.getSimpleValueType());
5682 }
5683
5684 case Intrinsic::eh_recoverfp: {
5685 // FIXME: This needs to be implemented to correctly handle highly aligned
5686 // stack objects. For now we simply return the incoming FP. Refer D53541
5687 // for more details.
5688 SDValue FnOp = Op.getOperand(1);
5689 SDValue IncomingFPOp = Op.getOperand(2);
5690 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
5691 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
5692 if (!Fn)
5694 "llvm.eh.recoverfp must take a function as the first argument");
5695 return IncomingFPOp;
5696 }
5697
5698 case Intrinsic::aarch64_neon_vsri:
5699 case Intrinsic::aarch64_neon_vsli:
5700 case Intrinsic::aarch64_sve_sri:
5701 case Intrinsic::aarch64_sve_sli: {
5702 EVT Ty = Op.getValueType();
5703
5704 if (!Ty.isVector())
5705 report_fatal_error("Unexpected type for aarch64_neon_vsli");
5706
5707 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
5708
5709 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
5710 IntNo == Intrinsic::aarch64_sve_sri;
5711 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
5712 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
5713 Op.getOperand(3));
5714 }
5715
5716 case Intrinsic::aarch64_neon_srhadd:
5717 case Intrinsic::aarch64_neon_urhadd:
5718 case Intrinsic::aarch64_neon_shadd:
5719 case Intrinsic::aarch64_neon_uhadd: {
5720 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5721 IntNo == Intrinsic::aarch64_neon_shadd);
5722 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5723 IntNo == Intrinsic::aarch64_neon_urhadd);
5724 unsigned Opcode = IsSignedAdd
5725 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
5726 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
5727 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5728 Op.getOperand(2));
5729 }
5730 case Intrinsic::aarch64_neon_saddlp:
5731 case Intrinsic::aarch64_neon_uaddlp: {
5732 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
5733 ? AArch64ISD::UADDLP
5734 : AArch64ISD::SADDLP;
5735 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
5736 }
5737 case Intrinsic::aarch64_neon_sdot:
5738 case Intrinsic::aarch64_neon_udot:
5739 case Intrinsic::aarch64_sve_sdot:
5740 case Intrinsic::aarch64_sve_udot: {
5741 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
5742 IntNo == Intrinsic::aarch64_sve_udot)
5743 ? AArch64ISD::UDOT
5744 : AArch64ISD::SDOT;
5745 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5746 Op.getOperand(2), Op.getOperand(3));
5747 }
5748 case Intrinsic::get_active_lane_mask: {
5749 SDValue ID =
5750 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
5751 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
5752 Op.getOperand(1), Op.getOperand(2));
5753 }
5754 case Intrinsic::aarch64_neon_uaddlv: {
5755 EVT OpVT = Op.getOperand(1).getValueType();
5756 EVT ResVT = Op.getValueType();
5757 if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
5758 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
5759 // In order to avoid an insert_subvector, use v4i32 rather than v2i32.
5760 SDValue UADDLV =
5761 DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
5762 SDValue EXTRACT_VEC_ELT =
5763 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
5764 DAG.getConstant(0, dl, MVT::i64));
5765 return EXTRACT_VEC_ELT;
5766 }
5767 return SDValue();
5768 }
5769 case Intrinsic::experimental_cttz_elts: {
5770 SDValue NewCttzElts =
5771 DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
5772
5773 return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
5774 }
5775 }
5776}
5777
5778bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
5779 if (VT.getVectorElementType() == MVT::i8 ||
5780 VT.getVectorElementType() == MVT::i16) {
5781 EltTy = MVT::i32;
5782 return true;
5783 }
5784 return false;
5785}
5786
5787bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
5788 EVT DataVT) const {
5789 const EVT IndexVT = Extend.getOperand(0).getValueType();
5790 // SVE only supports implicit extension of 32-bit indices.
5791 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
5792 return false;
5793
5794 // Indices cannot be smaller than the main data type.
5795 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
5796 return false;
5797
5798 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
5799 // element container type, which would violate the previous clause.
5800 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
5801}
5802
5803bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
5804 EVT ExtVT = ExtVal.getValueType();
5805 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
5806 return false;
5807
5808 // It may be worth creating extending masked loads if there are multiple
5809 // masked loads using the same predicate. That way we'll end up creating
5810 // extending masked loads that may then get split by the legaliser. This
5811 // results in just one set of predicate unpacks at the start, instead of
5812 // multiple sets of vector unpacks after each load.
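// Illustrative sketch (instruction names approximate): two masked loads of
// <vscale x 8 x i16> under the same <vscale x 8 x i1> mask, each
// zero-extended to <vscale x 8 x i32>, can legalize to one punpklo/punpkhi
// of the predicate followed by an extending ld1h per half, instead of
// uunpklo/uunpkhi of every loaded value after each load.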
5813 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
5814 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
5815 // Disable extending masked loads for fixed-width for now, since the code
5816 // quality doesn't look great.
5817 if (!ExtVT.isScalableVector())
5818 return false;
5819
5820 unsigned NumExtMaskedLoads = 0;
5821 for (auto *U : Ld->getMask()->uses())
5822 if (isa<MaskedLoadSDNode>(U))
5823 NumExtMaskedLoads++;
5824
5825 if (NumExtMaskedLoads <= 1)
5826 return false;
5827 }
5828 }
5829
5830 return true;
5831}
5832
5833unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
5834 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
5835 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
5836 AArch64ISD::GLD1_MERGE_ZERO},
5837 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
5838 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
5839 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
5840 AArch64ISD::GLD1_MERGE_ZERO},
5841 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
5842 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
5843 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
5844 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5845 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
5846 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
5847 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
5848 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5849 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
5850 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
5851 };
5852 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
5853 return AddrModes.find(Key)->second;
5854}
5855
5856unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
5857 switch (Opcode) {
5858 default:
5859 llvm_unreachable("unimplemented opcode");
5860 return Opcode;
5875 }
5876}
5877
5878SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
5879 SelectionDAG &DAG) const {
5880 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
5881
5882 SDLoc DL(Op);
5883 SDValue Chain = MGT->getChain();
5884 SDValue PassThru = MGT->getPassThru();
5885 SDValue Mask = MGT->getMask();
5886 SDValue BasePtr = MGT->getBasePtr();
5887 SDValue Index = MGT->getIndex();
5888 SDValue Scale = MGT->getScale();
5889 EVT VT = Op.getValueType();
5890 EVT MemVT = MGT->getMemoryVT();
5891 ISD::LoadExtType ExtType = MGT->getExtensionType();
5892 ISD::MemIndexType IndexType = MGT->getIndexType();
5893
5894 // SVE supports zero (and so undef) passthrough values only, everything else
5895 // must be handled manually by an explicit select on the load's output.
5896 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
5897 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
5898 SDValue Load =
5899 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5900 MGT->getMemOperand(), IndexType, ExtType);
5901 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5902 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
5903 }
5904
5905 bool IsScaled = MGT->isIndexScaled();
5906 bool IsSigned = MGT->isIndexSigned();
5907
5908 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
5909 // must be calculated beforehand.
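// For example (illustrative): a gather of i16 elements (scalar store size 2)
// with Scale == 4 is rewritten so that Index <<= log2(4) and Scale = 1,
// keeping the effective address Base + Index * 4 unchanged.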
5910 uint64_t ScaleVal = Scale->getAsZExtVal();
5911 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5912 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5913 EVT IndexVT = Index.getValueType();
5914 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5915 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5916 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5917
5918 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5919 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5920 MGT->getMemOperand(), IndexType, ExtType);
5921 }
5922
5923 // Lower fixed length gather to a scalable equivalent.
5924 if (VT.isFixedLengthVector()) {
5925 assert(Subtarget->useSVEForFixedLengthVectors() &&
5926 "Cannot lower when not using SVE for fixed vectors!");
5927
5928 // NOTE: Handle floating-point as if integer then bitcast the result.
5929 EVT DataVT = VT.changeVectorElementTypeToInteger();
5930 MemVT = MemVT.changeVectorElementTypeToInteger();
5931
5932 // Find the smallest integer fixed length vector we can use for the gather.
5933 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5934 if (DataVT.getVectorElementType() == MVT::i64 ||
5935 Index.getValueType().getVectorElementType() == MVT::i64 ||
5936 Mask.getValueType().getVectorElementType() == MVT::i64)
5937 PromotedVT = VT.changeVectorElementType(MVT::i64);
5938
5939 // Promote vector operands except for passthrough, which we know is either
5940 // undef or zero, and thus best constructed directly.
5941 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5942 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5943 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5944
5945 // A promoted result type forces the need for an extending load.
5946 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
5947 ExtType = ISD::EXTLOAD;
5948
5949 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5950
5951 // Convert fixed length vector operands to scalable.
5952 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5953 Index = convertToScalableVector(DAG, ContainerVT, Index);
5955 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
5956 : DAG.getConstant(0, DL, ContainerVT);
5957
5958 // Emit equivalent scalable vector gather.
5959 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5960 SDValue Load =
5961 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
5962 Ops, MGT->getMemOperand(), IndexType, ExtType);
5963
5964 // Extract fixed length data then convert to the required result type.
5965 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
5966 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
5967 if (VT.isFloatingPoint())
5968 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
5969
5970 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
5971 }
5972
5973 // Everything else is legal.
5974 return Op;
5975}
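A key step above is folding an unsupported scale into the index: when the scale is not sizeof(MemVT.elt), the index is shifted left by log2(scale) and the scale is reset to 1, which leaves every lane's byte offset unchanged. A minimal scalar model of that invariant (plain C++, no LLVM types):

#include <cassert>
#include <cstdint>

// Byte offset addressed by one gather lane: base + index * scale.
uint64_t laneOffset(uint64_t Base, int64_t Index, uint64_t Scale) {
  return Base + static_cast<uint64_t>(Index) * Scale;
}

int main() {
  const uint64_t Base = 0x1000;
  const int64_t Index = 7;
  const uint64_t Scale = 8; // power of two, but not the element store size
  unsigned Log2Scale = 0;
  for (uint64_t S = Scale; S > 1; S >>= 1)
    ++Log2Scale;
  // Pre-shift the index and drop the scale to 1, as the lowering does.
  const int64_t NewIndex = Index << Log2Scale;
  assert(laneOffset(Base, Index, Scale) == laneOffset(Base, NewIndex, 1));
  return 0;
}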
5976
5977SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
5978 SelectionDAG &DAG) const {
5979 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
5980
5981 SDLoc DL(Op);
5982 SDValue Chain = MSC->getChain();
5983 SDValue StoreVal = MSC->getValue();
5984 SDValue Mask = MSC->getMask();
5985 SDValue BasePtr = MSC->getBasePtr();
5986 SDValue Index = MSC->getIndex();
5987 SDValue Scale = MSC->getScale();
5988 EVT VT = StoreVal.getValueType();
5989 EVT MemVT = MSC->getMemoryVT();
5990 ISD::MemIndexType IndexType = MSC->getIndexType();
5991 bool Truncating = MSC->isTruncatingStore();
5992
5993 bool IsScaled = MSC->isIndexScaled();
5994 bool IsSigned = MSC->isIndexSigned();
5995
5996 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
5997 // must be calculated beforehand.
5998 uint64_t ScaleVal = Scale->getAsZExtVal();
5999 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6000 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6001 EVT IndexVT = Index.getValueType();
6002 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6003 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6004 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6005
6006 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6007 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6008 MSC->getMemOperand(), IndexType, Truncating);
6009 }
6010
6011 // Lower fixed length scatter to a scalable equivalent.
6012 if (VT.isFixedLengthVector()) {
6013 assert(Subtarget->useSVEForFixedLengthVectors() &&
6014 "Cannot lower when not using SVE for fixed vectors!");
6015
6016 // Once bitcast we treat floating-point scatters as if integer.
6017 if (VT.isFloatingPoint()) {
6019 MemVT = MemVT.changeVectorElementTypeToInteger();
6020 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6021 }
6022
6023 // Find the smallest integer fixed length vector we can use for the scatter.
6024 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6025 if (VT.getVectorElementType() == MVT::i64 ||
6026 Index.getValueType().getVectorElementType() == MVT::i64 ||
6027 Mask.getValueType().getVectorElementType() == MVT::i64)
6028 PromotedVT = VT.changeVectorElementType(MVT::i64);
6029
6030 // Promote vector operands.
6031 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6032 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6033 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6034 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6035
6036 // A promoted value type forces the need for a truncating store.
6037 if (PromotedVT != VT)
6038 Truncating = true;
6039
6040 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6041
6042 // Convert fixed length vector operands to scalable.
6043 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6044 Index = convertToScalableVector(DAG, ContainerVT, Index);
6046 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6047
6048 // Emit equivalent scalable vector scatter.
6049 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6050 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6051 MSC->getMemOperand(), IndexType, Truncating);
6052 }
6053
6054 // Everything else is legal.
6055 return Op;
6056}
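For the fixed-length paths above, the promoted lane width is i32 unless the data, index, or mask already uses i64 lanes, in which case everything is widened to i64. In isolation the rule is just a max over the three operand element widths; the helper below is a simplified restatement, not LLVM API.

#include <algorithm>
#include <cassert>

// Pick the promoted lane width in bits: at least 32, or 64 if any operand
// already has 64-bit lanes (a simplified model of the PromotedVT choice).
unsigned promotedLaneBits(unsigned DataBits, unsigned IndexBits,
                          unsigned MaskBits) {
  unsigned Widest = std::max({DataBits, IndexBits, MaskBits});
  return Widest == 64 ? 64 : 32;
}

int main() {
  assert(promotedLaneBits(16, 32, 8) == 32);
  assert(promotedLaneBits(32, 64, 8) == 64);
  return 0;
}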
6057
6058SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6059 SDLoc DL(Op);
6060 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6061 assert(LoadNode && "Expected custom lowering of a masked load node");
6062 EVT VT = Op->getValueType(0);
6063
6064 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6065 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6066
6067 SDValue PassThru = LoadNode->getPassThru();
6068 SDValue Mask = LoadNode->getMask();
6069
6070 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
6071 return Op;
6072
6074 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
6075 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
6076 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
6077 LoadNode->getExtensionType());
6078
6079 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6080
6081 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6082}
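When the passthrough is neither undef nor zero, the masked load above is emitted with an undef passthrough and the real passthrough is merged back in with a select, since the hardware form only zeroes inactive lanes. Lane for lane, the merge is simply the following (a standalone sketch, not the DAG code):

#include <array>
#include <cassert>
#include <cstddef>

// Per-lane model of the select(mask, load, passthru) merge used above.
template <typename T, std::size_t N>
std::array<T, N> mergeMaskedLoad(const std::array<bool, N> &Mask,
                                 const std::array<T, N> &Loaded,
                                 const std::array<T, N> &PassThru) {
  std::array<T, N> Result{};
  for (std::size_t I = 0; I < N; ++I)
    Result[I] = Mask[I] ? Loaded[I] : PassThru[I];
  return Result;
}

int main() {
  std::array<bool, 4> Mask{true, false, true, false};
  std::array<int, 4> Loaded{10, 11, 12, 13};
  std::array<int, 4> PassThru{1, 2, 3, 4};
  auto R = mergeMaskedLoad(Mask, Loaded, PassThru);
  assert(R[0] == 10 && R[1] == 2 && R[2] == 12 && R[3] == 4);
  return 0;
}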
6083
6084// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6086 EVT VT, EVT MemVT,
6087 SelectionDAG &DAG) {
6088 assert(VT.isVector() && "VT should be a vector type");
6089 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6090
6091 SDValue Value = ST->getValue();
6092
6093 // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
6094 // extracts the word lane that represents the v4i8 subvector. This optimizes
6095 // the store to:
6096 //
6097 // xtn v0.8b, v0.8h
6098 // str s0, [x0]
6099
6100 SDValue Undef = DAG.getUNDEF(MVT::i16);
6101 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6102 {Undef, Undef, Undef, Undef});
6103
6104 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6105 Value, UndefVec);
6106 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
6107
6108 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6109 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6110 Trunc, DAG.getConstant(0, DL, MVT::i64));
6111
6112 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
6113 ST->getBasePtr(), ST->getMemOperand());
6114}
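The net effect of the xtn/str pair quoted in the comment above is to narrow four 16-bit lanes to bytes and store them as a single 32-bit word. A scalar sketch of that packing, assuming little-endian lane order:

#include <cassert>
#include <cstdint>

// Pack four 16-bit lanes into one 32-bit word by truncating each lane to a
// byte (models the v4i16 -> v4i8 truncating store).
uint32_t packV4i16ToV4i8(const uint16_t Lanes[4]) {
  uint32_t Word = 0;
  for (int I = 0; I < 4; ++I)
    Word |= static_cast<uint32_t>(Lanes[I] & 0xFF) << (8 * I);
  return Word;
}

int main() {
  const uint16_t Lanes[4] = {0x0102, 0x0304, 0x0506, 0x0708};
  assert(packV4i16ToV4i8(Lanes) == 0x08060402u);
  return 0;
}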
6115
6116// Custom lowering for any store, vector or scalar, default or truncating.
6117// Currently we only custom lower truncating stores from v4i16 to v4i8 and
6118// volatile stores of i128.
6119SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6120 SelectionDAG &DAG) const {
6121 SDLoc Dl(Op);
6122 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
6123 assert (StoreNode && "Can only custom lower store nodes");
6124
6125 SDValue Value = StoreNode->getValue();
6126
6127 EVT VT = Value.getValueType();
6128 EVT MemVT = StoreNode->getMemoryVT();
6129
6130 if (VT.isVector()) {
6132 VT,
6133 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6134 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6135
6136 unsigned AS = StoreNode->getAddressSpace();
6137 Align Alignment = StoreNode->getAlign();
6138 if (Alignment < MemVT.getStoreSize() &&
6139 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
6140 StoreNode->getMemOperand()->getFlags(),
6141 nullptr)) {
6142 return scalarizeVectorStore(StoreNode, DAG);
6143 }
6144
6145 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6146 MemVT == MVT::v4i8) {
6147 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
6148 }
6149 // 256-bit non-temporal stores can be lowered to STNP. Do this as part of
6150 // the custom lowering, as there are no unpaired non-temporal stores and
6151 // legalization will break up 256-bit inputs.
6153 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6154 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6155 (MemVT.getScalarSizeInBits() == 8u ||
6156 MemVT.getScalarSizeInBits() == 16u ||
6157 MemVT.getScalarSizeInBits() == 32u ||
6158 MemVT.getScalarSizeInBits() == 64u)) {
6159 SDValue Lo =
6162 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
6163 SDValue Hi =
6166 StoreNode->getValue(),
6167 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
6169 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
6170 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6171 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6172 return Result;
6173 }
6174 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6175 return LowerStore128(Op, DAG);
6176 } else if (MemVT == MVT::i64x8) {
6177 SDValue Value = StoreNode->getValue();
6178 assert(Value->getValueType(0) == MVT::i64x8);
6179 SDValue Chain = StoreNode->getChain();
6180 SDValue Base = StoreNode->getBasePtr();
6181 EVT PtrVT = Base.getValueType();
6182 for (unsigned i = 0; i < 8; i++) {
6183 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6184 Value, DAG.getConstant(i, Dl, MVT::i32));
6185 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
6186 DAG.getConstant(i * 8, Dl, PtrVT));
6187 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
6188 StoreNode->getOriginalAlign());
6189 }
6190 return Chain;
6191 }
6192
6193 return SDValue();
6194}
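The MVT::i64x8 path above decomposes a 512-bit value into eight 64-bit parts stored at consecutive 8-byte offsets from the base pointer. Stripped of the DAG machinery, the address arithmetic is just base + i * 8, as in this host-side sketch:

#include <cassert>
#include <cstdint>
#include <cstring>

// Store eight 64-bit parts at base, base+8, ..., base+56 (models the i64x8
// store loop above).
void storeI64x8(unsigned char *Base, const uint64_t Parts[8]) {
  for (unsigned I = 0; I < 8; ++I)
    std::memcpy(Base + I * 8, &Parts[I], sizeof(uint64_t));
}

int main() {
  unsigned char Buf[64] = {};
  uint64_t Parts[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  storeI64x8(Buf, Parts);
  uint64_t Back;
  std::memcpy(&Back, Buf + 3 * 8, sizeof(Back));
  assert(Back == 3);
  return 0;
}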
6195
6196/// Lower atomic or volatile 128-bit stores to a single STP instruction.
6197SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6198 SelectionDAG &DAG) const {
6199 MemSDNode *StoreNode = cast<MemSDNode>(Op);
6200 assert(StoreNode->getMemoryVT() == MVT::i128);
6201 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6202
6203 bool IsStoreRelease =
6205 if (StoreNode->isAtomic())
6206 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6207 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6210
6211 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6212 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6213 ? StoreNode->getOperand(1)
6214 : StoreNode->getOperand(2);
6215 SDLoc DL(Op);
6216 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
6217 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
6218 if (DAG.getDataLayout().isBigEndian())
6219 std::swap(StoreValue.first, StoreValue.second);
6221 Opcode, DL, DAG.getVTList(MVT::Other),
6222 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6223 StoreNode->getBasePtr()},
6224 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6225 return Result;
6226}
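The 128-bit store above is split into low and high 64-bit halves for the STP/STILP operands, and the halves are swapped on big-endian targets so the in-memory image is the same either way. A host-side model of that operand selection (plain C++, the 128-bit value modelled as two 64-bit words):

#include <cassert>
#include <cstdint>
#include <utility>

// Split a 128-bit value (Lo word first) and swap the register operands for
// big-endian, mirroring the std::swap above.
std::pair<uint64_t, uint64_t> stpOperands(uint64_t Lo, uint64_t Hi,
                                          bool IsBigEndian) {
  std::pair<uint64_t, uint64_t> Ops{Lo, Hi};
  if (IsBigEndian)
    std::swap(Ops.first, Ops.second);
  return Ops;
}

int main() {
  auto LE = stpOperands(0x1111, 0x2222, /*IsBigEndian=*/false);
  auto BE = stpOperands(0x1111, 0x2222, /*IsBigEndian=*/true);
  assert(LE.first == 0x1111 && BE.first == 0x2222);
  return 0;
}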
6227
6228SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6229 SelectionDAG &DAG) const {
6230 SDLoc DL(Op);
6231 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
6232 assert(LoadNode && "Expected custom lowering of a load node");
6233
6234 if (LoadNode->getMemoryVT() == MVT::i64x8) {
6236 SDValue Base = LoadNode->getBasePtr();
6237 SDValue Chain = LoadNode->getChain();
6238 EVT PtrVT = Base.getValueType();
6239 for (unsigned i = 0; i < 8; i++) {
6240 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
6241 DAG.getConstant(i * 8, DL, PtrVT));
6242 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
6243 LoadNode->getPointerInfo(),
6244 LoadNode->getOriginalAlign());
6245 Ops.push_back(Part);
6246 Chain = SDValue(Part.getNode(), 1);
6247 }
6248 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
6249 return DAG.getMergeValues({Loaded, Chain}, DL);
6250 }
6251
6252 // Custom lowering for extending v4i8 vector loads.
6253 EVT VT = Op->getValueType(0);
6254 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
6255
6256 if (LoadNode->getMemoryVT() != MVT::v4i8)
6257 return SDValue();
6258
6259 unsigned ExtType;
6260 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
6261 ExtType = ISD::SIGN_EXTEND;
6262 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
6263 LoadNode->getExtensionType() == ISD::EXTLOAD)
6264 ExtType = ISD::ZERO_EXTEND;
6265 else
6266 return SDValue();
6267
6268 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
6269 LoadNode->getBasePtr(), MachinePointerInfo());
6270 SDValue Chain = Load.getValue(1);
6271 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
6272 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
6273 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
6274 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
6275 DAG.getConstant(0, DL, MVT::i64));
6276 if (VT == MVT::v4i32)
6277 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
6278 return DAG.getMergeValues({Ext, Chain}, DL);
6279}
6280
6281// Generate SUBS and CSEL for integer abs.
6282SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
6283 MVT VT = Op.getSimpleValueType();
6284
6285 if (VT.isVector())
6286 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
6287
6288 SDLoc DL(Op);
6289 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
6290 Op.getOperand(0));
6291 // Generate SUBS & CSEL.
6292 SDValue Cmp =
6293 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
6294 Op.getOperand(0), DAG.getConstant(0, DL, VT));
6295 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
6296 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
6297 Cmp.getValue(1));
6298}
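For scalars, the abs lowering above is a compare against zero followed by a conditional select between the original value and its negation: SUBS sets the flags and CSEL picks on PL ("plus or zero", i.e. the N flag clear). The same dataflow in plain C++, ignoring the INT64_MIN overflow corner case:

#include <cassert>
#include <cstdint>

// Scalar model of SUBS + CSEL integer abs: keep X when it is >= 0 (the PL
// condition), otherwise take 0 - X. Negating INT64_MIN is left out of scope.
int64_t absViaSelect(int64_t X) {
  int64_t Neg = 0 - X; // the SUB producing the negated value
  bool PL = X >= 0;    // SUBS X, #0 leaves the N flag clear exactly when X >= 0
  return PL ? X : Neg; // CSEL
}

int main() {
  assert(absViaSelect(-5) == 5);
  assert(absViaSelect(7) == 7);
  return 0;
}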
6299
6301 SDValue Chain = Op.getOperand(0);
6302 SDValue Cond = Op.getOperand(1);
6303 SDValue Dest = Op.getOperand(2);
6304
6306 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
6307 SDLoc dl(Op);
6308 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
6309 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6310 Cmp);
6311 }
6312
6313 return SDValue();
6314}
6315
6316// Treat FSHR with constant shifts as a legal operation; otherwise it is
6317// expanded. FSHL is converted to FSHR before deciding what to do with it.
6319 SDValue Shifts = Op.getOperand(2);
6320 // Check if the shift amount is a constant
6321 // If opcode is FSHL, convert it to FSHR
6322 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
6323 SDLoc DL(Op);
6324 MVT VT = Op.getSimpleValueType();
6325
6326 if (Op.getOpcode() == ISD::FSHL) {
6327 unsigned int NewShiftNo =
6328 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
6329 return DAG.getNode(
6330 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
6331 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
6332 } else if (Op.getOpcode() == ISD::FSHR) {
6333 return Op;
6334 }
6335 }
6336
6337 return SDValue();
6338}
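The FSHL-to-FSHR rewrite relies on the identity fshl(a, b, s) == fshr(a, b, w - s) for a constant shift s with 0 < s < w, where w is the bit width; that is exactly the NewShiftNo computation above. A quick standalone check of the identity on 32-bit values (edge cases s == 0 and s == w elided):

#include <cassert>
#include <cstdint>

// Funnel shifts over the 64-bit concatenation Hi:Lo, for 0 < S < 32.
uint32_t fshl32(uint32_t Hi, uint32_t Lo, unsigned S) {
  return (Hi << S) | (Lo >> (32 - S));
}
uint32_t fshr32(uint32_t Hi, uint32_t Lo, unsigned S) {
  return (Hi << (32 - S)) | (Lo >> S);
}

int main() {
  const uint32_t Hi = 0xDEADBEEF, Lo = 0x12345678;
  for (unsigned S = 1; S < 32; ++S)
    assert(fshl32(Hi, Lo, S) == fshr32(Hi, Lo, 32 - S));
  return 0;
}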
6339
6341 SDValue X = Op.getOperand(0);
6342 EVT XScalarTy = X.getValueType();
6343 SDValue Exp = Op.getOperand(1);
6344
6345 SDLoc DL(Op);
6346 EVT XVT, ExpVT;
6347 switch (Op.getSimpleValueType().SimpleTy) {
6348 default:
6349 return SDValue();
6350 case MVT::bf16:
6351 case MVT::f16:
6352 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
6353 [[fallthrough]];
6354 case MVT::f32:
6355 XVT = MVT::nxv4f32;
6356 ExpVT = MVT::nxv4i32;
6357 break;
6358 case MVT::f64:
6359 XVT = MVT::nxv2f64;
6360 ExpVT = MVT::nxv2i64;
6361 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
6362 break;
6363 }
6364
6365 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
6366 SDValue VX =
6367 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
6368 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
6369 DAG.getUNDEF(ExpVT), Exp, Zero);
6370 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
6371 AArch64SVEPredPattern::all);
6372 SDValue FScale =
6374 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
6375 VPg, VX, VExp);
6376 SDValue Final =
6377 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
6378 if (X.getValueType() != XScalarTy)
6379 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
6380 DAG.getIntPtrConstant(1, SDLoc(Op)));
6381 return Final;
6382}
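FLDEXP computes x * 2^exp; the lowering above broadcasts both operands into single active lanes of SVE vectors and uses the sve.fscale intrinsic, extending f16/bf16 inputs to f32 first and rounding back at the end. The scalar semantics being implemented are those of ldexp:

#include <cassert>
#include <cmath>

int main() {
  // The operation lowered above: scale a floating-point value by 2^Exp.
  double X = 1.5;
  int Exp = 4;
  assert(std::ldexp(X, Exp) == X * 16.0); // 1.5 * 2^4 == 24.0
  return 0;
}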
6383
6385 SelectionDAG &DAG) const {
6386 LLVM_DEBUG(dbgs() << "Custom lowering: ");
6387 LLVM_DEBUG(Op.dump());
6388
6389 switch (Op.getOpcode()) {
6390 default:
6391 llvm_unreachable("unimplemented operand");
6392 return SDValue();
6393 case ISD::BITCAST:
6394 return LowerBITCAST(Op, DAG);
6395 case ISD::GlobalAddress:
6396 return LowerGlobalAddress(Op, DAG);
6398 return LowerGlobalTLSAddress(Op, DAG);
6399 case ISD::SETCC:
6400 case ISD::STRICT_FSETCC:
6402 return LowerSETCC(Op, DAG);
6403 case ISD::SETCCCARRY:
6404 return LowerSETCCCARRY(Op, DAG);
6405 case ISD::BRCOND:
6406 return LowerBRCOND(Op, DAG);
6407 case ISD::BR_CC:
6408 return LowerBR_CC(Op, DAG);
6409 case ISD::SELECT:
6410 return LowerSELECT(Op, DAG);
6411 case ISD::SELECT_CC:
6412 return LowerSELECT_CC(Op, DAG);
6413 case ISD::JumpTable:
6414 return LowerJumpTable(Op, DAG);
6415 case ISD::BR_JT:
6416 return LowerBR_JT(Op, DAG);
6417 case ISD::ConstantPool:
6418 return LowerConstantPool(Op, DAG);
6419 case ISD::BlockAddress:
6420 return LowerBlockAddress(Op, DAG);
6421 case ISD::VASTART:
6422 return LowerVASTART(Op, DAG);
6423 case ISD::VACOPY:
6424 return LowerVACOPY(Op, DAG);
6425 case ISD::VAARG:
6426 return LowerVAARG(Op, DAG);
6427 case ISD::UADDO_CARRY:
6428 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
6429 case ISD::USUBO_CARRY:
6430 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
6431 case ISD::SADDO_CARRY:
6432 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
6433 case ISD::SSUBO_CARRY:
6434 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
6435 case ISD::SADDO:
6436 case ISD::UADDO:
6437 case ISD::SSUBO:
6438 case ISD::USUBO:
6439 case ISD::SMULO:
6440 case ISD::UMULO:
6441 return LowerXALUO(Op, DAG);
6442 case ISD::FADD:
6443 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
6444 case ISD::FSUB:
6445 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
6446 case ISD::FMUL:
6447 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
6448 case ISD::FMA:
6449 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
6450 case ISD::FDIV:
6451 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
6452 case ISD::FNEG:
6453 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
6454 case ISD::FCEIL:
6455 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
6456 case ISD::FFLOOR:
6457 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
6458 case ISD::FNEARBYINT:
6459 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
6460 case ISD::FRINT:
6461 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
6462 case ISD::FROUND:
6463 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
6464 case ISD::FROUNDEVEN:
6465 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
6466 case ISD::FTRUNC:
6467 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
6468 case ISD::FSQRT:
6469 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
6470 case ISD::FABS:
6471 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
6472 case ISD::FP_ROUND:
6474 return LowerFP_ROUND(Op, DAG);
6475 case ISD::FP_EXTEND:
6476 return LowerFP_EXTEND(Op, DAG);
6477 case ISD::FRAMEADDR:
6478 return LowerFRAMEADDR(Op, DAG);
6479 case ISD::SPONENTRY:
6480 return LowerSPONENTRY(Op, DAG);
6481 case ISD::RETURNADDR:
6482 return LowerRETURNADDR(Op, DAG);
6484 return LowerADDROFRETURNADDR(Op, DAG);
6486 return LowerCONCAT_VECTORS(Op, DAG);
6488 return LowerINSERT_VECTOR_ELT(Op, DAG);
6490 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6491 case ISD::BUILD_VECTOR:
6492 return LowerBUILD_VECTOR(Op, DAG);
6494 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
6496 return LowerVECTOR_SHUFFLE(Op, DAG);
6497 case ISD::SPLAT_VECTOR:
6498 return LowerSPLAT_VECTOR(Op, DAG);
6500 return LowerEXTRACT_SUBVECTOR(Op, DAG);
6502 return LowerINSERT_SUBVECTOR(Op, DAG);
6503 case ISD::SDIV:
6504 case ISD::UDIV:
6505 return LowerDIV(Op, DAG);
6506 case ISD::SMIN:
6507 case ISD::UMIN:
6508 case ISD::SMAX:
6509 case ISD::UMAX:
6510 return LowerMinMax(Op, DAG);
6511 case ISD::SRA:
6512 case ISD::SRL:
6513 case ISD::SHL:
6514 return LowerVectorSRA_SRL_SHL(Op, DAG);
6515 case ISD::SHL_PARTS:
6516 case ISD::SRL_PARTS:
6517 case ISD::SRA_PARTS:
6518 return LowerShiftParts(Op, DAG);
6519 case ISD::CTPOP:
6520 case ISD::PARITY:
6521 return LowerCTPOP_PARITY(Op, DAG);
6522 case ISD::FCOPYSIGN:
6523 return LowerFCOPYSIGN(Op, DAG);
6524 case ISD::OR:
6525 return LowerVectorOR(Op, DAG);
6526 case ISD::XOR:
6527 return LowerXOR(Op, DAG);
6528 case ISD::PREFETCH:
6529 return LowerPREFETCH(Op, DAG);
6530 case ISD::SINT_TO_FP:
6531 case ISD::UINT_TO_FP:
6534 return LowerINT_TO_FP(Op, DAG);
6535 case ISD::FP_TO_SINT:
6536 case ISD::FP_TO_UINT:
6539 return LowerFP_TO_INT(Op, DAG);
6542 return LowerFP_TO_INT_SAT(Op, DAG);
6543 case ISD::FSINCOS:
6544 return LowerFSINCOS(Op, DAG);
6545 case ISD::GET_ROUNDING:
6546 return LowerGET_ROUNDING(Op, DAG);
6547 case ISD::SET_ROUNDING:
6548 return LowerSET_ROUNDING(Op, DAG);
6549 case ISD::GET_FPMODE:
6550 return LowerGET_FPMODE(Op, DAG);
6551 case ISD::SET_FPMODE:
6552 return LowerSET_FPMODE(Op, DAG);
6553 case ISD::RESET_FPMODE:
6554 return LowerRESET_FPMODE(Op, DAG);
6555 case ISD::MUL:
6556 return LowerMUL(Op, DAG);
6557 case ISD::MULHS:
6558 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
6559 case ISD::MULHU:
6560 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
6562 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6564 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6566 return LowerINTRINSIC_VOID(Op, DAG);
6567 case ISD::ATOMIC_STORE:
6568 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
6569 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
6570 return LowerStore128(Op, DAG);
6571 }
6572 return SDValue();
6573 case ISD::STORE:
6574 return LowerSTORE(Op, DAG);
6575 case ISD::MSTORE:
6576 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
6577 case ISD::MGATHER:
6578 return LowerMGATHER(Op, DAG);
6579 case ISD::MSCATTER:
6580 return LowerMSCATTER(Op, DAG);
6582 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
6583 case ISD::VECREDUCE_ADD:
6584 case ISD::VECREDUCE_AND:
6585 case ISD::VECREDUCE_OR:
6586 case ISD::VECREDUCE_XOR:
6596 return LowerVECREDUCE(Op, DAG);
6598 return LowerATOMIC_LOAD_AND(Op, DAG);
6600 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6601 case ISD::VSCALE:
6602 return LowerVSCALE(Op, DAG);
6603 case ISD::ANY_EXTEND:
6604 case ISD::SIGN_EXTEND:
6605 case ISD::ZERO_EXTEND:
6606 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
6608 // Only custom lower when ExtraVT has a legal byte based element type.
6609 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
6610 EVT ExtraEltVT = ExtraVT.getVectorElementType();
6611 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
6612 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
6613 return SDValue();
6614
6615 return LowerToPredicatedOp(Op, DAG,
6617 }
6618 case ISD::TRUNCATE:
6619 return LowerTRUNCATE(Op, DAG);
6620 case ISD::MLOAD:
6621 return LowerMLOAD(Op, DAG);
6622 case ISD::LOAD:
6623 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
6624 !Subtarget->isNeonAvailable()))
6625 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
6626 return LowerLOAD(Op, DAG);
6627 case ISD::ADD:
6628 case ISD::AND:
6629 case ISD::SUB:
6630 return LowerToScalableOp(Op, DAG);
6631 case ISD::FMAXIMUM:
6632 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
6633 case ISD::FMAXNUM:
6634 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
6635 case ISD::FMINIMUM:
6636 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
6637 case ISD::FMINNUM:
6638 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
6639 case ISD::VSELECT:
6640 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
6641 case ISD::ABS:
6642 return LowerABS(Op, DAG);
6643 case ISD::ABDS:
6644 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
6645 case ISD::ABDU:
6646 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
6647 case ISD::AVGFLOORS:
6648 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
6649 case ISD::AVGFLOORU:
6650 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
6651 case ISD::AVGCEILS:
6652 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
6653 case ISD::AVGCEILU:
6654 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
6655 case ISD::BITREVERSE:
6656 return LowerBitreverse(Op, DAG);
6657 case ISD::BSWAP:
6658 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
6659 case ISD::CTLZ:
6660 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
6661 case ISD::CTTZ:
6662 return LowerCTTZ(Op, DAG);
6663 case ISD::VECTOR_SPLICE:
6664 return LowerVECTOR_SPLICE(Op, DAG);
6666 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
6668 return LowerVECTOR_INTERLEAVE(Op, DAG);
6669 case ISD::LROUND:
6670 case ISD::LLROUND:
6671 case ISD::LRINT:
6672 case ISD::LLRINT: {
6673 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
6674 Op.getOperand(0).getValueType() == MVT::bf16) &&
6675 "Expected custom lowering of rounding operations only for f16");
6676 SDLoc DL(Op);
6677 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
6678 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
6679 }
6680 case ISD::STRICT_LROUND:
6682 case ISD::STRICT_LRINT:
6683 case ISD::STRICT_LLRINT: {
6684 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
6685 Op.getOperand(1).getValueType() == MVT::bf16) &&
6686 "Expected custom lowering of rounding operations only for f16");
6687 SDLoc DL(Op);
6688 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
6689 {Op.getOperand(0), Op.getOperand(1)});
6690 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
6691 {Ext.getValue(1), Ext.getValue(0)});
6692 }
6693 case ISD::WRITE_REGISTER: {
6694 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
6695 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
6696 SDLoc DL(Op);
6697
6698 SDValue Chain = Op.getOperand(0);
6699 SDValue SysRegName = Op.getOperand(1);
6700 std::pair<SDValue, SDValue> Pair =
6701 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
6702
6703 // chain = MSRR(chain, sysregname, lo, hi)
6704 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
6705 SysRegName, Pair.first, Pair.second);
6706
6707 return Result;
6708 }
6709 case ISD::FSHL:
6710 case ISD::FSHR:
6711 return LowerFunnelShift(Op, DAG);
6712 case ISD::FLDEXP:
6713 return LowerFLDEXP(Op, DAG);
6714 }
6715}
6716
6718 return !Subtarget->useSVEForFixedLengthVectors();
6719}
6720
6722 EVT VT, bool OverrideNEON) const {
6723 if (!VT.isFixedLengthVector() || !VT.isSimple())
6724 return false;
6725
6726 // Don't use SVE for vectors we cannot scalarize if required.
6727 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
6728 // Fixed length predicates should be promoted to i8.
6729 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
6730 case MVT::i1:
6731 default:
6732 return false;
6733 case MVT::i8:
6734 case MVT::i16:
6735 case MVT::i32:
6736 case MVT::i64:
6737 case MVT::f16:
6738 case MVT::f32:
6739 case MVT::f64:
6740 break;
6741 }
6742
6743 // NEON-sized vectors can be emulated using SVE instructions.
6744 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
6745 return Subtarget->hasSVEorSME();
6746
6747 // Ensure NEON MVTs only belong to a single register class.
6748 if (VT.getFixedSizeInBits() <= 128)
6749 return false;
6750
6751 // Ensure wider than NEON code generation is enabled.
6752 if (!Subtarget->useSVEForFixedLengthVectors())
6753 return false;
6754
6755 // Don't use SVE for types that don't fit.
6756 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
6757 return false;
6758
6759 // TODO: Perhaps an artificial restriction, but worth having whilst getting
6760 // the base fixed length SVE support in place.
6761 if (!VT.isPow2VectorType())
6762 return false;
6763
6764 return true;
6765}
6766
6767//===----------------------------------------------------------------------===//
6768// Calling Convention Implementation
6769//===----------------------------------------------------------------------===//
6770
6771static unsigned getIntrinsicID(const SDNode *N) {
6772 unsigned Opcode = N->getOpcode();
6773 switch (Opcode) {
6774 default:
6777 unsigned IID = N->getConstantOperandVal(0);
6778 if (IID < Intrinsic::num_intrinsics)
6779 return IID;
6781 }
6782 }
6783}
6784
6786 SDValue N1) const {
6787 if (!N0.hasOneUse())
6788 return false;
6789
6790 unsigned IID = getIntrinsicID(N1.getNode());
6791 // Avoid reassociating expressions that can be lowered to smlal/umlal.
6792 if (IID == Intrinsic::aarch64_neon_umull ||
6793 N1.getOpcode() == AArch64ISD::UMULL ||
6794 IID == Intrinsic::aarch64_neon_smull ||
6796 return N0.getOpcode() != ISD::ADD;
6797
6798 return true;
6799}
6800
6801/// Selects the correct CCAssignFn for a given CallingConvention value.
6803 bool IsVarArg) const {
6804 switch (CC) {
6805 default:
6806 report_fatal_error("Unsupported calling convention.");
6807 case CallingConv::GHC:
6808 return CC_AArch64_GHC;
6809 case CallingConv::C:
6810 case CallingConv::Fast:
6814 case CallingConv::Swift:
6816 case CallingConv::Tail:
6817 case CallingConv::GRAAL:
6818 if (Subtarget->isTargetWindows()) {
6819 if (IsVarArg) {
6820 if (Subtarget->isWindowsArm64EC())
6823 }
6824 return CC_AArch64_Win64PCS;
6825 }
6826 if (!Subtarget->isTargetDarwin())
6827 return CC_AArch64_AAPCS;
6828 if (!IsVarArg)
6829 return CC_AArch64_DarwinPCS;
6832 case CallingConv::Win64:
6833 if (IsVarArg) {
6834 if (Subtarget->isWindowsArm64EC())
6837 }
6838 return CC_AArch64_Win64PCS;
6840 if (Subtarget->isWindowsArm64EC())
6847 return CC_AArch64_AAPCS;
6852 }
6853}
6854
6855CCAssignFn *
6857 switch (CC) {
6858 default:
6859 return RetCC_AArch64_AAPCS;
6863 if (Subtarget->isWindowsArm64EC())
6865 return RetCC_AArch64_AAPCS;
6866 }
6867}
6868
6869
6870unsigned
6871AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
6872 SelectionDAG &DAG) const {
6874 MachineFrameInfo &MFI = MF.getFrameInfo();
6875
6876 // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
6877 SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6878 DAG.getConstant(1, DL, MVT::i32));
6879 SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
6880 SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
6881 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
6882 SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
6883 Chain = Buffer.getValue(1);
6884 MFI.CreateVariableSizedObject(Align(1), nullptr);
6885
6886 // Allocate an additional TPIDR2 object on the stack (16 bytes)
6887 unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);
6888
6889 // Store the buffer pointer to the TPIDR2 stack object.
6892 TPIDR2Obj,
6894 Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);
6895
6896 // Set the reserved bytes (10-15) to zero
6897 EVT PtrTy = Ptr.getValueType();
6898 SDValue ReservedPtr =
6899 DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy));
6900 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr,
6901 MPI);
6902 ReservedPtr =
6903 DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy));
6904 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr,
6905 MPI);
6906
6907 return TPIDR2Obj;
6908}
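The TPIDR2 block allocated above is a 16-byte stack object: the lazy-save buffer pointer is stored at offset 0, and bytes 10-15 are explicitly zeroed with a 2-byte store at offset 10 followed by a 4-byte store at offset 12. The sketch below only models what those stores do; any further field layout of the block is outside what the code above shows.

#include <cstdint>
#include <cstring>

// Descriptive model of the 16-byte TPIDR2 stack object initialised above:
// 8-byte buffer pointer at offset 0, bytes 10..15 cleared to zero.
void initTPIDR2Block(unsigned char Block[16], uint64_t BufferAddr) {
  std::memcpy(Block, &BufferAddr, 8); // buffer pointer at offset 0
  uint16_t Zero16 = 0;
  uint32_t Zero32 = 0;
  std::memcpy(Block + 10, &Zero16, 2); // 2-byte store at offset 10
  std::memcpy(Block + 12, &Zero32, 4); // 4-byte store at offset 12
}

int main() {
  unsigned char Block[16];
  std::memset(Block, 0xFF, sizeof(Block));
  initTPIDR2Block(Block, 0x1000);
  return Block[15] == 0 ? 0 : 1;
}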
6909
6910static bool isPassedInFPR(EVT VT) {
6911 return VT.isFixedLengthVector() ||
6912 (VT.isFloatingPoint() && !VT.isScalableVector());
6913}
6914
6915SDValue AArch64TargetLowering::LowerFormalArguments(
6916 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
6917 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
6918 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
6920 const Function &F = MF.getFunction();
6921 MachineFrameInfo &MFI = MF.getFrameInfo();
6922 bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
6923 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
6924 (isVarArg && Subtarget->isWindowsArm64EC());
6926
6928 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
6930 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
6931 FuncInfo->setIsSVECC(true);
6932
6933 // Assign locations to all of the incoming arguments.
6935 DenseMap<unsigned, SDValue> CopiedRegs;
6936 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
6937
6938 // At this point, Ins[].VT may already be promoted to i32. To correctly
6939 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
6940 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
6941 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
6942 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
6943 // LocVT.
6944 unsigned NumArgs = Ins.size();
6945 Function::const_arg_iterator CurOrigArg = F.arg_begin();
6946 unsigned CurArgIdx = 0;
6947 for (unsigned i = 0; i != NumArgs; ++i) {
6948 MVT ValVT = Ins[i].VT;
6949 if (Ins[i].isOrigArg()) {
6950 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
6951 CurArgIdx = Ins[i].getOrigArgIndex();
6952
6953 // Get type of the original argument.
6954 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
6955 /*AllowUnknown*/ true);
6956 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
6957 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
6958 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
6959 ValVT = MVT::i8;
6960 else if (ActualMVT == MVT::i16)
6961 ValVT = MVT::i16;
6962 }
6963 bool UseVarArgCC = false;
6964 if (IsWin64)
6965 UseVarArgCC = isVarArg;
6966 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
6967 bool Res =
6968 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
6969 assert(!Res && "Call operand has unhandled type");
6970 (void)Res;
6971 }
6972
6974 bool IsLocallyStreaming =
6975 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
6976 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
6977 SDValue Glue = Chain.getValue(1);
6978
6979 SmallVector<SDValue, 16> ArgValues;
6980 unsigned ExtraArgLocs = 0;
6981 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
6982 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
6983
6984 if (Ins[i].Flags.isByVal()) {
6985 // Byval is used for HFAs in the PCS, but the system should work in a
6986 // non-compliant manner for larger structs.
6987 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6988 int Size = Ins[i].Flags.getByValSize();
6989 unsigned NumRegs = (Size + 7) / 8;
6990
6991 // FIXME: This works on big-endian for composite byvals, which are the common
6992 // case. It should work for fundamental types too.
6993 unsigned FrameIdx =
6994 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
6995 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
6996 InVals.push_back(FrameIdxN);
6997
6998 continue;
6999 }
7000
7001 if (Ins[i].Flags.isSwiftAsync())
7003
7004 SDValue ArgValue;
7005 if (VA.isRegLoc()) {
7006 // Arguments stored in registers.
7007 EVT RegVT = VA.getLocVT();
7008 const TargetRegisterClass *RC;
7009
7010 if (RegVT == MVT::i32)
7011 RC = &AArch64::GPR32RegClass;
7012 else if (RegVT == MVT::i64)
7013 RC = &AArch64::GPR64RegClass;
7014 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
7015 RC = &AArch64::FPR16RegClass;
7016 else if (RegVT == MVT::f32)
7017 RC = &AArch64::FPR32RegClass;
7018 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
7019 RC = &AArch64::FPR64RegClass;
7020 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
7021 RC = &AArch64::FPR128RegClass;
7022 else if (RegVT.isScalableVector() &&
7023 RegVT.getVectorElementType() == MVT::i1) {
7024 FuncInfo->setIsSVECC(true);
7025 RC = &AArch64::PPRRegClass;
7026 } else if (RegVT == MVT::aarch64svcount) {
7027 FuncInfo->setIsSVECC(true);
7028 RC = &AArch64::PPRRegClass;
7029 } else if (RegVT.isScalableVector()) {
7030 FuncInfo->setIsSVECC(true);
7031 RC = &AArch64::ZPRRegClass;
7032 } else
7033 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
7034
7035 // Transform the arguments in physical registers into virtual ones.
7036 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
7037
7038 if (IsLocallyStreaming) {
7039 // LocallyStreamingFunctions must insert the SMSTART in the correct
7040 // position, so we use Glue to ensure no instructions can be scheduled
7041 // between the chain of:
7042 // t0: ch,glue = EntryNode
7043 // t1: res,ch,glue = CopyFromReg
7044 // ...
7045 // tn: res,ch,glue = CopyFromReg t(n-1), ..
7046 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
7047 // ^^^^^^
7048 // This will be the new Chain/Root node.
7049 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
7050 Glue = ArgValue.getValue(2);
7051 if (isPassedInFPR(ArgValue.getValueType())) {
7052 ArgValue =
7054 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
7055 {ArgValue, Glue});
7056 Glue = ArgValue.getValue(1);
7057 }
7058 } else
7059 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
7060
7061 // If this is an 8, 16 or 32-bit value, it is really passed promoted
7062 // to 64 bits. Insert an assert[sz]ext to capture this, then
7063 // truncate to the right size.
7064 switch (VA.getLocInfo()) {
7065 default:
7066 llvm_unreachable("Unknown loc info!");
7067 case CCValAssign::Full:
7068 break;
7070 assert(
7071 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7072 "Indirect arguments should be scalable on most subtargets");
7073 break;
7074 case CCValAssign::BCvt:
7075 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
7076 break;
7077 case CCValAssign::AExt:
7078 case CCValAssign::SExt:
7079 case CCValAssign::ZExt:
7080 break;
7082 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
7083 DAG.getConstant(32, DL, RegVT));
7084 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
7085 break;
7086 }
7087 } else { // VA.isRegLoc()
7088 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7089 unsigned ArgOffset = VA.getLocMemOffset();
7090 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
7091 ? VA.getLocVT().getSizeInBits()
7092 : VA.getValVT().getSizeInBits()) / 8;
7093
7094 uint32_t BEAlign = 0;
7095 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
7096 !Ins[i].Flags.isInConsecutiveRegs())
7097 BEAlign = 8 - ArgSize;
7098
7099 SDValue FIN;
7100 MachinePointerInfo PtrInfo;
7101 if (StackViaX4) {
7102 // In both the ARM64EC varargs convention and the thunk convention,
7103 // arguments on the stack are accessed relative to x4, not sp. In
7104 // the thunk convention, there's an additional offset of 32 bytes
7105 // to account for the shadow store.
7106 unsigned ObjOffset = ArgOffset + BEAlign;
7107 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
7108 ObjOffset += 32;
7109 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7110 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7111 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
7112 DAG.getConstant(ObjOffset, DL, MVT::i64));
7114 } else {
7115 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
7116
7117 // Create load nodes to retrieve arguments from the stack.
7118 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
7119 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
7120 }
7121
7122 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
7124 MVT MemVT = VA.getValVT();
7125
7126 switch (VA.getLocInfo()) {
7127 default:
7128 break;
7129 case CCValAssign::Trunc:
7130 case CCValAssign::BCvt:
7131 MemVT = VA.getLocVT();
7132 break;
7135 Subtarget->isWindowsArm64EC()) &&
7136 "Indirect arguments should be scalable on most subtargets");
7137 MemVT = VA.getLocVT();
7138 break;
7139 case CCValAssign::SExt:
7140 ExtType = ISD::SEXTLOAD;
7141 break;
7142 case CCValAssign::ZExt:
7143 ExtType = ISD::ZEXTLOAD;
7144 break;
7145 case CCValAssign::AExt:
7146 ExtType = ISD::EXTLOAD;
7147 break;
7148 }
7149
7150 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
7151 MemVT);
7152 }
7153
7154 if (VA.getLocInfo() == CCValAssign::Indirect) {
7155 assert((VA.getValVT().isScalableVT() ||
7156 Subtarget->isWindowsArm64EC()) &&
7157 "Indirect arguments should be scalable on most subtargets");
7158
7159 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
7160 unsigned NumParts = 1;
7161 if (Ins[i].Flags.isInConsecutiveRegs()) {
7162 assert(!Ins[i].Flags.isInConsecutiveRegsLast());
7163 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7164 ++NumParts;
7165 }
7166
7167 MVT PartLoad = VA.getValVT();
7168 SDValue Ptr = ArgValue;
7169
7170 // Ensure we generate all loads for each tuple part, whilst updating the
7171 // pointer after each load correctly using vscale.
7172 while (NumParts > 0) {
7173 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
7174 InVals.push_back(ArgValue);
7175 NumParts--;
7176 if (NumParts > 0) {
7177 SDValue BytesIncrement;
7178 if (PartLoad.isScalableVector()) {
7179 BytesIncrement = DAG.getVScale(
7180 DL, Ptr.getValueType(),
7181 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
7182 } else {
7183 BytesIncrement = DAG.getConstant(
7184 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
7185 Ptr.getValueType());
7186 }
7188 Flags.setNoUnsignedWrap(true);
7189 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7190 BytesIncrement, Flags);
7191 ExtraArgLocs++;
7192 i++;
7193 }
7194 }
7195 } else {
7196 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
7197 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
7198 ArgValue, DAG.getValueType(MVT::i32));
7199
7200 // i1 arguments are zero-extended to i8 by the caller. Emit a
7201 // hint to reflect this.
7202 if (Ins[i].isOrigArg()) {
7203 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
7204 if (OrigArg->getType()->isIntegerTy(1)) {
7205 if (!Ins[i].Flags.isZExt()) {
7206 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
7207 ArgValue.getValueType(), ArgValue);
7208 }
7209 }
7210 }
7211
7212 InVals.push_back(ArgValue);
7213 }
7214 }
7215 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
7216
7217 // Insert the SMSTART if this is a locally streaming function and
7218 // make sure it is Glued to the last CopyFromReg value.
7219 if (IsLocallyStreaming) {
7220 SDValue PStateSM;
7221 if (Attrs.hasStreamingCompatibleInterface()) {
7222 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
7225 FuncInfo->setPStateSMReg(Reg);
7226 Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
7227 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7229 } else
7230 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7232
7233 // Ensure that the SMSTART happens after the CopyWithChain such that its
7234 // chain result is used.
7235 for (unsigned I=0; I<InVals.size(); ++I) {
7237 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
7238 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
7239 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
7240 InVals[I].getValueType());
7241 }
7242 }
7243
7244 // varargs
7245 if (isVarArg) {
7246 if (!Subtarget->isTargetDarwin() || IsWin64) {
7247 // The AAPCS variadic function ABI is identical to the non-variadic
7248 // one. As a result there may be more arguments in registers and we should
7249 // save them for future reference.
7250 // Win64 variadic functions also pass arguments in registers, but all float
7251 // arguments are passed in integer registers.
7252 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
7253 }
7254
7255 // This will point to the next argument passed via stack.
7256 unsigned VarArgsOffset = CCInfo.getStackSize();
7257 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
7258 VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
7259 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
7260 FuncInfo->setVarArgsStackIndex(
7261 MFI.CreateFixedObject(4, VarArgsOffset, true));
7262
7263 if (MFI.hasMustTailInVarArgFunc()) {
7264 SmallVector<MVT, 2> RegParmTypes;
7265 RegParmTypes.push_back(MVT::i64);
7266 RegParmTypes.push_back(MVT::f128);
7267 // Compute the set of forwarded registers. The rest are scratch.
7269 FuncInfo->getForwardedMustTailRegParms();
7270 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
7272
7273 // Conservatively forward X8, since it might be used for aggregate return.
7274 if (!CCInfo.isAllocated(AArch64::X8)) {
7275 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
7276 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
7277 }
7278 }
7279 }
7280
7281 // On Windows, InReg pointers must be returned, so record the pointer in a
7282 // virtual register at the start of the function so it can be returned in the
7283 // epilogue.
7284 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
7285 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
7286 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
7287 Ins[I].Flags.isInReg()) &&
7288 Ins[I].Flags.isSRet()) {
7289 assert(!FuncInfo->getSRetReturnReg());
7290
7291 MVT PtrTy = getPointerTy(DAG.getDataLayout());
7292 Register Reg =
7294 FuncInfo->setSRetReturnReg(Reg);
7295
7296 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
7297 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
7298 break;
7299 }
7300 }
7301 }
7302
7303 unsigned StackArgSize = CCInfo.getStackSize();
7304 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7305 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
7306 // This is a non-standard ABI so by fiat I say we're allowed to make full
7307 // use of the stack area to be popped, which must be aligned to 16 bytes in
7308 // any case:
7309 StackArgSize = alignTo(StackArgSize, 16);
7310
7311 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
7312 // a multiple of 16.
7313 FuncInfo->setArgumentStackToRestore(StackArgSize);
7314
7315 // This realignment carries over to the available bytes below. Our own
7316 // callers will guarantee the space is free by giving an aligned value to
7317 // CALLSEQ_START.
7318 }
7319 // Even if we're not expected to free up the space, it's useful to know how
7320 // much is there while considering tail calls (because we can reuse it).
7321 FuncInfo->setBytesInStackArgArea(StackArgSize);
7322
7323 if (Subtarget->hasCustomCallingConv())
7325
7326 // Conservatively assume the function requires the lazy-save mechanism.
7327 if (SMEAttrs(MF.getFunction()).hasZAState()) {
7328 unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
7329 FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
7330 }
7331
7332 return Chain;
7333}
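One detail in the stack-argument path above: on big-endian targets an argument smaller than 8 bytes that is not part of a consecutive-register block is placed at the high-address end of its 8-byte slot, so the load offset gets BEAlign = 8 - ArgSize added. The adjustment in isolation:

#include <cassert>

// Big-endian adjustment for a stack argument occupying an 8-byte slot
// (mirrors the BEAlign computation above; the consecutive-regs exclusion is
// assumed to have been checked by the caller).
unsigned beAlign(bool IsLittleEndian, unsigned ArgSizeBytes) {
  return (!IsLittleEndian && ArgSizeBytes < 8) ? 8 - ArgSizeBytes : 0;
}

int main() {
  assert(beAlign(/*IsLittleEndian=*/false, /*ArgSizeBytes=*/4) == 4);
  assert(beAlign(/*IsLittleEndian=*/true, /*ArgSizeBytes=*/4) == 0);
  return 0;
}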
7334
7335void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
7336 SelectionDAG &DAG,
7337 const SDLoc &DL,
7338 SDValue &Chain) const {
7340 MachineFrameInfo &MFI = MF.getFrameInfo();
7342 auto PtrVT = getPointerTy(DAG.getDataLayout());
7343 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
7344
7346
7348 unsigned NumGPRArgRegs = GPRArgRegs.size();
7349 if (Subtarget->isWindowsArm64EC()) {
7350 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
7351 // functions.
7352 NumGPRArgRegs = 4;
7353 }
7354 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
7355
7356 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
7357 int GPRIdx = 0;
7358 if (GPRSaveSize != 0) {
7359 if (IsWin64) {
7360 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
7361 if (GPRSaveSize & 15)
7362 // The extra size here, if triggered, will always be 8.
7363 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
7364 } else
7365 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
7366
7367 SDValue FIN;
7368 if (Subtarget->isWindowsArm64EC()) {
7369 // With the Arm64EC ABI, we reserve the save area as usual, but we
7370 // compute its address relative to x4. For a normal AArch64->AArch64
7371 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
7372 // different address.
7373 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7374 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7375 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
7376 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
7377 } else {
7378 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
7379 }
7380
7381 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
7382 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
7383 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7384 SDValue Store =
7385 DAG.getStore(Val.getValue(1), DL, Val, FIN,
7387 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
7388 : MachinePointerInfo::getStack(MF, i * 8));
7389 MemOps.push_back(Store);
7390 FIN =
7391 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
7392 }
7393 }
7394 FuncInfo->setVarArgsGPRIndex(GPRIdx);
7395 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
7396
7397 if (Subtarget->hasFPARMv8() && !IsWin64) {
7399 const unsigned NumFPRArgRegs = FPRArgRegs.size();
7400 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
7401
7402 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
7403 int FPRIdx = 0;
7404 if (FPRSaveSize != 0) {
7405 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
7406
7407 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
7408
7409 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
7410 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
7411 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
7412
7413 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
7414 MachinePointerInfo::getStack(MF, i * 16));
7415 MemOps.push_back(Store);
7416 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
7417 DAG.getConstant(16, DL, PtrVT));
7418 }
7419 }
7420 FuncInfo->setVarArgsFPRIndex(FPRIdx);
7421 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
7422 }
7423
7424 if (!MemOps.empty()) {
7425 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
7426 }
7427}
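The GPR save area above covers only the argument registers not already consumed by fixed arguments, so its size is 8 * (NumGPRArgRegs - FirstVariadicGPR); on Win64 a second fixed object pads the total up to a 16-byte multiple. The arithmetic, restated on its own:

#include <cassert>

// Size of the varargs GPR save area, plus the Win64 padding to 16 bytes
// (a standalone restatement of the computation above).
unsigned gprSaveSize(unsigned NumGPRArgRegs, unsigned FirstVariadicGPR) {
  return 8 * (NumGPRArgRegs - FirstVariadicGPR);
}
unsigned win64PaddedSize(unsigned SaveSize) {
  return (SaveSize + 15) & ~15u;
}

int main() {
  assert(gprSaveSize(8, 3) == 40);   // fixed args used x0..x2, save x3..x7
  assert(win64PaddedSize(40) == 48); // padded with one extra 8-byte slot
  return 0;
}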
7428
7429/// LowerCallResult - Lower the result values of a call into the
7430/// appropriate copies out of appropriate physical registers.
7431SDValue AArch64TargetLowering::LowerCallResult(
7432 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
7433 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
7434 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
7435 SDValue ThisVal, bool RequiresSMChange) const {
7436 DenseMap<unsigned, SDValue> CopiedRegs;
7437 // Copy all of the result registers out of their specified physreg.
7438 for (unsigned i = 0; i != RVLocs.size(); ++i) {
7439 CCValAssign VA = RVLocs[i];
7440
7441 // Pass 'this' value directly from the argument to return value, to avoid
7442 // reg unit interference
7443 if (i == 0 && isThisReturn) {
7444 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
7445 "unexpected return calling convention register assignment");
7446 InVals.push_back(ThisVal);
7447 continue;
7448 }
7449
7450 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
7451 // allows one use of a physreg per block.
7452 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
7453 if (!Val) {
7454 Val =
7455 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
7456 Chain = Val.getValue(1);
7457 InGlue = Val.getValue(2);
7458 CopiedRegs[VA.getLocReg()] = Val;
7459 }
7460
7461 switch (VA.getLocInfo()) {
7462 default:
7463 llvm_unreachable("Unknown loc info!");
7464 case CCValAssign::Full:
7465 break;
7466 case CCValAssign::BCvt:
7467 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
7468 break;
7470 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
7471 DAG.getConstant(32, DL, VA.getLocVT()));
7472 [[fallthrough]];
7473 case CCValAssign::AExt:
7474 [[fallthrough]];
7475 case CCValAssign::ZExt:
7476 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
7477 break;
7478 }
7479
7480 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
7482 Val);
7483
7484 InVals.push_back(Val);
7485 }
7486
7487 return Chain;
7488}
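Results that come back in the upper half of a 64-bit location are recovered above by a right shift of 32 followed by the zero-extend-or-truncate shared with the AExt/ZExt cases. As a scalar sketch of that recovery:

#include <cassert>
#include <cstdint>

// Recover a 32-bit value carried in the upper half of a 64-bit register
// (models the shift-by-32 path above followed by truncation).
uint32_t fromUpperHalf(uint64_t Reg) {
  return static_cast<uint32_t>(Reg >> 32);
}

int main() {
  assert(fromUpperHalf(0xCAFEBABE00000000ull) == 0xCAFEBABEu);
  return 0;
}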
7489
7490/// Return true if the calling convention is one that we can guarantee TCO for.
7491static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
7492 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
7494}
7495
7496/// Return true if we might ever do TCO for calls with this calling convention.
7498 switch (CC) {
7499 case CallingConv::C:
7503 case CallingConv::Swift:
7505 case CallingConv::Tail:
7506 case CallingConv::Fast:
7507 return true;
7508 default:
7509 return false;
7510 }
7511}
7512
7514 const AArch64Subtarget *Subtarget,
7516 CCState &CCInfo) {
7517 const SelectionDAG &DAG = CLI.DAG;
7518 CallingConv::ID CalleeCC = CLI.CallConv;
7519 bool IsVarArg = CLI.IsVarArg;
7520 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7521 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
7522
7523 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
7524 // for the shadow store.
7525 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
7526 CCInfo.AllocateStack(32, Align(16));
7527
7528 unsigned NumArgs = Outs.size();
7529 for (unsigned i = 0; i != NumArgs; ++i) {
7530 MVT ArgVT = Outs[i].VT;
7531 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
7532
7533 bool UseVarArgCC = false;
7534 if (IsVarArg) {
7535 // On Windows, the fixed arguments in a vararg call are passed in GPRs
7536 // too, so use the vararg CC to force them to integer registers.
7537 if (IsCalleeWin64) {
7538 UseVarArgCC = true;
7539 } else {
7540 UseVarArgCC = !Outs[i].IsFixed;
7541 }
7542 }
7543
7544 if (!UseVarArgCC) {
7545 // Get type of the original argument.
7546 EVT ActualVT =
7547 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
7548 /*AllowUnknown*/ true);
7549 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
7550 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7551 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7552 ArgVT = MVT::i8;
7553 else if (ActualMVT == MVT::i16)
7554 ArgVT = MVT::i16;
7555 }
7556
7557 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
7558 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
7559 assert(!Res && "Call operand has unhandled type");
7560 (void)Res;
7561 }
7562}
7563
7564bool AArch64TargetLowering::isEligibleForTailCallOptimization(
7565 const CallLoweringInfo &CLI) const {
7566 CallingConv::ID CalleeCC = CLI.CallConv;
7567 if (!mayTailCallThisCC(CalleeCC))
7568 return false;
7569
7570 SDValue Callee = CLI.Callee;
7571 bool IsVarArg = CLI.IsVarArg;
7572 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7573 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7574 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7575 const SelectionDAG &DAG = CLI.DAG;
7577 const Function &CallerF = MF.getFunction();
7578 CallingConv::ID CallerCC = CallerF.getCallingConv();
7579
7580 // SME Streaming functions are not eligible for TCO as they may require
7581 // the streaming mode or ZA to be restored after returning from the call.
7582 SMEAttrs CallerAttrs(MF.getFunction());
7583 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
7584 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
7585 CallerAttrs.requiresLazySave(CalleeAttrs) ||
7586 CallerAttrs.hasStreamingBody())
7587 return false;
7588
7589 // Functions using the C or Fast calling convention that have an SVE signature
7590 // preserve more registers and should assume the SVE_VectorCall CC.
7591 // The check for matching callee-saved regs will determine whether it is
7592 // eligible for TCO.
7593 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
7594 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
7595 CallerCC = CallingConv::AArch64_SVE_VectorCall;
7596
7597 bool CCMatch = CallerCC == CalleeCC;
7598
7599 // When using the Windows calling convention on a non-windows OS, we want
7600 // to back up and restore X18 in such functions; we can't do a tail call
7601 // from those functions.
7602 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
7603 CalleeCC != CallingConv::Win64)
7604 return false;
7605
7606 // Byval parameters hand the function a pointer directly into the stack area
7607 // we want to reuse during a tail call. Working around this *is* possible (see
7608 // X86) but less efficient and uglier in LowerCall.
7609 for (Function::const_arg_iterator i = CallerF.arg_begin(),
7610 e = CallerF.arg_end();
7611 i != e; ++i) {
7612 if (i->hasByValAttr())
7613 return false;
7614
7615 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
7616 // In this case, it is necessary to save/restore X0 in the callee. Tail
7617 // call opt interferes with this. So we disable tail call opt when the
7618 // caller has an argument with "inreg" attribute.
7619
7620 // FIXME: Check whether the callee also has an "inreg" argument.
7621 if (i->hasInRegAttr())
7622 return false;
7623 }
7624
7625 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
7626 return CCMatch;
7627
7628 // Externally-defined functions with weak linkage should not be
7629 // tail-called on AArch64 when the OS does not support dynamic
7630 // pre-emption of symbols, as the AAELF spec requires normal calls
7631 // to undefined weak functions to be replaced with a NOP or jump to the
7632 // next instruction. The behaviour of branch instructions in this
7633 // situation (as used for tail calls) is implementation-defined, so we
7634 // cannot rely on the linker replacing the tail call with a return.
7635 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
7636 const GlobalValue *GV = G->getGlobal();
7637 const Triple &TT = getTargetMachine().getTargetTriple();
7638 if (GV->hasExternalWeakLinkage() &&
7639 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
7640 return false;
7641 }
7642
7643 // Now we search for cases where we can use a tail call without changing the
7644 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
7645 // concept.
7646
7647 // I want anyone implementing a new calling convention to think long and hard
7648 // about this assert.
7649 assert((!IsVarArg || CalleeCC == CallingConv::C) &&
7650 "Unexpected variadic calling convention");
7651
7652 LLVMContext &C = *DAG.getContext();
7653 // Check that the call results are passed in the same way.
7654 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
7655 CCAssignFnForCall(CalleeCC, IsVarArg),
7656 CCAssignFnForCall(CallerCC, IsVarArg)))
7657 return false;
7658 // The callee has to preserve all registers the caller needs to preserve.
7659 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7660 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
7661 if (!CCMatch) {
7662 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
7663 if (Subtarget->hasCustomCallingConv()) {
7664 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
7665 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
7666 }
7667 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
7668 return false;
7669 }
7670
7671 // Nothing more to check if the callee is taking no arguments
7672 if (Outs.empty())
7673 return true;
7674
7675 SmallVector<CCValAssign, 16> ArgLocs;
7676 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
7677
7678 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7679
7680 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
7681 // When we are musttail, additional checks have already been done, so we can safely skip this check.
7682 // At least two cases here: if caller is fastcc then we can't have any
7683 // memory arguments (we'd be expected to clean up the stack afterwards). If
7684 // caller is C then we could potentially use its argument area.
7685
7686 // FIXME: for now we take the most conservative of these in both cases:
7687 // disallow all variadic memory operands.
7688 for (const CCValAssign &ArgLoc : ArgLocs)
7689 if (!ArgLoc.isRegLoc())
7690 return false;
7691 }
7692
7693 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7694
7695 // If any of the arguments is passed indirectly, it must be SVE, so
7696 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
7697 // allocate space on the stack. That is why we decide here, explicitly, that
7698 // such a call cannot be a tail call.
7699 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
7700 assert((A.getLocInfo() != CCValAssign::Indirect ||
7701 A.getValVT().isScalableVector() ||
7702 Subtarget->isWindowsArm64EC()) &&
7703 "Expected value to be scalable");
7704 return A.getLocInfo() == CCValAssign::Indirect;
7705 }))
7706 return false;
7707
7708 // If the stack arguments for this call do not fit into our own save area then
7709 // the call cannot be made tail.
7710 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
7711 return false;
7712
7713 const MachineRegisterInfo &MRI = MF.getRegInfo();
7714 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
7715 return false;
7716
7717 return true;
7718}
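// Rough summary of the checks above: the callee's convention must allow TCO,
// no SME streaming-mode or ZA transition may be required, the caller must not
// take byval or inreg arguments, call results and callee-saved registers must
// be compatible, and the callee's stack-argument area must fit within the
// caller's incoming argument area (CCInfo.getStackSize() <=
// FuncInfo->getBytesInStackArgArea()).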
7719
7720SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
7721 SelectionDAG &DAG,
7722 MachineFrameInfo &MFI,
7723 int ClobberedFI) const {
7724 SmallVector<SDValue, 8> ArgChains;
7725 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
7726 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
7727
7728 // Include the original chain at the beginning of the list. When this is
7729 // used by target LowerCall hooks, this helps legalize find the
7730 // CALLSEQ_BEGIN node.
7731 ArgChains.push_back(Chain);
7732
7733 // Add a chain value for each stack argument load that overlaps the clobbered frame index.
7734 for (SDNode *U : DAG.getEntryNode().getNode()->uses())
7735 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
7736 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
7737 if (FI->getIndex() < 0) {
7738 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
7739 int64_t InLastByte = InFirstByte;
7740 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
7741
7742 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
7743 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
7744 ArgChains.push_back(SDValue(L, 1));
7745 }
7746
7747 // Build a tokenfactor for all the chains.
7748 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
7749}
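// Example of why this token is needed: if a tail call stores an outgoing
// argument into a fixed stack slot that overlaps the caller's own incoming
// argument area, any earlier load from that (negative) frame index must be
// chained before the store via the TokenFactor built here; otherwise the load
// could be scheduled after the store and observe clobbered data.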
7750
7751bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
7752 bool TailCallOpt) const {
7753 return (CallCC == CallingConv::Fast && TailCallOpt) ||
7754 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
7755}
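// For example, under -tailcallopt a fastcc callee pops its own stack
// arguments, so LowerCall passes a non-zero CalleePopBytes to CALLSEQ_END
// rather than having the caller readjust SP after the call.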
7756
7757// Check if the value is zero-extended from i1 to i8
7758static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
7759 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
7760 if (SizeInBits < 8)
7761 return false;
7762
7763 APInt RequiredZero(SizeInBits, 0xFE);
7764 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
7765 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
7766 return ZExtBool;
7767}
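// Worked example: for an i32 argument produced by `zext i1`, computeKnownBits
// reports bits [31:1] as zero, so (Bits.Zero & 0xFE) == 0xFE and LowerCall can
// skip the extra trunc/zext pair. The mask only requires bits [7:1] to be
// known zero, matching the AAPCS requirement that an i1 be zero-extended to
// 8 bits by the caller.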
7768
7769void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
7770 SDNode *Node) const {
7771 // Live-in physreg copies that are glued to SMSTART are applied as
7772 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
7773 // register allocator to pass call args in callee saved regs, without extra
7774 // copies to avoid these fake clobbers of actually-preserved GPRs.
7775 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
7776 MI.getOpcode() == AArch64::MSRpstatePseudo) {
7777 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
7778 if (MachineOperand &MO = MI.getOperand(I);
7779 MO.isReg() && MO.isImplicit() && MO.isDef() &&
7780 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
7781 AArch64::GPR64RegClass.contains(MO.getReg())))
7782 MI.removeOperand(I);
7783
7784 // The SVE vector length can change when entering/leaving streaming mode.
7785 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
7786 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
7787 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7788 /*IsImplicit=*/true));
7789 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
7790 /*IsImplicit=*/true));
7791 }
7792 }
7793
7794 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
7795 // have nothing to do with VG, were it not that they are used to materialise a
7796 // frame-address. If they contain a frame-index to a scalable vector, this
7797 // will likely require an ADDVL instruction to materialise the address, thus
7798 // reading VG.
7799 const MachineFunction &MF = *MI.getMF();
7800 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
7801 (MI.getOpcode() == AArch64::ADDXri ||
7802 MI.getOpcode() == AArch64::SUBXri)) {
7803 const MachineOperand &MO = MI.getOperand(1);
7804 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
7805 TargetStackID::ScalableVector)
7806 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7807 /*IsImplicit=*/true));
7808 }
7809}
7810
7811SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
7812 bool Enable, SDValue Chain,
7813 SDValue InGlue,
7814 unsigned Condition,
7815 SDValue PStateSM) const {
7816 MachineFunction &MF = DAG.getMachineFunction();
7817 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7818 FuncInfo->setHasStreamingModeChanges(true);
7819
7820 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7821 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
7822 SDValue MSROp =
7823 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
7824 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
7825 SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
7826 if (Condition != AArch64SME::Always) {
7827 assert(PStateSM && "PStateSM should be defined");
7828 Ops.push_back(PStateSM);
7829 }
7830 Ops.push_back(RegMask);
7831
7832 if (InGlue)
7833 Ops.push_back(InGlue);
7834
7835 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
7836 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
7837}
7838
7839static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
7840 const SMEAttrs &CalleeAttrs) {
7841 if (!CallerAttrs.hasStreamingCompatibleInterface() ||
7842 CallerAttrs.hasStreamingBody())
7843 return AArch64SME::Always;
7844 if (CalleeAttrs.hasNonStreamingInterface())
7845 return AArch64SME::IfCallerIsStreaming;
7846 if (CalleeAttrs.hasStreamingInterface())
7847 return AArch64SME::IfCallerIsNonStreaming;
7848
7849 llvm_unreachable("Unsupported attributes");
7850}
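// The mapping above, in table form (condition attached to the SMSTART/SMSTOP
// pseudo):
//   caller not streaming-compatible, or has a streaming body -> Always
//   streaming-compatible caller, non-streaming callee        -> IfCallerIsStreaming
//   streaming-compatible caller, streaming callee            -> IfCallerIsNonStreaming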
7851
7852/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
7853/// and add input and output parameter nodes.
7854SDValue
7855AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
7856 SmallVectorImpl<SDValue> &InVals) const {
7857 SelectionDAG &DAG = CLI.DAG;
7858 SDLoc &DL = CLI.DL;
7859 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7860 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7861 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7862 SDValue Chain = CLI.Chain;
7863 SDValue Callee = CLI.Callee;
7864 bool &IsTailCall = CLI.IsTailCall;
7865 CallingConv::ID &CallConv = CLI.CallConv;
7866 bool IsVarArg = CLI.IsVarArg;
7867
7868 MachineFunction &MF = DAG.getMachineFunction();
7869 MachineFunction::CallSiteInfo CSInfo;
7870 bool IsThisReturn = false;
7871
7872 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7873 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7874 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
7875 bool IsSibCall = false;
7876 bool GuardWithBTI = false;
7877
7878 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
7879 !Subtarget->noBTIAtReturnTwice()) {
7880 GuardWithBTI = FuncInfo->branchTargetEnforcement();
7881 }
7882
7883 // Analyze operands of the call, assigning locations to each operand.
7884 SmallVector<CCValAssign, 16> ArgLocs;
7885 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
7886
7887 if (IsVarArg) {
7888 unsigned NumArgs = Outs.size();
7889
7890 for (unsigned i = 0; i != NumArgs; ++i) {
7891 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
7892 report_fatal_error("Passing SVE types to variadic functions is "
7893 "currently not supported");
7894 }
7895 }
7896
7897 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7898
7899 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
7900 // Assign locations to each value returned by this call.
7901 SmallVector<CCValAssign, 16> RVLocs;
7902 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
7903 *DAG.getContext());
7904 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
7905
7906 // Check callee args/returns for SVE registers and set calling convention
7907 // accordingly.
7908 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
7909 auto HasSVERegLoc = [](CCValAssign &Loc) {
7910 if (!Loc.isRegLoc())
7911 return false;
7912 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
7913 AArch64::PPRRegClass.contains(Loc.getLocReg());
7914 };
7915 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
7916 CallConv = CallingConv::AArch64_SVE_VectorCall;
7917 }
7918
7919 if (IsTailCall) {
7920 // Check if it's really possible to do a tail call.
7921 IsTailCall = isEligibleForTailCallOptimization(CLI);
7922
7923 // A sibling call is one where we're under the usual C ABI and not planning
7924 // to change that but can still do a tail call:
7925 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
7926 CallConv != CallingConv::SwiftTail)
7927 IsSibCall = true;
7928
7929 if (IsTailCall)
7930 ++NumTailCalls;
7931 }
7932
7933 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
7934 report_fatal_error("failed to perform tail call elimination on a call "
7935 "site marked musttail");
7936
7937 // Get a count of how many bytes are to be pushed on the stack.
7938 unsigned NumBytes = CCInfo.getStackSize();
7939
7940 if (IsSibCall) {
7941 // Since we're not changing the ABI to make this a tail call, the memory
7942 // operands are already available in the caller's incoming argument space.
7943 NumBytes = 0;
7944 }
7945
7946 // FPDiff is the byte offset of the call's argument area from the callee's.
7947 // Stores to callee stack arguments will be placed in FixedStackSlots offset
7948 // by this amount for a tail call. In a sibling call it must be 0 because the
7949 // caller will deallocate the entire stack and the callee still expects its
7950 // arguments to begin at SP+0. Completely unused for non-tail calls.
7951 int FPDiff = 0;
7952
7953 if (IsTailCall && !IsSibCall) {
7954 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
7955
7956 // Since callee will pop argument stack as a tail call, we must keep the
7957 // popped size 16-byte aligned.
7958 NumBytes = alignTo(NumBytes, 16);
7959
7960 // FPDiff will be negative if this tail call requires more space than we
7961 // would automatically have in our incoming argument space. Positive if we
7962 // can actually shrink the stack.
7963 FPDiff = NumReusableBytes - NumBytes;
7964
7965 // Update the required reserved area if this is the tail call requiring the
7966 // most argument stack space.
7967 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
7968 FuncInfo->setTailCallReservedStack(-FPDiff);
7969
7970 // The stack pointer must be 16-byte aligned at all times it's used for a
7971 // memory operation, which in practice means at *all* times and in
7972 // particular across call boundaries. Therefore our own arguments started at
7973 // a 16-byte aligned SP and the delta applied for the tail call should
7974 // satisfy the same constraint.
7975 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
7976 }
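// Worked example: if the caller's incoming stack-argument area is 16 bytes and
// this tail call needs 32 bytes of stack arguments (after 16-byte alignment),
// FPDiff = 16 - 32 = -16, so 16 extra bytes are recorded via
// setTailCallReservedStack and the prologue reserves them up front.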
7977
7978 // Determine whether we need any streaming mode changes.
7979 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
7980 if (CLI.CB)
7981 CalleeAttrs = SMEAttrs(*CLI.CB);
7982 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
7983 CalleeAttrs = SMEAttrs(ES->getSymbol());
7984
7985 auto DescribeCallsite =
7987 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
7988 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
7989 R << ore::NV("Callee", ES->getSymbol());
7990 else if (CLI.CB && CLI.CB->getCalledFunction())
7991 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
7992 else
7993 R << "unknown callee";
7994 R << "'";
7995 return R;
7996 };
7997
7998 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
7999 if (RequiresLazySave) {
8000 unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
8001 MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
8002 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
8003 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8004 SDValue NumZaSaveSlicesAddr =
8005 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
8006 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
8007 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8008 DAG.getConstant(1, DL, MVT::i32));
8009 Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
8010 MPI, MVT::i16);
8011 Chain = DAG.getNode(
8012 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8013 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8014 TPIDR2ObjAddr);
8015 OptimizationRemarkEmitter ORE(&MF.getFunction());
8016 ORE.emit([&]() {
8017 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8018 CLI.CB)
8019 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8020 &MF.getFunction());
8021 return DescribeCallsite(R) << " sets up a lazy save for ZA";
8022 });
8023 }
8024
8025 SDValue PStateSM;
8026 bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
8027 if (RequiresSMChange) {
8028 if (CallerAttrs.hasStreamingInterfaceOrBody())
8029 PStateSM = DAG.getConstant(1, DL, MVT::i64);
8030 else if (CallerAttrs.hasNonStreamingInterface())
8031 PStateSM = DAG.getConstant(0, DL, MVT::i64);
8032 else
8033 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
8034 OptimizationRemarkEmitter ORE(&MF.getFunction());
8035 ORE.emit([&]() {
8036 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
8037 CLI.CB)
8038 : OptimizationRemarkAnalysis("sme", "SMETransition",
8039 &MF.getFunction());
8040 DescribeCallsite(R) << " requires a streaming mode transition";
8041 return R;
8042 });
8043 }
8044
8045 SDValue ZTFrameIdx;
8046 MachineFrameInfo &MFI = MF.getFrameInfo();
8047 bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs);
8048
8049 // If the caller has ZT0 state which will not be preserved by the callee,
8050 // spill ZT0 before the call.
8051 if (ShouldPreserveZT0) {
8052 unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
8053 ZTFrameIdx = DAG.getFrameIndex(
8054 ZTObj,
8055 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8056
8057 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
8058 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8059 }
8060
8061 // If the caller shares ZT0 but the callee does not share ZA, we need to stop
8062 // PSTATE.ZA before the call if there is no lazy-save active.
8063 bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs);
8064 assert((!DisableZA || !RequiresLazySave) &&
8065 "Lazy-save should have PSTATE.SM=1 on entry to the function");
8066
8067 if (DisableZA)
8068 Chain = DAG.getNode(
8069 AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
8070 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8071 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8072
8073 // Adjust the stack pointer for the new arguments...
8074 // These operations are automatically eliminated by the prolog/epilog pass
8075 if (!IsSibCall)
8076 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
8077
8078 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
8079 getPointerTy(DAG.getDataLayout()));
8080
8081 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
8082 SmallSet<unsigned, 8> RegsUsed;
8083 SmallVector<SDValue, 8> MemOpChains;
8084 auto PtrVT = getPointerTy(DAG.getDataLayout());
8085
8086 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
8087 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
8088 for (const auto &F : Forwards) {
8089 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
8090 RegsToPass.emplace_back(F.PReg, Val);
8091 }
8092 }
8093
8094 // Walk the register/memloc assignments, inserting copies/loads.
8095 unsigned ExtraArgLocs = 0;
8096 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
8097 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8098 SDValue Arg = OutVals[i];
8099 ISD::ArgFlagsTy Flags = Outs[i].Flags;
8100
8101 // Promote the value if needed.
8102 switch (VA.getLocInfo()) {
8103 default:
8104 llvm_unreachable("Unknown loc info!");
8105 case CCValAssign::Full:
8106 break;
8107 case CCValAssign::SExt:
8108 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
8109 break;
8110 case CCValAssign::ZExt:
8111 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8112 break;
8113 case CCValAssign::AExt:
8114 if (Outs[i].ArgVT == MVT::i1) {
8115 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
8116 //
8117 // Check if we actually have to do this, because the value may
8118 // already be zero-extended.
8119 //
8120 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
8121 // and rely on DAGCombiner to fold this, because the following
8122 // (anyext i32) is combined with (zext i8) in DAG.getNode:
8123 //
8124 // (ext (zext x)) -> (zext x)
8125 //
8126 // This will give us (zext i32), which we cannot remove, so
8127 // try to check this beforehand.
8128 if (!checkZExtBool(Arg, DAG)) {
8129 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8130 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
8131 }
8132 }
8133 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8134 break;
8136 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8137 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8138 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8139 DAG.getConstant(32, DL, VA.getLocVT()));
8140 break;
8141 case CCValAssign::BCvt:
8142 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
8143 break;
8144 case CCValAssign::Trunc:
8145 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8146 break;
8147 case CCValAssign::FPExt:
8148 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
8149 break;
8150 case CCValAssign::Indirect: {
8151 bool isScalable = VA.getValVT().isScalableVT();
8152 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
8153 "Indirect arguments should be scalable on most subtargets");
8154
8155 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
8156 uint64_t PartSize = StoreSize;
8157 unsigned NumParts = 1;
8158 if (Outs[i].Flags.isInConsecutiveRegs()) {
8159 assert(!Outs[i].Flags.isInConsecutiveRegsLast());
8160 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8161 ++NumParts;
8162 StoreSize *= NumParts;
8163 }
8164
8165 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
8166 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
8167 MachineFrameInfo &MFI = MF.getFrameInfo();
8168 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
8169 if (isScalable)
8170 MFI.setStackID(FI, TargetStackID::ScalableVector);
8171
8172 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
8173 SDValue Ptr = DAG.getFrameIndex(
8174 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8175 SDValue SpillSlot = Ptr;
8176
8177 // Ensure we generate all stores for each tuple part, whilst updating the
8178 // pointer after each store correctly using vscale.
8179 while (NumParts) {
8180 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
8181 MemOpChains.push_back(Store);
8182
8183 NumParts--;
8184 if (NumParts > 0) {
8185 SDValue BytesIncrement;
8186 if (isScalable) {
8187 BytesIncrement = DAG.getVScale(
8188 DL, Ptr.getValueType(),
8189 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8190 } else {
8191 BytesIncrement = DAG.getConstant(
8192 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8193 Ptr.getValueType());
8194 }
8195 SDNodeFlags Flags;
8196 Flags.setNoUnsignedWrap(true);
8197
8198 MPI = MachinePointerInfo(MPI.getAddrSpace());
8199 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8200 BytesIncrement, Flags);
8201 ExtraArgLocs++;
8202 i++;
8203 }
8204 }
8205
8206 Arg = SpillSlot;
8207 break;
8208 }
8209
8210 if (VA.isRegLoc()) {
8211 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
8212 Outs[0].VT == MVT::i64) {
8213 assert(VA.getLocVT() == MVT::i64 &&
8214 "unexpected calling convention register assignment");
8215 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
8216 "unexpected use of 'returned'");
8217 IsThisReturn = true;
8218 }
8219 if (RegsUsed.count(VA.getLocReg())) {
8220 // If this register has already been used then we're trying to pack
8221 // parts of an [N x i32] into an X-register. The extension type will
8222 // take care of putting the two halves in the right place but we have to
8223 // combine them.
8224 SDValue &Bits =
8225 llvm::find_if(RegsToPass,
8226 [=](const std::pair<unsigned, SDValue> &Elt) {
8227 return Elt.first == VA.getLocReg();
8228 })
8229 ->second;
8230 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8231 // Call site info is used for function's parameter entry value
8232 // tracking. For now we track only simple cases when parameter
8233 // is transferred through whole register.
8234 llvm::erase_if(CSInfo.ArgRegPairs,
8235 [&VA](MachineFunction::ArgRegPair ArgReg) {
8236 return ArgReg.Reg == VA.getLocReg();
8237 });
8238 } else {
8239 // Add an extra level of indirection for streaming mode changes by
8240 // using a pseudo copy node that cannot be rematerialised between a
8241 // smstart/smstop and the call by the simple register coalescer.
8242 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
8243 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8244 Arg.getValueType(), Arg);
8245 RegsToPass.emplace_back(VA.getLocReg(), Arg);
8246 RegsUsed.insert(VA.getLocReg());
8247 const TargetOptions &Options = DAG.getTarget().Options;
8248 if (Options.EmitCallSiteInfo)
8249 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
8250 }
8251 } else {
8252 assert(VA.isMemLoc());
8253
8254 SDValue DstAddr;
8255 MachinePointerInfo DstInfo;
8256
8257 // FIXME: This works on big-endian for composite byvals, which are the
8258 // common case. It should also work for fundamental types too.
8259 uint32_t BEAlign = 0;
8260 unsigned OpSize;
8261 if (VA.getLocInfo() == CCValAssign::Indirect ||
8262 VA.getLocInfo() == CCValAssign::Trunc)
8263 OpSize = VA.getLocVT().getFixedSizeInBits();
8264 else
8265 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
8266 : VA.getValVT().getSizeInBits();
8267 OpSize = (OpSize + 7) / 8;
8268 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
8269 !Flags.isInConsecutiveRegs()) {
8270 if (OpSize < 8)
8271 BEAlign = 8 - OpSize;
8272 }
8273 unsigned LocMemOffset = VA.getLocMemOffset();
8274 int32_t Offset = LocMemOffset + BEAlign;
8275 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8276 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8277
8278 if (IsTailCall) {
8279 Offset = Offset + FPDiff;
8280 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
8281
8282 DstAddr = DAG.getFrameIndex(FI, PtrVT);
8283 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
8284
8285 // Make sure any stack arguments overlapping with where we're storing
8286 // are loaded before this eventual operation. Otherwise they'll be
8287 // clobbered.
8288 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
8289 } else {
8290 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8291
8292 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8293 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
8294 }
8295
8296 if (Outs[i].Flags.isByVal()) {
8297 SDValue SizeNode =
8298 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
8299 SDValue Cpy = DAG.getMemcpy(
8300 Chain, DL, DstAddr, Arg, SizeNode,
8301 Outs[i].Flags.getNonZeroByValAlign(),
8302 /*isVol = */ false, /*AlwaysInline = */ false,
8303 /*isTailCall = */ false, DstInfo, MachinePointerInfo());
8304
8305 MemOpChains.push_back(Cpy);
8306 } else {
8307 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
8308 // promoted to a legal register type i32, we should truncate Arg back to
8309 // i1/i8/i16.
8310 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
8311 VA.getValVT() == MVT::i16)
8312 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
8313
8314 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
8315 MemOpChains.push_back(Store);
8316 }
8317 }
8318 }
8319
8320 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
8321 SDValue ParamPtr = StackPtr;
8322 if (IsTailCall) {
8323 // Create a dummy object at the top of the stack that can be used to get
8324 // the SP after the epilogue
8325 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
8326 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
8327 }
8328
8329 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
8330 // describing the argument list. x4 contains the address of the
8331 // first stack parameter. x5 contains the size in bytes of all parameters
8332 // passed on the stack.
8333 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
8334 RegsToPass.emplace_back(AArch64::X5,
8335 DAG.getConstant(NumBytes, DL, MVT::i64));
8336 }
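// For example, an Arm64EC vararg call passing 24 bytes of stack arguments ends
// up with x4 = the address of the first stack parameter and x5 = 24, which is
// what the Arm64EC variadic convention expects the callee to consume.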
8337
8338 if (!MemOpChains.empty())
8339 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
8340
8341 SDValue InGlue;
8342 if (RequiresSMChange) {
8343 SDValue NewChain = changeStreamingMode(
8344 DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
8345 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8346 Chain = NewChain.getValue(0);
8347 InGlue = NewChain.getValue(1);
8348 }
8349
8350 // Build a sequence of copy-to-reg nodes chained together with token chain
8351 // and flag operands which copy the outgoing args into the appropriate regs.
8352 for (auto &RegToPass : RegsToPass) {
8353 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
8354 RegToPass.second, InGlue);
8355 InGlue = Chain.getValue(1);
8356 }
8357
8358 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
8359 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
8360 // node so that legalize doesn't hack it.
8361 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8362 auto GV = G->getGlobal();
8363 unsigned OpFlags =
8364 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
8365 if (OpFlags & AArch64II::MO_GOT) {
8366 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8367 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8368 } else {
8369 const GlobalValue *GV = G->getGlobal();
8370 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8371 }
8372 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
8373 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
8374 Subtarget->isTargetMachO()) ||
8375 MF.getFunction().getParent()->getRtLibUseGOT();
8376 const char *Sym = S->getSymbol();
8377 if (UseGot) {
8378 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
8379 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8380 } else {
8381 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
8382 }
8383 }
8384
8385 // We don't usually want to end the call-sequence here because we would tidy
8386 // the frame up *after* the call, however in the ABI-changing tail-call case
8387 // we've carefully laid out the parameters so that when sp is reset they'll be
8388 // in the correct location.
8389 if (IsTailCall && !IsSibCall) {
8390 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
8391 InGlue = Chain.getValue(1);
8392 }
8393
8394 std::vector<SDValue> Ops;
8395 Ops.push_back(Chain);
8396 Ops.push_back(Callee);
8397
8398 if (IsTailCall) {
8399 // Each tail call may have to adjust the stack by a different amount, so
8400 // this information must travel along with the operation for eventual
8401 // consumption by emitEpilogue.
8402 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
8403 }
8404
8405 // Add argument registers to the end of the list so that they are known live
8406 // into the call.
8407 for (auto &RegToPass : RegsToPass)
8408 Ops.push_back(DAG.getRegister(RegToPass.first,
8409 RegToPass.second.getValueType()));
8410
8411 // Add a register mask operand representing the call-preserved registers.
8412 const uint32_t *Mask;
8413 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8414 if (IsThisReturn) {
8415 // For 'this' returns, use the X0-preserving mask if applicable
8416 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
8417 if (!Mask) {
8418 IsThisReturn = false;
8419 Mask = TRI->getCallPreservedMask(MF, CallConv);
8420 }
8421 } else
8422 Mask = TRI->getCallPreservedMask(MF, CallConv);
8423
8424 if (Subtarget->hasCustomCallingConv())
8425 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
8426
8427 if (TRI->isAnyArgRegReserved(MF))
8428 TRI->emitReservedArgRegCallError(MF);
8429
8430 assert(Mask && "Missing call preserved mask for calling convention");
8431 Ops.push_back(DAG.getRegisterMask(Mask));
8432
8433 if (InGlue.getNode())
8434 Ops.push_back(InGlue);
8435
8436 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8437
8438 // If we're doing a tail call, use a TC_RETURN here rather than an
8439 // actual call instruction.
8440 if (IsTailCall) {
8441 MF.getFrameInfo().setHasTailCall();
8442 SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
8443
8444 if (IsCFICall)
8445 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8446
8447 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
8448 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
8449 return Ret;
8450 }
8451
8452 unsigned CallOpc = AArch64ISD::CALL;
8453 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
8454 // be expanded to the call, directly followed by a special marker sequence and
8455 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
8456 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
8457 assert(!IsTailCall &&
8458 "tail calls cannot be marked with clang.arc.attachedcall");
8459 CallOpc = AArch64ISD::CALL_RVMARKER;
8460
8461 // Add a target global address for the retainRV/claimRV runtime function
8462 // just before the call target.
8463 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
8464 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
8465 Ops.insert(Ops.begin() + 1, GA);
8466 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8467 CallOpc = AArch64ISD::CALL_ARM64EC_TO_X64;
8468 } else if (GuardWithBTI) {
8469 CallOpc = AArch64ISD::CALL_BTI;
8470 }
8471
8472 // Returns a chain and a flag for retval copy to use.
8473 Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
8474
8475 if (IsCFICall)
8476 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8477
8478 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
8479 InGlue = Chain.getValue(1);
8480 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
8481
8482 uint64_t CalleePopBytes =
8483 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
8484
8485 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
8486 InGlue = Chain.getValue(1);
8487
8488 // Handle result values, copying them out of physregs into vregs that we
8489 // return.
8490 SDValue Result = LowerCallResult(
8491 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
8492 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
8493
8494 if (!Ins.empty())
8495 InGlue = Result.getValue(Result->getNumValues() - 1);
8496
8497 if (RequiresSMChange) {
8498 assert(PStateSM && "Expected a PStateSM to be set");
8499 Result = changeStreamingMode(
8500 DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
8501 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8502 }
8503
8504 if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
8505 // Unconditionally resume ZA.
8506 Result = DAG.getNode(
8507 AArch64ISD::SMSTART, DL, MVT::Other, Result,
8508 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8509 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8510
8511 if (ShouldPreserveZT0)
8512 Result =
8513 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8514 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8515
8516 if (RequiresLazySave) {
8517 // Conditionally restore the lazy save using a pseudo node.
8518 unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
8519 SDValue RegMask = DAG.getRegisterMask(
8520 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
8521 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8522 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
8523 SDValue TPIDR2_EL0 = DAG.getNode(
8524 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
8525 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8526
8527 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8528 // RESTORE_ZA pseudo.
8529 SDValue Glue;
8530 SDValue TPIDR2Block = DAG.getFrameIndex(
8531 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8532 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
8533 Result =
8534 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8535 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8536 RestoreRoutine, RegMask, Result.getValue(1)});
8537
8538 // Finally reset the TPIDR2_EL0 register to 0.
8539 Result = DAG.getNode(
8540 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
8541 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8542 DAG.getConstant(0, DL, MVT::i64));
8543 }
8544
8545 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
8546 for (unsigned I = 0; I < InVals.size(); ++I) {
8547 // The smstart/smstop is chained as part of the call, but when the
8548 // resulting chain is discarded (which happens when the call is not part
8549 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
8550 // smstart/smstop is chained to the result value. We can do that by doing
8551 // a vreg -> vreg copy.
8552 Register Reg = MF.getRegInfo().createVirtualRegister(
8553 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8554 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
8555 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
8556 InVals[I].getValueType());
8557 }
8558 }
8559
8560 return Result;
8561}
8562
8563bool AArch64TargetLowering::CanLowerReturn(
8564 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
8565 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
8566 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8567 SmallVector<CCValAssign, 16> RVLocs;
8568 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
8569 return CCInfo.CheckReturn(Outs, RetCC);
8570}
8571
8572SDValue
8573AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
8574 bool isVarArg,
8575 const SmallVectorImpl<ISD::OutputArg> &Outs,
8576 const SmallVectorImpl<SDValue> &OutVals,
8577 const SDLoc &DL, SelectionDAG &DAG) const {
8578 auto &MF = DAG.getMachineFunction();
8579 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8580
8581 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8582 SmallVector<CCValAssign, 16> RVLocs;
8583 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
8584 CCInfo.AnalyzeReturn(Outs, RetCC);
8585
8586 // Copy the result values into the output registers.
8587 SDValue Glue;
8588 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
8589 SmallSet<unsigned, 4> RegsUsed;
8590 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
8591 ++i, ++realRVLocIdx) {
8592 CCValAssign &VA = RVLocs[i];
8593 assert(VA.isRegLoc() && "Can only return in registers!");
8594 SDValue Arg = OutVals[realRVLocIdx];
8595
8596 switch (VA.getLocInfo()) {
8597 default:
8598 llvm_unreachable("Unknown loc info!");
8599 case CCValAssign::Full:
8600 if (Outs[i].ArgVT == MVT::i1) {
8601 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
8602 // value. This is strictly redundant on Darwin (which uses "zeroext
8603 // i1"), but will be optimised out before ISel.
8604 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8605 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8606 }
8607 break;
8608 case CCValAssign::BCvt:
8609 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
8610 break;
8611 case CCValAssign::AExt:
8612 case CCValAssign::ZExt:
8613 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8614 break;
8616 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8617 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8618 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8619 DAG.getConstant(32, DL, VA.getLocVT()));
8620 break;
8621 }
8622
8623 if (RegsUsed.count(VA.getLocReg())) {
8624 SDValue &Bits =
8625 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
8626 return Elt.first == VA.getLocReg();
8627 })->second;
8628 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8629 } else {
8630 RetVals.emplace_back(VA.getLocReg(), Arg);
8631 RegsUsed.insert(VA.getLocReg());
8632 }
8633 }
8634
8635 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8636
8637 // Emit SMSTOP before returning from a locally streaming function
8638 SMEAttrs FuncAttrs(MF.getFunction());
8639 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
8640 if (FuncAttrs.hasStreamingCompatibleInterface()) {
8641 Register Reg = FuncInfo->getPStateSMReg();
8642 assert(Reg.isValid() && "PStateSM Register is invalid");
8643 SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
8644 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8645 /*Glue*/ SDValue(),
8646 AArch64SME::IfCallerIsNonStreaming, PStateSM);
8647 } else
8648 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8649 /*Glue*/ SDValue(), AArch64SME::Always);
8650 Glue = Chain.getValue(1);
8651 }
8652
8653 SmallVector<SDValue, 4> RetOps(1, Chain);
8654 for (auto &RetVal : RetVals) {
8655 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
8656 isPassedInFPR(RetVal.second.getValueType()))
8657 RetVal.second = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8658 RetVal.second.getValueType(), RetVal.second);
8659 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
8660 Glue = Chain.getValue(1);
8661 RetOps.push_back(
8662 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
8663 }
8664
8665 // Windows AArch64 ABIs require that for returning structs by value we copy
8666 // the sret argument into X0 for the return.
8667 // We saved the argument into a virtual register in the entry block,
8668 // so now we copy the value out and into X0.
8669 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
8670 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
8671 getPointerTy(MF.getDataLayout()));
8672
8673 unsigned RetValReg = AArch64::X0;
8674 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8675 RetValReg = AArch64::X8;
8676 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
8677 Glue = Chain.getValue(1);
8678
8679 RetOps.push_back(
8680 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
8681 }
8682
8683 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
8684 if (I) {
8685 for (; *I; ++I) {
8686 if (AArch64::GPR64RegClass.contains(*I))
8687 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
8688 else if (AArch64::FPR64RegClass.contains(*I))
8689 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
8690 else
8691 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
8692 }
8693 }
8694
8695 RetOps[0] = Chain; // Update chain.
8696
8697 // Add the glue if we have it.
8698 if (Glue.getNode())
8699 RetOps.push_back(Glue);
8700
8701 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8702 // ARM64EC entry thunks use a special return sequence: instead of a regular
8703 // "ret" instruction, they need to explicitly call the emulator.
8704 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8705 SDValue Arm64ECRetDest =
8706 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
8707 Arm64ECRetDest =
8708 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
8709 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
8710 MachinePointerInfo());
8711 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
8712 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
8713 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
8714 }
8715
8716 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
8717}
8718
8719//===----------------------------------------------------------------------===//
8720// Other Lowering Code
8721//===----------------------------------------------------------------------===//
8722
8723SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
8724 SelectionDAG &DAG,
8725 unsigned Flag) const {
8726 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
8727 N->getOffset(), Flag);
8728}
8729
8730SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
8731 SelectionDAG &DAG,
8732 unsigned Flag) const {
8733 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
8734}
8735
8736SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
8737 SelectionDAG &DAG,
8738 unsigned Flag) const {
8739 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
8740 N->getOffset(), Flag);
8741}
8742
8743SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
8744 SelectionDAG &DAG,
8745 unsigned Flag) const {
8746 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
8747}
8748
8749SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
8750 SelectionDAG &DAG,
8751 unsigned Flag) const {
8752 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
8753}
8754
8755// (loadGOT sym)
8756template <class NodeTy>
8757SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
8758 unsigned Flags) const {
8759 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
8760 SDLoc DL(N);
8761 EVT Ty = getPointerTy(DAG.getDataLayout());
8762 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
8763 // FIXME: Once remat is capable of dealing with instructions with register
8764 // operands, expand this into two nodes instead of using a wrapper node.
8765 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
8766}
8767
8768// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
8769template <class NodeTy>
8770SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
8771 unsigned Flags) const {
8772 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
8773 SDLoc DL(N);
8774 EVT Ty = getPointerTy(DAG.getDataLayout());
8775 const unsigned char MO_NC = AArch64II::MO_NC;
8776 return DAG.getNode(
8777 AArch64ISD::WrapperLarge, DL, Ty,
8778 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
8779 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
8780 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
8781 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
8782}
8783
8784// (addlow (adrp %hi(sym)) %lo(sym))
8785template <class NodeTy>
8786SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
8787 unsigned Flags) const {
8788 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
8789 SDLoc DL(N);
8790 EVT Ty = getPointerTy(DAG.getDataLayout());
8791 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
8792 SDValue Lo = getTargetNode(N, Ty, DAG,
8793 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
8794 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
8795 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
8796}
8797
8798// (adr sym)
8799template <class NodeTy>
8800SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
8801 unsigned Flags) const {
8802 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
8803 SDLoc DL(N);
8804 EVT Ty = getPointerTy(DAG.getDataLayout());
8805 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
8806 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
8807}
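// Roughly, these helpers expand to the following sequences for a symbol "sym":
//   getGOT:       adrp x0, :got:sym ; ldr x0, [x0, :got_lo12:sym]
//   getAddrLarge: movz/movk x0, #:abs_g3:sym ... #:abs_g0_nc:sym
//   getAddr:      adrp x0, sym ; add x0, x0, :lo12:sym
//   getAddrTiny:  adr x0, sym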
8808
8809SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
8810 SelectionDAG &DAG) const {
8811 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
8812 const GlobalValue *GV = GN->getGlobal();
8813 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
8814
8815 if (OpFlags != AArch64II::MO_NO_FLAG)
8816 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
8817 "unexpected offset in global node");
8818
8819 // This also catches the large code model case for Darwin, and tiny code
8820 // model with got relocations.
8821 if ((OpFlags & AArch64II::MO_GOT) != 0) {
8822 return getGOT(GN, DAG, OpFlags);
8823 }
8824
8825 SDValue Result;
8826 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8827 !getTargetMachine().isPositionIndependent()) {
8828 Result = getAddrLarge(GN, DAG, OpFlags);
8829 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
8830 Result = getAddrTiny(GN, DAG, OpFlags);
8831 } else {
8832 Result = getAddr(GN, DAG, OpFlags);
8833 }
8834 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8835 SDLoc DL(GN);
8836 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
8837 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
8838 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
8839 return Result;
8840}
8841
8842/// Convert a TLS address reference into the correct sequence of loads
8843/// and calls to compute the variable's address (for Darwin, currently) and
8844/// return an SDValue containing the final node.
8845
8846/// Darwin only has one TLS scheme which must be capable of dealing with the
8847/// fully general situation, in the worst case. This means:
8848/// + "extern __thread" declaration.
8849/// + Defined in a possibly unknown dynamic library.
8850///
8851/// The general system is that each __thread variable has a [3 x i64] descriptor
8852/// which contains information used by the runtime to calculate the address. The
8853/// only part of this the compiler needs to know about is the first xword, which
8854/// contains a function pointer that must be called with the address of the
8855/// entire descriptor in "x0".
8856///
8857/// Since this descriptor may be in a different unit, in general even the
8858/// descriptor must be accessed via an indirect load. The "ideal" code sequence
8859/// is:
8860/// adrp x0, _var@TLVPPAGE
8861/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
8862/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
8863/// ; the function pointer
8864/// blr x1 ; Uses descriptor address in x0
8865/// ; Address of _var is now in x0.
8866///
8867/// If the address of _var's descriptor *is* known to the linker, then it can
8868/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
8869/// a slight efficiency gain.
8870SDValue
8871AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
8872 SelectionDAG &DAG) const {
8873 assert(Subtarget->isTargetDarwin() &&
8874 "This function expects a Darwin target");
8875
8876 SDLoc DL(Op);
8877 MVT PtrVT = getPointerTy(DAG.getDataLayout());
8878 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
8879 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
8880
8881 SDValue TLVPAddr =
8882 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
8883 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
8884
8885 // The first entry in the descriptor is a function pointer that we must call
8886 // to obtain the address of the variable.
8887 SDValue Chain = DAG.getEntryNode();
8888 SDValue FuncTLVGet = DAG.getLoad(
8889 PtrMemVT, DL, Chain, DescAddr,
8890 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
8891 Align(PtrMemVT.getSizeInBits() / 8),
8892 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
8893 Chain = FuncTLVGet.getValue(1);
8894
8895 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
8896 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
8897
8897
8898 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8899 MFI.setAdjustsStack(true);
8900
8901 // TLS calls preserve all registers except those that absolutely must be
8902 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
8903 // silly).
8904 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8905 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
8906 if (Subtarget->hasCustomCallingConv())
8907 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
8908
8909 // Finally, we can make the call. This is just a degenerate version of a
8910 // normal AArch64 call node: x0 takes the address of the descriptor, and
8911 // returns the address of the variable in this thread.
8912 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
8913 Chain =
8914 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
8915 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
8916 DAG.getRegisterMask(Mask), Chain.getValue(1));
8917 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
8918}
8919
8920/// Convert a thread-local variable reference into a sequence of instructions to
8921/// compute the variable's address for the local exec TLS model of ELF targets.
8922/// The sequence depends on the maximum TLS area size.
8923SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
8924 SDValue ThreadBase,
8925 const SDLoc &DL,
8926 SelectionDAG &DAG) const {
8927 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8928 SDValue TPOff, Addr;
8929
8930 switch (DAG.getTarget().Options.TLSSize) {
8931 default:
8932 llvm_unreachable("Unexpected TLS size");
8933
8934 case 12: {
8935 // mrs x0, TPIDR_EL0
8936 // add x0, x0, :tprel_lo12:a
8937 SDValue Var = DAG.getTargetGlobalAddress(
8938 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
8939 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8940 Var,
8941 DAG.getTargetConstant(0, DL, MVT::i32)),
8942 0);
8943 }
8944
8945 case 24: {
8946 // mrs x0, TPIDR_EL0
8947 // add x0, x0, :tprel_hi12:a
8948 // add x0, x0, :tprel_lo12_nc:a
8949 SDValue HiVar = DAG.getTargetGlobalAddress(
8950 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
8951 SDValue LoVar = DAG.getTargetGlobalAddress(
8952 GV, DL, PtrVT, 0,
8953 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
8954 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8955 HiVar,
8956 DAG.getTargetConstant(0, DL, MVT::i32)),
8957 0);
8958 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
8959 LoVar,
8960 DAG.getTargetConstant(0, DL, MVT::i32)),
8961 0);
8962 }
8963
8964 case 32: {
8965 // mrs x1, TPIDR_EL0
8966 // movz x0, #:tprel_g1:a
8967 // movk x0, #:tprel_g0_nc:a
8968 // add x0, x1, x0
8969 SDValue HiVar = DAG.getTargetGlobalAddress(
8970 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
8971 SDValue LoVar = DAG.getTargetGlobalAddress(
8972 GV, DL, PtrVT, 0,
8973 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
8974 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8975 DAG.getTargetConstant(16, DL, MVT::i32)),
8976 0);
8977 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
8978 DAG.getTargetConstant(0, DL, MVT::i32)),
8979 0);
8980 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
8981 }
8982
8983 case 48: {
8984 // mrs x1, TPIDR_EL0
8985 // movz x0, #:tprel_g2:a
8986 // movk x0, #:tprel_g1_nc:a
8987 // movk x0, #:tprel_g0_nc:a
8988 // add x0, x1, x0
8989 SDValue HiVar = DAG.getTargetGlobalAddress(
8990 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
8991 SDValue MiVar = DAG.getTargetGlobalAddress(
8992 GV, DL, PtrVT, 0,
8993 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
8994 SDValue LoVar = DAG.getTargetGlobalAddress(
8995 GV, DL, PtrVT, 0,
8996 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
8997 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8998 DAG.getTargetConstant(32, DL, MVT::i32)),
8999 0);
9000 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
9001 DAG.getTargetConstant(16, DL, MVT::i32)),
9002 0);
9003 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
9004 DAG.getTargetConstant(0, DL, MVT::i32)),
9005 0);
9006 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9007 }
9008 }
9009}
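// The TLSSize cases correspond to the -mtls-size option: 12 bits covers a 4KiB
// TLS area, 24 bits 16MiB, 32 bits 4GiB and 48 bits 256TiB, which is why the
// sequences above grow from a single add to a movz/movk/movk/add chain.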
9010
9011/// When accessing thread-local variables under either the general-dynamic or
9012/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
9013/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
9014/// is a function pointer to carry out the resolution.
9015///
9016/// The sequence is:
9017/// adrp x0, :tlsdesc:var
9018/// ldr x1, [x0, #:tlsdesc_lo12:var]
9019/// add x0, x0, #:tlsdesc_lo12:var
9020/// .tlsdesccall var
9021/// blr x1
9022/// (TPIDR_EL0 offset now in x0)
9023///
9024/// The above sequence must be produced unscheduled, to enable the linker to
9025/// optimize/relax this sequence.
9026/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
9027/// above sequence, and expanded really late in the compilation flow, to ensure
9028/// the sequence is produced as per above.
9029SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
9030 const SDLoc &DL,
9031 SelectionDAG &DAG) const {
9032 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9033
9034 SDValue Chain = DAG.getEntryNode();
9035 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
9036
9037 Chain =
9038 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
9039 SDValue Glue = Chain.getValue(1);
9040
9041 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
9042}
9043
9044SDValue
9045AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
9046 SelectionDAG &DAG) const {
9047 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
9048
9049 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9050
9051 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
9052
9053 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
9054 if (Model == TLSModel::LocalDynamic)
9055 Model = TLSModel::GeneralDynamic;
9056 }
9057
9058 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9059 Model != TLSModel::LocalExec)
9060 report_fatal_error("ELF TLS only supported in small memory model or "
9061 "in local exec TLS model");
9062 // Different choices can be made for the maximum size of the TLS area for a
9063 // module. For the small address model, the default TLS size is 16MiB and the
9064 // maximum TLS size is 4GiB.
9065 // FIXME: add tiny and large code model support for TLS access models other
9066 // than local exec. We currently generate the same code as small for tiny,
9067 // which may be larger than needed.
9068
9069 SDValue TPOff;
9070 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9071 SDLoc DL(Op);
9072 const GlobalValue *GV = GA->getGlobal();
9073
9074 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
9075
9076 if (Model == TLSModel::LocalExec) {
9077 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
9078 } else if (Model == TLSModel::InitialExec) {
9079 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9080 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
9081 } else if (Model == TLSModel::LocalDynamic) {
9082 // Local-dynamic accesses proceed in two phases: first, a general-dynamic TLS
9083 // descriptor call against the special symbol _TLS_MODULE_BASE_ calculates
9084 // the beginning of the module's TLS region; this is followed by a DTPREL
9085 // offset calculation.
9086
9087 // These accesses will need deduplicating if there's more than one.
9088 AArch64FunctionInfo *MFI =
9089 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9090 MFI->incNumLocalDynamicTLSAccesses();
9091
9092 // The call needs a relocation too for linker relaxation. It doesn't make
9093 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9094 // the address.
9095 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
9096 AArch64II::MO_TLS);
9097
9098 // Now we can calculate the offset from TPIDR_EL0 to this module's
9099 // thread-local area.
9100 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9101
9102 // Now use :dtprel_whatever: operations to calculate this variable's offset
9103 // in its thread-storage area.
9104 SDValue HiVar = DAG.getTargetGlobalAddress(
9105 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9106 SDValue LoVar = DAG.getTargetGlobalAddress(
9107 GV, DL, MVT::i64, 0,
9108 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9109
9110 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
9111 DAG.getTargetConstant(0, DL, MVT::i32)),
9112 0);
9113 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
9114 DAG.getTargetConstant(0, DL, MVT::i32)),
9115 0);
9116 } else if (Model == TLSModel::GeneralDynamic) {
9117 // The call needs a relocation too for linker relaxation. It doesn't make
9118 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9119 // the address.
9120 SDValue SymAddr =
9121 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9122
9123 // Finally we can make a call to calculate the offset from tpidr_el0.
9124 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9125 } else
9126 llvm_unreachable("Unsupported ELF TLS access model");
9127
9128 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9129}
9130
9131SDValue
9132AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
9133 SelectionDAG &DAG) const {
9134 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
9135
9136 SDValue Chain = DAG.getEntryNode();
9137 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9138 SDLoc DL(Op);
9139
9140 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
9141
9142 // Load the ThreadLocalStoragePointer from the TEB
9143 // A pointer to the TLS array is located at offset 0x58 from the TEB.
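// Illustrative sketch of the code this lowering produces (register choices
// and exact address-mode foldings are up to instruction selection):
//   ldr x8, [x18, #0x58] // TLS array from the TEB
//   adrp x9, _tls_index
//   ldr w9, [x9, :lo12:_tls_index] // this module's TLS index
//   ldr x8, [x8, x9, lsl #3] // this module's TLS block
//   add x8, x8, :secrel_hi12:var
//   add x0, x8, :secrel_lo12:var // address of the variable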
9144 SDValue TLSArray =
9145 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
9146 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
9147 Chain = TLSArray.getValue(1);
9148
9149 // Load the TLS index from the C runtime;
9150 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
9151 // This also does the same as LOADgot, but using a generic i32 load,
9152 // while LOADgot only loads i64.
9153 SDValue TLSIndexHi =
9154 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
9155 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
9156 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9157 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
9158 SDValue TLSIndex =
9159 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
9160 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
9161 Chain = TLSIndex.getValue(1);
9162
9163 // The pointer to the thread's TLS data area is at offset (TLS index * 8)
9164 // into the TLS array.
9165 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
9166 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
9167 DAG.getConstant(3, DL, PtrVT));
9168 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
9169 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
9170 MachinePointerInfo());
9171 Chain = TLS.getValue(1);
9172
9173 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9174 const GlobalValue *GV = GA->getGlobal();
9175 SDValue TGAHi = DAG.getTargetGlobalAddress(
9176 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9177 SDValue TGALo = DAG.getTargetGlobalAddress(
9178 GV, DL, PtrVT, 0,
9179 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9180
9181 // Add the offset from the start of the .tls section (section base).
9182 SDValue Addr =
9183 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
9184 DAG.getTargetConstant(0, DL, MVT::i32)),
9185 0);
9186 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
9187 return Addr;
9188}
9189
9190SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
9191 SelectionDAG &DAG) const {
9192 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9193 if (DAG.getTarget().useEmulatedTLS())
9194 return LowerToTLSEmulatedModel(GA, DAG);
9195
9196 if (Subtarget->isTargetDarwin())
9197 return LowerDarwinGlobalTLSAddress(Op, DAG);
9198 if (Subtarget->isTargetELF())
9199 return LowerELFGlobalTLSAddress(Op, DAG);
9200 if (Subtarget->isTargetWindows())
9201 return LowerWindowsGlobalTLSAddress(Op, DAG);
9202
9203 llvm_unreachable("Unexpected platform trying to use TLS");
9204}
9205
9206// Looks through \param Val to determine the bit that can be used to
9207// check the sign of the value. It returns the unextended value and
9208// the sign bit position.
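// For example, (sign_extend_inreg %x, i8) yields {%x, 7}, (sign_extend i32 %y
// to i64) yields {%y, 31}, and a plain i32 value yields {Val, 31}.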
9209std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
9210 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
9211 return {Val.getOperand(0),
9212 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
9213 1};
9214
9215 if (Val.getOpcode() == ISD::SIGN_EXTEND)
9216 return {Val.getOperand(0),
9217 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
9218
9219 return {Val, Val.getValueSizeInBits() - 1};
9220}
9221
9222SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
9223 SDValue Chain = Op.getOperand(0);
9224 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
9225 SDValue LHS = Op.getOperand(2);
9226 SDValue RHS = Op.getOperand(3);
9227 SDValue Dest = Op.getOperand(4);
9228 SDLoc dl(Op);
9229
9231 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
9232 // will not be produced, as they are conditional branch instructions that do
9233 // not set flags.
9234 bool ProduceNonFlagSettingCondBr =
9235 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
9236
9237 // Handle f128 first, since lowering it will result in comparing the return
9238 // value of a libcall against zero, which is just what the rest of LowerBR_CC
9239 // is expecting to deal with.
9240 if (LHS.getValueType() == MVT::f128) {
9241 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9242
9243 // If softenSetCCOperands returned a scalar, we need to compare the result
9244 // against zero to select between true and false values.
9245 if (!RHS.getNode()) {
9246 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9247 CC = ISD::SETNE;
9248 }
9249 }
9250
9251 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
9252 // instruction.
9253 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
9254 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9255 // Only lower legal XALUO ops.
9256 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
9257 return SDValue();
9258
9259 // The actual operation with overflow check.
9260 AArch64CC::CondCode OFCC;
9261 SDValue Value, Overflow;
9262 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
9263
9264 if (CC == ISD::SETNE)
9265 OFCC = getInvertedCondCode(OFCC);
9266 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
9267
9268 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9269 Overflow);
9270 }
9271
9272 if (LHS.getValueType().isInteger()) {
9273 assert((LHS.getValueType() == RHS.getValueType()) &&
9274 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9275
9276 // If the RHS of the comparison is zero, we can potentially fold this
9277 // to a specialized branch.
9278 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9279 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
9280 if (CC == ISD::SETEQ) {
9281 // See if we can use a TBZ to fold in an AND as well.
9282 // TBZ has a smaller branch displacement than CBZ. If the offset is
9283 // out of bounds, a late MI-layer pass rewrites branches.
9284 // 403.gcc is an example that hits this case.
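// For example (illustrative), (brcond (seteq (and x, 4), 0), dest) becomes
// "tbz x, #2, dest" rather than "tst x, #4; b.eq dest".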
9285 if (LHS.getOpcode() == ISD::AND &&
9286 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9287 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9288 SDValue Test = LHS.getOperand(0);
9289 uint64_t Mask = LHS.getConstantOperandVal(1);
9290 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
9291 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9292 Dest);
9293 }
9294
9295 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
9296 } else if (CC == ISD::SETNE) {
9297 // See if we can use a TBZ to fold in an AND as well.
9298 // TBZ has a smaller branch displacement than CBZ. If the offset is
9299 // out of bounds, a late MI-layer pass rewrites branches.
9300 // 403.gcc is an example that hits this case.
9301 if (LHS.getOpcode() == ISD::AND &&
9302 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9303 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9304 SDValue Test = LHS.getOperand(0);
9305 uint64_t Mask = LHS.getConstantOperandVal(1);
9306 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
9307 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9308 Dest);
9309 }
9310
9311 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
9312 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
9313 // Don't combine AND since emitComparison converts the AND to an ANDS
9314 // (a.k.a. TST) and the test in the test bit and branch instruction
9315 // becomes redundant. This would also increase register pressure.
9316 uint64_t SignBitPos;
9317 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9318 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
9319 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9320 }
9321 }
9322 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
9323 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
9324 // Don't combine AND since emitComparison converts the AND to an ANDS
9325 // (a.k.a. TST) and the test in the test bit and branch instruction
9326 // becomes redundant. This would also increase register pressure.
9327 uint64_t SignBitPos;
9328 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9329 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
9330 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9331 }
9332
9333 SDValue CCVal;
9334 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
9335 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9336 Cmp);
9337 }
9338
9339 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
9340 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9341
9342 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9343 // clean. Some of them require two branches to implement.
9344 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9345 AArch64CC::CondCode CC1, CC2;
9346 changeFPCCToAArch64CC(CC, CC1, CC2);
9347 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9348 SDValue BR1 =
9349 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
9350 if (CC2 != AArch64CC::AL) {
9351 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9352 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
9353 Cmp);
9354 }
9355
9356 return BR1;
9357}
9358
9359SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
9360 SelectionDAG &DAG) const {
9361 if (!Subtarget->hasNEON())
9362 return SDValue();
9363
9364 EVT VT = Op.getValueType();
9365 EVT IntVT = VT.changeTypeToInteger();
9366 SDLoc DL(Op);
9367
9368 SDValue In1 = Op.getOperand(0);
9369 SDValue In2 = Op.getOperand(1);
9370 EVT SrcVT = In2.getValueType();
9371
9372 if (!SrcVT.bitsEq(VT))
9373 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
9374
9375 if (VT.isScalableVector())
9376 IntVT =
9377 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
9378
9379 if (VT.isFixedLengthVector() &&
9380 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
9381 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
9382
9383 In1 = convertToScalableVector(DAG, ContainerVT, In1);
9384 In2 = convertToScalableVector(DAG, ContainerVT, In2);
9385
9386 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
9387 return convertFromScalableVector(DAG, VT, Res);
9388 }
9389
9390 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
9391 if (VT.isScalableVector())
9392 return getSVESafeBitCast(VT, Op, DAG);
9393
9394 return DAG.getBitcast(VT, Op);
9395 };
9396
9397 SDValue VecVal1, VecVal2;
9398 EVT VecVT;
9399 auto SetVecVal = [&](int Idx = -1) {
9400 if (!VT.isVector()) {
9401 VecVal1 =
9402 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
9403 VecVal2 =
9404 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
9405 } else {
9406 VecVal1 = BitCast(VecVT, In1, DAG);
9407 VecVal2 = BitCast(VecVT, In2, DAG);
9408 }
9409 };
9410 if (VT.isVector()) {
9411 VecVT = IntVT;
9412 SetVecVal();
9413 } else if (VT == MVT::f64) {
9414 VecVT = MVT::v2i64;
9415 SetVecVal(AArch64::dsub);
9416 } else if (VT == MVT::f32) {
9417 VecVT = MVT::v4i32;
9418 SetVecVal(AArch64::ssub);
9419 } else if (VT == MVT::f16 || VT == MVT::bf16) {
9420 VecVT = MVT::v8i16;
9421 SetVecVal(AArch64::hsub);
9422 } else {
9423 llvm_unreachable("Invalid type for copysign!");
9424 }
9425
9426 unsigned BitWidth = In1.getScalarValueSizeInBits();
9427 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
9428
9429 // We want to materialize a mask with every bit but the high bit set, but the
9430 // AdvSIMD immediate moves cannot materialize that in a single instruction for
9431 // 64-bit elements. Instead, materialize all bits set and then negate that.
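// Illustrative sketch of the f64/v2f64 path below: MOVI materializes
// all-ones, FNEG clears the sign bits to give 0x7FFFFFFFFFFFFFFF per lane,
// and the BSP then takes the magnitude bits from In1 and the sign bit from In2.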
9432 if (VT == MVT::f64 || VT == MVT::v2f64) {
9433 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
9434 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
9435 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
9436 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
9437 }
9438
9439 SDValue BSP =
9440 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
9441 if (VT == MVT::f16 || VT == MVT::bf16)
9442 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
9443 if (VT == MVT::f32)
9444 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
9445 if (VT == MVT::f64)
9446 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
9447
9448 return BitCast(VT, BSP, DAG);
9449}
9450
9451SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
9452 SelectionDAG &DAG) const {
9453 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
9454 Attribute::NoImplicitFloat))
9455 return SDValue();
9456
9457 if (!Subtarget->hasNEON())
9458 return SDValue();
9459
9460 bool IsParity = Op.getOpcode() == ISD::PARITY;
9461 SDValue Val = Op.getOperand(0);
9462 SDLoc DL(Op);
9463 EVT VT = Op.getValueType();
9464
9465 // For i32, the general parity computation using EORs is more efficient than
9466 // going through the floating-point unit.
9467 if (VT == MVT::i32 && IsParity)
9468 return SDValue();
9469
9470 // If there is no CNT instruction available, GPR popcount can
9471 // be more efficiently lowered to the following sequence that uses
9472 // AdvSIMD registers/instructions as long as the copies to/from
9473 // the AdvSIMD registers are cheap.
9474 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
9475 // CNT V0.8B, V0.8B // 8xbyte pop-counts
9476 // ADDV B0, V0.8B // sum 8xbyte pop-counts
9477 // UMOV X0, V0.B[0] // copy byte result back to integer reg
9478 if (VT == MVT::i32 || VT == MVT::i64) {
9479 if (VT == MVT::i32)
9480 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
9481 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
9482
9483 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
9484 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9485 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9486 DAG.getConstant(0, DL, MVT::i64));
9487
9488 if (IsParity)
9489 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9490 DAG.getConstant(1, DL, MVT::i32));
9491
9492 if (VT == MVT::i64)
9493 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
9494 return UaddLV;
9495 } else if (VT == MVT::i128) {
9496 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
9497
9498 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
9499 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9500 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9501 DAG.getConstant(0, DL, MVT::i64));
9502
9503 if (IsParity)
9504 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9505 DAG.getConstant(1, DL, MVT::i32));
9506
9507 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
9508 }
9509
9510 assert(!IsParity && "ISD::PARITY of vector types not supported");
9511
9512 if (VT.isScalableVector() ||
9514 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
9515
9516 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
9517 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
9518 "Unexpected type for custom ctpop lowering");
9519
9520 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
9521 Val = DAG.getBitcast(VT8Bit, Val);
9522 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
9523
9524 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
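// For example (illustrative), a v4i32 ctpop becomes:
//   cnt v0.16b, v0.16b
//   uaddlp v0.8h, v0.16b
//   uaddlp v0.4s, v0.8h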
9525 unsigned EltSize = 8;
9526 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
9527 while (EltSize != VT.getScalarSizeInBits()) {
9528 EltSize *= 2;
9529 NumElts /= 2;
9530 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
9531 Val = DAG.getNode(
9532 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
9533 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
9534 }
9535
9536 return Val;
9537}
9538
9539SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
9540 EVT VT = Op.getValueType();
9541 assert(VT.isScalableVector() ||
9542 useSVEForFixedLengthVectorVT(
9543 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
9544
9545 SDLoc DL(Op);
9546 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
9547 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
9548}
9549
9550SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
9551 SelectionDAG &DAG) const {
9552
9553 EVT VT = Op.getValueType();
9554 SDLoc DL(Op);
9555 unsigned Opcode = Op.getOpcode();
9556 ISD::CondCode CC;
9557 switch (Opcode) {
9558 default:
9559 llvm_unreachable("Wrong instruction");
9560 case ISD::SMAX:
9561 CC = ISD::SETGT;
9562 break;
9563 case ISD::SMIN:
9564 CC = ISD::SETLT;
9565 break;
9566 case ISD::UMAX:
9567 CC = ISD::SETUGT;
9568 break;
9569 case ISD::UMIN:
9570 CC = ISD::SETULT;
9571 break;
9572 }
9573
9574 if (VT.isScalableVector() ||
9575 useSVEForFixedLengthVectorVT(
9576 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
9577 switch (Opcode) {
9578 default:
9579 llvm_unreachable("Wrong instruction");
9580 case ISD::SMAX:
9581 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
9582 case ISD::SMIN:
9583 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
9584 case ISD::UMAX:
9585 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
9586 case ISD::UMIN:
9587 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
9588 }
9589 }
9590
9591 SDValue Op0 = Op.getOperand(0);
9592 SDValue Op1 = Op.getOperand(1);
9593 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
9594 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
9595}
9596
9597SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
9598 SelectionDAG &DAG) const {
9599 EVT VT = Op.getValueType();
9600
9601 if (VT.isScalableVector() ||
9602 useSVEForFixedLengthVectorVT(
9603 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
9604 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
9605
9606 SDLoc DL(Op);
9607 SDValue REVB;
9608 MVT VST;
9609
9610 switch (VT.getSimpleVT().SimpleTy) {
9611 default:
9612 llvm_unreachable("Invalid type for bitreverse!");
9613
9614 case MVT::v2i32: {
9615 VST = MVT::v8i8;
9616 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
9617
9618 break;
9619 }
9620
9621 case MVT::v4i32: {
9622 VST = MVT::v16i8;
9623 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
9624
9625 break;
9626 }
9627
9628 case MVT::v1i64: {
9629 VST = MVT::v8i8;
9630 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
9631
9632 break;
9633 }
9634
9635 case MVT::v2i64: {
9636 VST = MVT::v16i8;
9637 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
9638
9639 break;
9640 }
9641 }
9642
9643 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
9644 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
9645}
9646
9647 // Check whether N forms a continuous comparison chain of ORs over XOR leaves.
9648static bool
9649isOrXorChain(SDValue N, unsigned &Num,
9650 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
9651 if (Num == MaxXors)
9652 return false;
9653
9654 // Skip the one-use zext
9655 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
9656 N = N->getOperand(0);
9657
9658 // The leaf node must be XOR
9659 if (N->getOpcode() == ISD::XOR) {
9660 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
9661 Num++;
9662 return true;
9663 }
9664
9665 // All the non-leaf nodes must be OR.
9666 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
9667 return false;
9668
9669 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
9670 isOrXorChain(N->getOperand(1), Num, WorkList))
9671 return true;
9672 return false;
9673}
9674
9675 // Transform chains of ORs and XORs, which are usually outlined by memcmp/bcmp.
9676 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
9677 SDValue LHS = N->getOperand(0);
9678 SDValue RHS = N->getOperand(1);
9679 SDLoc DL(N);
9680 EVT VT = N->getValueType(0);
9681 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
9682
9683 // Only handle integer compares.
9684 if (N->getOpcode() != ISD::SETCC)
9685 return SDValue();
9686
9687 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
9688 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
9689 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
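// For example (illustrative), the expansion of memcmp(a, b, 16) == 0 produces
// (seteq (or (xor a0, b0), (xor a1, b1)), 0), which this combine rewrites so
// that it can be selected as roughly "cmp a0, b0; ccmp a1, b1, #0, eq; cset".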
9690 unsigned NumXors = 0;
9691 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
9692 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
9693 isOrXorChain(LHS, NumXors, WorkList)) {
9694 SDValue XOR0, XOR1;
9695 std::tie(XOR0, XOR1) = WorkList[0];
9696 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
9697 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9698 for (unsigned I = 1; I < WorkList.size(); I++) {
9699 std::tie(XOR0, XOR1) = WorkList[I];
9700 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9701 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
9702 }
9703
9704 // Exit early by inverting the condition, which helps reduce indentation.
9705 return Cmp;
9706 }
9707
9708 return SDValue();
9709}
9710
9711SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
9712
9713 if (Op.getValueType().isVector())
9714 return LowerVSETCC(Op, DAG);
9715
9716 bool IsStrict = Op->isStrictFPOpcode();
9717 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
9718 unsigned OpNo = IsStrict ? 1 : 0;
9719 SDValue Chain;
9720 if (IsStrict)
9721 Chain = Op.getOperand(0);
9722 SDValue LHS = Op.getOperand(OpNo + 0);
9723 SDValue RHS = Op.getOperand(OpNo + 1);
9724 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
9725 SDLoc dl(Op);
9726
9727 // We chose ZeroOrOneBooleanContents, so use zero and one.
9728 EVT VT = Op.getValueType();
9729 SDValue TVal = DAG.getConstant(1, dl, VT);
9730 SDValue FVal = DAG.getConstant(0, dl, VT);
9731
9732 // Handle f128 first, since one possible outcome is a normal integer
9733 // comparison which gets picked up by the next if statement.
9734 if (LHS.getValueType() == MVT::f128) {
9735 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
9736 IsSignaling);
9737
9738 // If softenSetCCOperands returned a scalar, use it.
9739 if (!RHS.getNode()) {
9740 assert(LHS.getValueType() == Op.getValueType() &&
9741 "Unexpected setcc expansion!");
9742 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
9743 }
9744 }
9745
9746 if (LHS.getValueType().isInteger()) {
9747 SDValue CCVal;
9748 SDValue Cmp = getAArch64Cmp(
9749 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
9750
9751 // Note that we inverted the condition above, so we reverse the order of
9752 // the true and false operands here. This will allow the setcc to be
9753 // matched to a single CSINC instruction.
9754 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
9755 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
9756 }
9757
9758 // Now we know we're dealing with FP values.
9759 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
9760 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9761
9762 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
9763 // and do the comparison.
9764 SDValue Cmp;
9765 if (IsStrict)
9766 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
9767 else
9768 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9769
9770 AArch64CC::CondCode CC1, CC2;
9771 changeFPCCToAArch64CC(CC, CC1, CC2);
9772 SDValue Res;
9773 if (CC2 == AArch64CC::AL) {
9774 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
9775 CC2);
9776 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9777
9778 // Note that we inverted the condition above, so we reverse the order of
9779 // the true and false operands here. This will allow the setcc to be
9780 // matched to a single CSINC instruction.
9781 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
9782 } else {
9783 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
9784 // totally clean. Some of them require two CSELs to implement. As is in
9785 // this case, we emit the first CSEL and then emit a second using the output
9786 // of the first as the RHS. We're effectively OR'ing the two CC's together.
9787
9788 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
9789 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9790 SDValue CS1 =
9791 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
9792
9793 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9794 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
9795 }
9796 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
9797}
9798
9799SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
9800 SelectionDAG &DAG) const {
9801
9802 SDValue LHS = Op.getOperand(0);
9803 SDValue RHS = Op.getOperand(1);
9804 EVT VT = LHS.getValueType();
9805 if (VT != MVT::i32 && VT != MVT::i64)
9806 return SDValue();
9807
9808 SDLoc DL(Op);
9809 SDValue Carry = Op.getOperand(2);
9810 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
9811 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
9812 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
9813 LHS, RHS, InvCarry);
9814
9815 EVT OpVT = Op.getValueType();
9816 SDValue TVal = DAG.getConstant(1, DL, OpVT);
9817 SDValue FVal = DAG.getConstant(0, DL, OpVT);
9818
9819 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
9820 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
9821 SDValue CCVal =
9822 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
9823 // Inputs are swapped because the condition is inverted. This will allow
9824 // matching with a single CSINC instruction.
9825 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
9826 Cmp.getValue(1));
9827}
9828
9829SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
9830 SDValue RHS, SDValue TVal,
9831 SDValue FVal, const SDLoc &dl,
9832 SelectionDAG &DAG) const {
9833 // Handle f128 first, because it will result in a comparison of some RTLIB
9834 // call result against zero.
9835 if (LHS.getValueType() == MVT::f128) {
9836 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9837
9838 // If softenSetCCOperands returned a scalar, we need to compare the result
9839 // against zero to select between true and false values.
9840 if (!RHS.getNode()) {
9841 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9842 CC = ISD::SETNE;
9843 }
9844 }
9845
9846 // Also handle f16, for which we need to do a f32 comparison.
9847 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
9848 LHS.getValueType() == MVT::bf16) {
9849 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
9850 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
9851 }
9852
9853 // Next, handle integers.
9854 if (LHS.getValueType().isInteger()) {
9855 assert((LHS.getValueType() == RHS.getValueType()) &&
9856 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9857
9858 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
9859 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
9860 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9861 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
9862 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
9863 // supported types.
9864 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
9865 CTVal->isOne() && CFVal->isAllOnes() &&
9866 LHS.getValueType() == TVal.getValueType()) {
9867 EVT VT = LHS.getValueType();
9868 SDValue Shift =
9869 DAG.getNode(ISD::SRA, dl, VT, LHS,
9870 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9871 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
9872 }
9873
9874 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
9875 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
9876 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
9877 // Both require fewer instructions than a compare and conditional select.
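// For example (illustrative), i32 smax(x, 0), i.e. (select_cc setgt, x, 0, x, 0),
// becomes "asr w8, w0, #31; bic w0, w0, w8" instead of a cmp+csel pair.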
9878 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
9879 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
9880 LHS.getValueType() == RHS.getValueType()) {
9881 EVT VT = LHS.getValueType();
9882 SDValue Shift =
9883 DAG.getNode(ISD::SRA, dl, VT, LHS,
9884 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9885
9886 if (CC == ISD::SETGT)
9887 Shift = DAG.getNOT(dl, Shift, VT);
9888
9889 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
9890 }
9891
9892 unsigned Opcode = AArch64ISD::CSEL;
9893
9894 // If both the TVal and the FVal are constants, see if we can swap them in
9895 // order to form a CSINV or CSINC out of them.
9896 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
9897 std::swap(TVal, FVal);
9898 std::swap(CTVal, CFVal);
9899 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9900 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
9901 std::swap(TVal, FVal);
9902 std::swap(CTVal, CFVal);
9903 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9904 } else if (TVal.getOpcode() == ISD::XOR) {
9905 // If TVal is a NOT we want to swap TVal and FVal so that we can match
9906 // with a CSINV rather than a CSEL.
9907 if (isAllOnesConstant(TVal.getOperand(1))) {
9908 std::swap(TVal, FVal);
9909 std::swap(CTVal, CFVal);
9910 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9911 }
9912 } else if (TVal.getOpcode() == ISD::SUB) {
9913 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
9914 // that we can match with a CSNEG rather than a CSEL.
9915 if (isNullConstant(TVal.getOperand(0))) {
9916 std::swap(TVal, FVal);
9917 std::swap(CTVal, CFVal);
9918 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9919 }
9920 } else if (CTVal && CFVal) {
9921 const int64_t TrueVal = CTVal->getSExtValue();
9922 const int64_t FalseVal = CFVal->getSExtValue();
9923 bool Swap = false;
9924
9925 // If both TVal and FVal are constants, see if FVal is the
9926 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
9927 // instead of a CSEL in that case.
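// For example (illustrative): TVal=5/FVal=-6 satisfies TVal == ~FVal (CSINV),
// TVal=5/FVal=-5 satisfies TVal == -FVal (CSNEG), and TVal=5/FVal=4 satisfies
// TVal == FVal+1 (CSINC).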
9928 if (TrueVal == ~FalseVal) {
9929 Opcode = AArch64ISD::CSINV;
9930 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
9931 TrueVal == -FalseVal) {
9932 Opcode = AArch64ISD::CSNEG;
9933 } else if (TVal.getValueType() == MVT::i32) {
9934 // If our operands are only 32-bit wide, make sure we use 32-bit
9935 // arithmetic for the check whether we can use CSINC. This ensures that
9936 // the addition in the check will wrap around properly in case there is
9937 // an overflow (which would not be the case if we do the check with
9938 // 64-bit arithmetic).
9939 const uint32_t TrueVal32 = CTVal->getZExtValue();
9940 const uint32_t FalseVal32 = CFVal->getZExtValue();
9941
9942 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
9943 Opcode = AArch64ISD::CSINC;
9944
9945 if (TrueVal32 > FalseVal32) {
9946 Swap = true;
9947 }
9948 }
9949 } else {
9950 // 64-bit check whether we can use CSINC.
9951 const uint64_t TrueVal64 = TrueVal;
9952 const uint64_t FalseVal64 = FalseVal;
9953
9954 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
9955 Opcode = AArch64ISD::CSINC;
9956
9957 if (TrueVal > FalseVal) {
9958 Swap = true;
9959 }
9960 }
9961 }
9962
9963 // Swap TVal and FVal if necessary.
9964 if (Swap) {
9965 std::swap(TVal, FVal);
9966 std::swap(CTVal, CFVal);
9967 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9968 }
9969
9970 if (Opcode != AArch64ISD::CSEL) {
9971 // Drop FVal since we can get its value by simply inverting/negating
9972 // TVal.
9973 FVal = TVal;
9974 }
9975 }
9976
9977 // Avoid materializing a constant when possible by reusing a known value in
9978 // a register. However, don't perform this optimization if the known value
9979 // is one, zero or negative one in the case of a CSEL. We can always
9980 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
9981 // FVal, respectively.
9982 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
9983 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
9984 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
9985 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9986 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
9987 // "a != C ? x : a" to avoid materializing C.
9988 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
9989 TVal = LHS;
9990 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
9991 FVal = LHS;
9992 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
9993 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
9994 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
9995 // avoid materializing C.
9996 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9997 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
9998 Opcode = AArch64ISD::CSINV;
9999 TVal = LHS;
10000 FVal = DAG.getConstant(0, dl, FVal.getValueType());
10001 }
10002 }
10003
10004 SDValue CCVal;
10005 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
10006 EVT VT = TVal.getValueType();
10007 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
10008 }
10009
10010 // Now we know we're dealing with FP values.
10011 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
10012 LHS.getValueType() == MVT::f64);
10013 assert(LHS.getValueType() == RHS.getValueType());
10014 EVT VT = TVal.getValueType();
10015 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
10016
10017 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10018 // clean. Some of them require two CSELs to implement.
10019 AArch64CC::CondCode CC1, CC2;
10020 changeFPCCToAArch64CC(CC, CC1, CC2);
10021
10022 if (DAG.getTarget().Options.UnsafeFPMath) {
10023 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
10024 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
10025 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
10026 if (RHSVal && RHSVal->isZero()) {
10027 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
10028 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
10029
10030 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
10031 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
10032 TVal = LHS;
10033 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
10034 CFVal && CFVal->isZero() &&
10035 FVal.getValueType() == LHS.getValueType())
10036 FVal = LHS;
10037 }
10038 }
10039
10040 // Emit first, and possibly only, CSEL.
10041 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10042 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
10043
10044 // If we need a second CSEL, emit it, using the output of the first as the
10045 // RHS. We're effectively OR'ing the two CC's together.
10046 if (CC2 != AArch64CC::AL) {
10047 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
10048 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
10049 }
10050
10051 // Otherwise, return the output of the first CSEL.
10052 return CS1;
10053}
10054
10055SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
10056 SelectionDAG &DAG) const {
10057 EVT Ty = Op.getValueType();
10058 auto Idx = Op.getConstantOperandAPInt(2);
10059 int64_t IdxVal = Idx.getSExtValue();
10060 assert(Ty.isScalableVector() &&
10061 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
10062
10063 // We can use the splice instruction for certain index values where we are
10064 // able to efficiently generate the correct predicate. The index will be
10065 // inverted and used directly as the input to the ptrue instruction, i.e.
10066 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
10067 // splice predicate. However, we can only do this if we can guarantee that
10068 // there are enough elements in the vector, hence we check the index <= min
10069 // number of elements.
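// For example (illustrative), splicing two nxv4i32 vectors at index -2 is
// lowered to roughly "ptrue p0.s, vl2; rev p0.s, p0.s;
// splice z0.s, p0, z0.s, z1.s".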
10070 std::optional<unsigned> PredPattern;
10071 if (Ty.isScalableVector() && IdxVal < 0 &&
10072 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
10073 std::nullopt) {
10074 SDLoc DL(Op);
10075
10076 // Create a predicate where all but the last -IdxVal elements are false.
10077 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
10078 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
10079 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
10080
10081 // Now splice the two inputs together using the predicate.
10082 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
10083 Op.getOperand(1));
10084 }
10085
10086 // This will select to an EXT instruction, which has a maximum immediate
10087 // value of 255, hence 2048-bits is the maximum value we can lower.
10088 if (IdxVal >= 0 &&
10089 IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
10090 return Op;
10091
10092 return SDValue();
10093}
10094
10095SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
10096 SelectionDAG &DAG) const {
10097 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
10098 SDValue LHS = Op.getOperand(0);
10099 SDValue RHS = Op.getOperand(1);
10100 SDValue TVal = Op.getOperand(2);
10101 SDValue FVal = Op.getOperand(3);
10102 SDLoc DL(Op);
10103 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10104}
10105
10106SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
10107 SelectionDAG &DAG) const {
10108 SDValue CCVal = Op->getOperand(0);
10109 SDValue TVal = Op->getOperand(1);
10110 SDValue FVal = Op->getOperand(2);
10111 SDLoc DL(Op);
10112
10113 EVT Ty = Op.getValueType();
10114 if (Ty == MVT::aarch64svcount) {
10115 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
10116 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
10117 SDValue Sel =
10118 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
10119 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
10120 }
10121
10122 if (Ty.isScalableVector()) {
10123 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
10124 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
10125 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10126 }
10127
10128 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
10129 // FIXME: Ideally this would be the same as above using i1 types, however
10130 // for the moment we can't deal with fixed i1 vector types properly, so
10131 // instead extend the predicate to a result type sized integer vector.
10132 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
10133 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
10134 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
10135 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
10136 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10137 }
10138
10139 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
10140 // instruction.
10141 if (ISD::isOverflowIntrOpRes(CCVal)) {
10142 // Only lower legal XALUO ops.
10143 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
10144 return SDValue();
10145
10146 AArch64CC::CondCode OFCC;
10147 SDValue Value, Overflow;
10148 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
10149 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
10150
10151 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
10152 CCVal, Overflow);
10153 }
10154
10155 // Lower it the same way as we would lower a SELECT_CC node.
10156 ISD::CondCode CC;
10157 SDValue LHS, RHS;
10158 if (CCVal.getOpcode() == ISD::SETCC) {
10159 LHS = CCVal.getOperand(0);
10160 RHS = CCVal.getOperand(1);
10161 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
10162 } else {
10163 LHS = CCVal;
10164 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
10165 CC = ISD::SETNE;
10166 }
10167
10168 // If we are lowering an f16/bf16 and do not have full fp16 support, convert
10169 // to an f32 in order to use FCSELSrrr.
10170 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10171 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10172 DAG.getUNDEF(MVT::f32), TVal);
10173 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10174 DAG.getUNDEF(MVT::f32), FVal);
10175 }
10176
10177 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10178
10179 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10180 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
10181 }
10182
10183 return Res;
10184}
10185
10186SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
10187 SelectionDAG &DAG) const {
10188 // Jump table entries are emitted as PC-relative offsets. No additional
10189 // tweaking is necessary here. Just get the address of the jump table.
10190 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
10191
10194 !Subtarget->isTargetMachO())
10195 return getAddrLarge(JT, DAG);
10196 if (CM == CodeModel::Tiny)
10197 return getAddrTiny(JT, DAG);
10198 return getAddr(JT, DAG);
10199}
10200
10201SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
10202 SelectionDAG &DAG) const {
10203 // Jump table entries are emitted as PC-relative offsets. No additional
10204 // tweaking is necessary here. Just get the address of the jump table.
10205 SDLoc DL(Op);
10206 SDValue JT = Op.getOperand(1);
10207 SDValue Entry = Op.getOperand(2);
10208 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
10209
10210 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10211 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
10212
10213 SDNode *Dest =
10214 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
10215 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
10216 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
10217 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
10218}
10219
10220SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
10221 SelectionDAG &DAG) const {
10222 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
10223 CodeModel::Model CM = getTargetMachine().getCodeModel();
10224 if (CM == CodeModel::Large) {
10225 // Use the GOT for the large code model on iOS.
10226 if (Subtarget->isTargetMachO()) {
10227 return getGOT(CP, DAG);
10228 }
10230 return getAddrLarge(CP, DAG);
10231 } else if (CM == CodeModel::Tiny) {
10232 return getAddrTiny(CP, DAG);
10233 }
10234 return getAddr(CP, DAG);
10235}
10236
10237SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
10238 SelectionDAG &DAG) const {
10239 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
10240 CodeModel::Model CM = getTargetMachine().getCodeModel();
10241 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
10243 return getAddrLarge(BA, DAG);
10244 } else if (CM == CodeModel::Tiny) {
10245 return getAddrTiny(BA, DAG);
10246 }
10247 return getAddr(BA, DAG);
10248}
10249
10250SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
10251 SelectionDAG &DAG) const {
10252 AArch64FunctionInfo *FuncInfo =
10253 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10254
10255 SDLoc DL(Op);
10256 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
10257 getPointerTy(DAG.getDataLayout()));
10258 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
10259 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10260 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10261 MachinePointerInfo(SV));
10262}
10263
10264SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
10265 SelectionDAG &DAG) const {
10266 MachineFunction &MF = DAG.getMachineFunction();
10267 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10268
10269 SDLoc DL(Op);
10270 SDValue FR;
10271 if (Subtarget->isWindowsArm64EC()) {
10272 // With the Arm64EC ABI, we compute the address of the varargs save area
10273 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
10274 // but calls from an entry thunk can pass in a different address.
10275 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
10276 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
10277 uint64_t StackOffset;
10278 if (FuncInfo->getVarArgsGPRSize() > 0)
10279 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
10280 else
10281 StackOffset = FuncInfo->getVarArgsStackOffset();
10282 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
10283 DAG.getConstant(StackOffset, DL, MVT::i64));
10284 } else {
10285 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
10286 ? FuncInfo->getVarArgsGPRIndex()
10287 : FuncInfo->getVarArgsStackIndex(),
10288 getPointerTy(DAG.getDataLayout()));
10289 }
10290 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10291 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10292 MachinePointerInfo(SV));
10293}
10294
10295SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
10296 SelectionDAG &DAG) const {
10297 // The layout of the va_list struct is specified in the AArch64 Procedure Call
10298 // Standard, section B.3.
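// For reference, that layout is (LP64 offsets, ILP32 in parentheses):
//   void *__stack;   at offset 0
//   void *__gr_top;  at offset 8 (4)
//   void *__vr_top;  at offset 16 (8)
//   int   __gr_offs; at offset 24 (12)
//   int   __vr_offs; at offset 28 (16)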
10299 MachineFunction &MF = DAG.getMachineFunction();
10300 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10301 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10302 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10303 auto PtrVT = getPointerTy(DAG.getDataLayout());
10304 SDLoc DL(Op);
10305
10306 SDValue Chain = Op.getOperand(0);
10307 SDValue VAList = Op.getOperand(1);
10308 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10309 SmallVector<SDValue, 4> MemOps;
10310
10311 // void *__stack at offset 0
10312 unsigned Offset = 0;
10313 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
10314 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
10315 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
10316 MachinePointerInfo(SV), Align(PtrSize)));
10317
10318 // void *__gr_top at offset 8 (4 on ILP32)
10319 Offset += PtrSize;
10320 int GPRSize = FuncInfo->getVarArgsGPRSize();
10321 if (GPRSize > 0) {
10322 SDValue GRTop, GRTopAddr;
10323
10324 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10325 DAG.getConstant(Offset, DL, PtrVT));
10326
10327 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
10328 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
10329 DAG.getConstant(GPRSize, DL, PtrVT));
10330 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
10331
10332 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
10333 MachinePointerInfo(SV, Offset),
10334 Align(PtrSize)));
10335 }
10336
10337 // void *__vr_top at offset 16 (8 on ILP32)
10338 Offset += PtrSize;
10339 int FPRSize = FuncInfo->getVarArgsFPRSize();
10340 if (FPRSize > 0) {
10341 SDValue VRTop, VRTopAddr;
10342 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10343 DAG.getConstant(Offset, DL, PtrVT));
10344
10345 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
10346 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
10347 DAG.getConstant(FPRSize, DL, PtrVT));
10348 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
10349
10350 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
10351 MachinePointerInfo(SV, Offset),
10352 Align(PtrSize)));
10353 }
10354
10355 // int __gr_offs at offset 24 (12 on ILP32)
10356 Offset += PtrSize;
10357 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10358 DAG.getConstant(Offset, DL, PtrVT));
10359 MemOps.push_back(
10360 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
10361 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10362
10363 // int __vr_offs at offset 28 (16 on ILP32)
10364 Offset += 4;
10365 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10366 DAG.getConstant(Offset, DL, PtrVT));
10367 MemOps.push_back(
10368 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
10369 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10370
10371 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
10372}
10373
10374SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
10375 SelectionDAG &DAG) const {
10376 MachineFunction &MF = DAG.getMachineFunction();
10377
10378 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
10379 return LowerWin64_VASTART(Op, DAG);
10380 else if (Subtarget->isTargetDarwin())
10381 return LowerDarwin_VASTART(Op, DAG);
10382 else
10383 return LowerAAPCS_VASTART(Op, DAG);
10384}
10385
10386SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
10387 SelectionDAG &DAG) const {
10388 // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single
10389 // pointer.
10390 SDLoc DL(Op);
10391 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10392 unsigned VaListSize =
10393 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
10394 ? PtrSize
10395 : Subtarget->isTargetILP32() ? 20 : 32;
10396 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
10397 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10398
10399 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
10400 DAG.getConstant(VaListSize, DL, MVT::i32),
10401 Align(PtrSize), false, false, false,
10402 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
10403}
10404
10405SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
10406 assert(Subtarget->isTargetDarwin() &&
10407 "automatic va_arg instruction only works on Darwin");
10408
10409 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10410 EVT VT = Op.getValueType();
10411 SDLoc DL(Op);
10412 SDValue Chain = Op.getOperand(0);
10413 SDValue Addr = Op.getOperand(1);
10414 MaybeAlign Align(Op.getConstantOperandVal(3));
10415 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
10416 auto PtrVT = getPointerTy(DAG.getDataLayout());
10417 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10418 SDValue VAList =
10419 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
10420 Chain = VAList.getValue(1);
10421 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
10422
10423 if (VT.isScalableVector())
10424 report_fatal_error("Passing SVE types to variadic functions is "
10425 "currently not supported");
10426
10427 if (Align && *Align > MinSlotSize) {
10428 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10429 DAG.getConstant(Align->value() - 1, DL, PtrVT));
10430 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
10431 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
10432 }
10433
10434 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
10435 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
10436
10437 // Scalar integer and FP values smaller than 64 bits are implicitly extended
10438 // up to 64 bits. At the very least, we have to increase the striding of the
10439 // vaargs list to match this, and for FP values we need to introduce
10440 // FP_ROUND nodes as well.
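// For example (illustrative), va_arg(ap, float) loads the 8-byte slot as an
// f64 (floats are promoted to double when passed variadically) and the
// FP_ROUND below narrows the value back to f32.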
10441 if (VT.isInteger() && !VT.isVector())
10442 ArgSize = std::max(ArgSize, MinSlotSize);
10443 bool NeedFPTrunc = false;
10444 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
10445 ArgSize = 8;
10446 NeedFPTrunc = true;
10447 }
10448
10449 // Increment the pointer, VAList, to the next vaarg
10450 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10451 DAG.getConstant(ArgSize, DL, PtrVT));
10452 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
10453
10454 // Store the incremented VAList to the legalized pointer
10455 SDValue APStore =
10456 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
10457
10458 // Load the actual argument out of the pointer VAList
10459 if (NeedFPTrunc) {
10460 // Load the value as an f64.
10461 SDValue WideFP =
10462 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
10463 // Round the value down to an f32.
10464 SDValue NarrowFP =
10465 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
10466 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
10467 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
10468 // Merge the rounded value with the chain output of the load.
10469 return DAG.getMergeValues(Ops, DL);
10470 }
10471
10472 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
10473}
10474
10475SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
10476 SelectionDAG &DAG) const {
10477 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10478 MFI.setFrameAddressIsTaken(true);
10479
10480 EVT VT = Op.getValueType();
10481 SDLoc DL(Op);
10482 unsigned Depth = Op.getConstantOperandVal(0);
10483 SDValue FrameAddr =
10484 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
10485 while (Depth--)
10486 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
10487 MachinePointerInfo());
10488
10489 if (Subtarget->isTargetILP32())
10490 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
10491 DAG.getValueType(VT));
10492
10493 return FrameAddr;
10494}
10495
10496SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
10497 SelectionDAG &DAG) const {
10498 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10499
10500 EVT VT = getPointerTy(DAG.getDataLayout());
10501 SDLoc DL(Op);
10502 int FI = MFI.CreateFixedObject(4, 0, false);
10503 return DAG.getFrameIndex(FI, VT);
10504}
10505
10506#define GET_REGISTER_MATCHER
10507#include "AArch64GenAsmMatcher.inc"
10508
10509// FIXME? Maybe this could be a TableGen attribute on some registers and
10510// this table could be generated automatically from RegInfo.
10511Register AArch64TargetLowering::
10512getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
10513 Register Reg = MatchRegisterName(RegName);
10514 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
10515 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
10516 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
10517 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
10518 !MRI->isReservedReg(MF, Reg))
10519 Reg = 0;
10520 }
10521 if (Reg)
10522 return Reg;
10523 report_fatal_error(Twine("Invalid register name \""
10524 + StringRef(RegName) + "\"."));
10525}
10526
10527SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
10528 SelectionDAG &DAG) const {
10529 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
10530
10531 EVT VT = Op.getValueType();
10532 SDLoc DL(Op);
10533
10534 SDValue FrameAddr =
10535 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
10536 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10537
10538 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
10539}
10540
10541SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
10542 SelectionDAG &DAG) const {
10543 MachineFunction &MF = DAG.getMachineFunction();
10544 MachineFrameInfo &MFI = MF.getFrameInfo();
10545 MFI.setReturnAddressIsTaken(true);
10546
10547 EVT VT = Op.getValueType();
10548 SDLoc DL(Op);
10549 unsigned Depth = Op.getConstantOperandVal(0);
10550 SDValue ReturnAddress;
10551 if (Depth) {
10552 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
10553 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10554 ReturnAddress = DAG.getLoad(
10555 VT, DL, DAG.getEntryNode(),
10556 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
10557 } else {
10558 // Return LR, which contains the return address. Mark it an implicit
10559 // live-in.
10560 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
10561 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
10562 }
10563
10564 // The XPACLRI instruction assembles to a hint-space instruction before
10565 // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A
10566 // architecture. On Armv8.3-A and onwards, XPACI is available, so use
10567 // that instead.
10568 SDNode *St;
10569 if (Subtarget->hasPAuth()) {
10570 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
10571 } else {
10572 // XPACLRI operates on LR therefore we must move the operand accordingly.
10573 SDValue Chain =
10574 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
10575 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
10576 }
10577 return SDValue(St, 0);
10578}
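
// Illustrative only (hypothetical user-side code, not part of this file):
// the lowering above is what backs __builtin_return_address on AArch64.
// With pointer authentication enabled, the signature bits in the returned
// address are stripped by the XPACI / XPACLRI sequence selected above.
static inline void *ExampleCallerReturnAddress() {
  // Depth 0: the current frame's LR, with any PAC bits removed.
  return __builtin_return_address(0);
}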
10579
10580 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
10581 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
10582SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
10583 SelectionDAG &DAG) const {
10584 SDValue Lo, Hi;
10585 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
10586 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
10587}
10588
10589 bool AArch64TargetLowering::isOffsetFoldingLegal(
10590 const GlobalAddressSDNode *GA) const {
10591 // Offsets are folded in the DAG combine rather than here so that we can
10592 // intelligently choose an offset based on the uses.
10593 return false;
10594}
10595
10596 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
10597 bool OptForSize) const {
10598 bool IsLegal = false;
10599 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
10600 // 16-bit case when target has full fp16 support.
10601 // We encode bf16 bit patterns as if they were fp16. This results in very
10602 // strange looking assembly but should populate the register with appropriate
10603 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
10604 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
10605 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
10606 // FIXME: We should be able to handle f128 as well with a clever lowering.
10607 const APInt ImmInt = Imm.bitcastToAPInt();
10608 if (VT == MVT::f64)
10609 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
10610 else if (VT == MVT::f32)
10611 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
10612 else if (VT == MVT::f16 || VT == MVT::bf16)
10613 IsLegal =
10614 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
10615 Imm.isPosZero();
10616
10617 // If we cannot materialize the value in an FMOV immediate field, check if the
10618 // value can be encoded as the immediate operand of a logical instruction.
10619 // The immediate value will be created with either MOVZ, MOVN, or ORR.
10620 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
10621 // generate that fmov.
10622 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
10623 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
10624 // however the mov+fmov sequence is always better because of the reduced
10625 // cache pressure. The timings are still the same if you consider
10626 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
10627 // movw+movk is fused). So we limit the expansion to at most 2 instructions.
10628 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
10629 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
10630 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
10631 IsLegal = Insn.size() <= Limit;
10632 }
10633
10634 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
10635 << " imm value: "; Imm.dump(););
10636 return IsLegal;
10637}
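
// A minimal standalone sketch (hypothetical helper, not the actual
// AArch64_AM::getFP64Imm) of the value set accepted by FMOV (immediate),
// which the check above relies on: +/-(1 + m/16) * 2^e with m in [0, 15]
// and e in [-3, 4]. +0.0 is accepted separately above and materialized as
// "fmov dN, xzr" rather than via the 8-bit immediate encoding.
#include <cmath> // illustrative only; frexp/fabs/floor/isnan/isinf
static bool ExampleIsFmovImmediate(double V) {
  if (std::isnan(V) || std::isinf(V) || V == 0.0)
    return false;
  int Exp;
  double Frac = std::frexp(std::fabs(V), &Exp); // |V| = Frac * 2^Exp, Frac in [0.5, 1)
  double Mant = Frac * 2.0;                     // Mant in [1, 2)
  int E = Exp - 1;                              // |V| = Mant * 2^E
  if (E < -3 || E > 4)
    return false;
  double Sixteenths = (Mant - 1.0) * 16.0;      // must be an integer 0..15
  return Sixteenths == std::floor(Sixteenths);
}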
10638
10639//===----------------------------------------------------------------------===//
10640// AArch64 Optimization Hooks
10641//===----------------------------------------------------------------------===//
10642
10643static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
10644 SDValue Operand, SelectionDAG &DAG,
10645 int &ExtraSteps) {
10646 EVT VT = Operand.getValueType();
10647 if ((ST->hasNEON() &&
10648 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
10649 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
10650 VT == MVT::v4f32)) ||
10651 (ST->hasSVE() &&
10652 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
10653 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
10654 // For the reciprocal estimates, convergence is quadratic, so the number
10655 // of digits is doubled after each iteration. In ARMv8, the accuracy of
10656 // the initial estimate is 2^-8. Thus the number of extra steps to refine
10657 // the result for float (23 mantissa bits) is 2 and for double (52
10658 // mantissa bits) is 3.
10659 constexpr unsigned AccurateBits = 8;
10660 unsigned DesiredBits =
10662 ExtraSteps = DesiredBits <= AccurateBits
10663 ? 0
10664 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
10665 }
10666
10667 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
10668 }
10669
10670 return SDValue();
10671}
10672
10673SDValue
10674AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
10675 const DenormalMode &Mode) const {
10676 SDLoc DL(Op);
10677 EVT VT = Op.getValueType();
10678 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
10679 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
10680 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
10681}
10682
10683SDValue
10684AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
10685 SelectionDAG &DAG) const {
10686 return Op;
10687}
10688
10689SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
10690 SelectionDAG &DAG, int Enabled,
10691 int &ExtraSteps,
10692 bool &UseOneConst,
10693 bool Reciprocal) const {
10694 if (Enabled == ReciprocalEstimate::Enabled ||
10695 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
10696 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
10697 DAG, ExtraSteps)) {
10698 SDLoc DL(Operand);
10699 EVT VT = Operand.getValueType();
10700
10701 SDNodeFlags Flags;
10702 Flags.setAllowReassociation(true);
10703
10704 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
10705 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
10706 for (int i = ExtraSteps; i > 0; --i) {
10707 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
10708 Flags);
10709 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
10710 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10711 }
10712 if (!Reciprocal)
10713 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
10714
10715 ExtraSteps = 0;
10716 return Estimate;
10717 }
10718
10719 return SDValue();
10720}
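
// Scalar sketch (illustrative, hypothetical helper) of the refinement the
// FRSQRTE/FRSQRTS sequence above performs. Each step is the Newton iteration
// e <- e * 0.5 * (3 - x * e * e), where 0.5 * (3 - m * n) is exactly what
// FRSQRTS computes for m = x and n = e * e.
static double ExampleRSqrtRefine(double X, double Estimate, int ExtraSteps) {
  for (int I = 0; I < ExtraSteps; ++I) {
    double Step = 0.5 * (3.0 - X * Estimate * Estimate); // FRSQRTS(x, e*e)
    Estimate = Estimate * Step;                          // FMUL
  }
  return Estimate; // approximates 1/sqrt(X)
}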
10721
10722SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
10723 SelectionDAG &DAG, int Enabled,
10724 int &ExtraSteps) const {
10725 if (Enabled == ReciprocalEstimate::Enabled)
10726 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
10727 DAG, ExtraSteps)) {
10728 SDLoc DL(Operand);
10729 EVT VT = Operand.getValueType();
10730
10731 SDNodeFlags Flags;
10732 Flags.setAllowReassociation(true);
10733
10734 // Newton reciprocal iteration: E * (2 - X * E)
10735 // AArch64 reciprocal iteration instruction: (2 - M * N)
10736 for (int i = ExtraSteps; i > 0; --i) {
10737 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
10738 Estimate, Flags);
10739 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10740 }
10741
10742 ExtraSteps = 0;
10743 return Estimate;
10744 }
10745
10746 return SDValue();
10747}
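
// Scalar sketch (illustrative, hypothetical helper) of the FRECPE/FRECPS
// refinement above: e <- e * (2 - x * e), where FRECPS(m, n) computes 2 - m * n.
static double ExampleRecipRefine(double X, double Estimate, int ExtraSteps) {
  for (int I = 0; I < ExtraSteps; ++I)
    Estimate = Estimate * (2.0 - X * Estimate); // FRECPS step followed by FMUL
  return Estimate; // approximates 1/X
}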
10748
10749//===----------------------------------------------------------------------===//
10750// AArch64 Inline Assembly Support
10751//===----------------------------------------------------------------------===//
10752
10753// Table of Constraints
10754// TODO: This is the current set of constraints supported by ARM for the
10755 // compiler; not all of them may make sense.
10756//
10757// r - A general register
10758// w - An FP/SIMD register of some size in the range v0-v31
10759// x - An FP/SIMD register of some size in the range v0-v15
10760// I - Constant that can be used with an ADD instruction
10761// J - Constant that can be used with a SUB instruction
10762// K - Constant that can be used with a 32-bit logical instruction
10763// L - Constant that can be used with a 64-bit logical instruction
10764// M - Constant that can be used as a 32-bit MOV immediate
10765// N - Constant that can be used as a 64-bit MOV immediate
10766// Q - A memory reference with base register and no offset
10767// S - A symbolic address
10768// Y - Floating point constant zero
10769// Z - Integer constant zero
10770//
10771// Note that general register operands will be output using their 64-bit x
10772// register name, whatever the size of the variable, unless the asm operand
10773// is prefixed by the %w modifier. Floating-point and SIMD register operands
10774// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
10775// %q modifier.
10776const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
10777 // At this point, we have to lower this constraint to something else, so we
10778 // lower it to an "r" or "w". However, by doing this we will force the result
10779 // to be in register, while the X constraint is much more permissive.
10780 //
10781 // Although we are correct (we are free to emit anything, without
10782 // constraints), we might break use cases that would expect us to be more
10783 // efficient and emit something else.
10784 if (!Subtarget->hasFPARMv8())
10785 return "r";
10786
10787 if (ConstraintVT.isFloatingPoint())
10788 return "w";
10789
10790 if (ConstraintVT.isVector() &&
10791 (ConstraintVT.getSizeInBits() == 64 ||
10792 ConstraintVT.getSizeInBits() == 128))
10793 return "w";
10794
10795 return "r";
10796}
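
// Hypothetical user-side example (not part of this lowering) exercising a few
// of the constraints documented above: "r" (general register), "w" (FP/SIMD
// register), "I" (ADD immediate) and "Q" (base-register-only memory operand).
static inline double ExampleInlineAsmConstraints(double A, long B, long *P) {
  double D;
  long R;
  asm("fadd %d0, %d1, %d1" : "=w"(D) : "w"(A));           // 'w': FP/SIMD reg
  asm("add %x0, %x1, %2"   : "=r"(R) : "r"(B), "I"(42));  // 'I': ADD immediate
  asm("ldr %x0, %1"        : "=r"(R) : "Q"(*P));          // 'Q': [base] only
  return D + R;
}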
10797
10799
10800static std::optional<PredicateConstraint>
10803 .Case("Uph", PredicateConstraint::Uph)
10804 .Case("Upl", PredicateConstraint::Upl)
10805 .Case("Upa", PredicateConstraint::Upa)
10806 .Default(std::nullopt);
10807}
10808
10809static const TargetRegisterClass *
10810 getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
10811 if (VT != MVT::aarch64svcount &&
10812 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
10813 return nullptr;
10814
10815 switch (Constraint) {
10816 case PredicateConstraint::Uph:
10817 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
10818 : &AArch64::PPR_p8to15RegClass;
10819 case PredicateConstraint::Upl:
10820 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
10821 : &AArch64::PPR_3bRegClass;
10822 case PredicateConstraint::Upa:
10823 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
10824 : &AArch64::PPRRegClass;
10825 }
10826
10827 llvm_unreachable("Missing PredicateConstraint!");
10828}
10829
10831
10832static std::optional<ReducedGprConstraint>
10835 .Case("Uci", ReducedGprConstraint::Uci)
10836 .Case("Ucj", ReducedGprConstraint::Ucj)
10837 .Default(std::nullopt);
10838}
10839
10840static const TargetRegisterClass *
10841 getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
10842 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
10843 return nullptr;
10844
10845 switch (Constraint) {
10846 case ReducedGprConstraint::Uci:
10847 return &AArch64::MatrixIndexGPR32_8_11RegClass;
10848 case ReducedGprConstraint::Ucj:
10849 return &AArch64::MatrixIndexGPR32_12_15RegClass;
10850 }
10851
10852 llvm_unreachable("Missing ReducedGprConstraint!");
10853}
10854
10855 // The set of cc codes supported is from
10856// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
10859 .Case("{@cchi}", AArch64CC::HI)
10860 .Case("{@cccs}", AArch64CC::HS)
10861 .Case("{@cclo}", AArch64CC::LO)
10862 .Case("{@ccls}", AArch64CC::LS)
10863 .Case("{@cccc}", AArch64CC::LO)
10864 .Case("{@cceq}", AArch64CC::EQ)
10865 .Case("{@ccgt}", AArch64CC::GT)
10866 .Case("{@ccge}", AArch64CC::GE)
10867 .Case("{@cclt}", AArch64CC::LT)
10868 .Case("{@ccle}", AArch64CC::LE)
10869 .Case("{@cchs}", AArch64CC::HS)
10870 .Case("{@ccne}", AArch64CC::NE)
10871 .Case("{@ccvc}", AArch64CC::VC)
10872 .Case("{@ccpl}", AArch64CC::PL)
10873 .Case("{@ccvs}", AArch64CC::VS)
10874 .Case("{@ccmi}", AArch64CC::MI)
10875 .Default(AArch64CC::Invalid);
10876 return Cond;
10877}
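
// Hypothetical user-side example of the flag-output constraints parsed above
// (GCC-style "=@cc<cond>" outputs): the EQ flag after the asm is materialized
// by LowerAsmOutputForConstraint below via a CSINC of NZCV.
static inline bool ExampleFlagOutput(long A, long B) {
  bool Eq;
  asm("cmp %x1, %x2" : "=@cceq"(Eq) : "r"(A), "r"(B) : "cc");
  return Eq;
}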
10878
10879/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
10880/// WZR, invert(<cond>)'.
10881 static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
10882 SelectionDAG &DAG) {
10883 return DAG.getNode(
10884 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
10885 DAG.getConstant(0, DL, MVT::i32),
10886 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
10887}
10888
10889// Lower @cc flag output via getSETCC.
10890SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
10891 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
10892 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
10893 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
10894 if (Cond == AArch64CC::Invalid)
10895 return SDValue();
10896 // The output variable should be a scalar integer.
10897 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
10898 OpInfo.ConstraintVT.getSizeInBits() < 8)
10899 report_fatal_error("Flag output operand is of invalid type");
10900
10901 // Get NZCV register. Only update chain when copyfrom is glued.
10902 if (Glue.getNode()) {
10903 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
10904 Chain = Glue.getValue(1);
10905 } else
10906 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
10907 // Extract CC code.
10908 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
10909
10911
10912 // Truncate or ZERO_EXTEND based on value types.
10913 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
10914 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
10915 else
10916 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
10917
10918 return Result;
10919}
10920
10921/// getConstraintType - Given a constraint letter, return the type of
10922/// constraint it is for this target.
10923 AArch64TargetLowering::ConstraintType
10924 AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
10925 if (Constraint.size() == 1) {
10926 switch (Constraint[0]) {
10927 default:
10928 break;
10929 case 'x':
10930 case 'w':
10931 case 'y':
10932 return C_RegisterClass;
10933 // An address with a single base register. Due to the way we
10934 // currently handle addresses it is the same as 'r'.
10935 case 'Q':
10936 return C_Memory;
10937 case 'I':
10938 case 'J':
10939 case 'K':
10940 case 'L':
10941 case 'M':
10942 case 'N':
10943 case 'Y':
10944 case 'Z':
10945 return C_Immediate;
10946 case 'z':
10947 case 'S': // A symbol or label reference with a constant offset
10948 return C_Other;
10949 }
10950 } else if (parsePredicateConstraint(Constraint))
10951 return C_RegisterClass;
10952 else if (parseReducedGprConstraint(Constraint))
10953 return C_RegisterClass;
10954 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
10955 return C_Other;
10956 return TargetLowering::getConstraintType(Constraint);
10957}
10958
10959/// Examine constraint type and operand type and determine a weight value.
10960/// This object must already have been set up with the operand type
10961/// and the current alternative constraint selected.
10962 TargetLowering::ConstraintWeight
10963 AArch64TargetLowering::getSingleConstraintMatchWeight(
10964 AsmOperandInfo &info, const char *constraint) const {
10965 ConstraintWeight weight = CW_Invalid;
10966 Value *CallOperandVal = info.CallOperandVal;
10967 // If we don't have a value, we can't do a match,
10968 // but allow it at the lowest weight.
10969 if (!CallOperandVal)
10970 return CW_Default;
10971 Type *type = CallOperandVal->getType();
10972 // Look at the constraint type.
10973 switch (*constraint) {
10974 default:
10975 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
10976 break;
10977 case 'x':
10978 case 'w':
10979 case 'y':
10980 if (type->isFloatingPointTy() || type->isVectorTy())
10981 weight = CW_Register;
10982 break;
10983 case 'z':
10984 weight = CW_Constant;
10985 break;
10986 case 'U':
10987 if (parsePredicateConstraint(constraint) ||
10988 parseReducedGprConstraint(constraint))
10989 weight = CW_Register;
10990 break;
10991 }
10992 return weight;
10993}
10994
10995std::pair<unsigned, const TargetRegisterClass *>
10996AArch64TargetLowering::getRegForInlineAsmConstraint(
10997 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
10998 if (Constraint.size() == 1) {
10999 switch (Constraint[0]) {
11000 case 'r':
11001 if (VT.isScalableVector())
11002 return std::make_pair(0U, nullptr);
11003 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
11004 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
11005 if (VT.getFixedSizeInBits() == 64)
11006 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
11007 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
11008 case 'w': {
11009 if (!Subtarget->hasFPARMv8())
11010 break;
11011 if (VT.isScalableVector()) {
11012 if (VT.getVectorElementType() != MVT::i1)
11013 return std::make_pair(0U, &AArch64::ZPRRegClass);
11014 return std::make_pair(0U, nullptr);
11015 }
11016 uint64_t VTSize = VT.getFixedSizeInBits();
11017 if (VTSize == 16)
11018 return std::make_pair(0U, &AArch64::FPR16RegClass);
11019 if (VTSize == 32)
11020 return std::make_pair(0U, &AArch64::FPR32RegClass);
11021 if (VTSize == 64)
11022 return std::make_pair(0U, &AArch64::FPR64RegClass);
11023 if (VTSize == 128)
11024 return std::make_pair(0U, &AArch64::FPR128RegClass);
11025 break;
11026 }
11027 // The instructions that this constraint is designed for can
11028 // only take 128-bit registers so just use that regclass.
11029 case 'x':
11030 if (!Subtarget->hasFPARMv8())
11031 break;
11032 if (VT.isScalableVector())
11033 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
11034 if (VT.getSizeInBits() == 128)
11035 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
11036 break;
11037 case 'y':
11038 if (!Subtarget->hasFPARMv8())
11039 break;
11040 if (VT.isScalableVector())
11041 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
11042 break;
11043 }
11044 } else {
11045 if (const auto PC = parsePredicateConstraint(Constraint))
11046 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
11047 return std::make_pair(0U, RegClass);
11048
11049 if (const auto RGC = parseReducedGprConstraint(Constraint))
11050 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
11051 return std::make_pair(0U, RegClass);
11052 }
11053 if (StringRef("{cc}").equals_insensitive(Constraint) ||
11054 parseConstraintCode(Constraint) != AArch64CC::Invalid)
11055 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
11056
11057 if (Constraint == "{za}") {
11058 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
11059 }
11060
11061 if (Constraint == "{zt0}") {
11062 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
11063 }
11064
11065 // Use the default implementation in TargetLowering to convert the register
11066 // constraint into a member of a register class.
11067 std::pair<unsigned, const TargetRegisterClass *> Res;
11068 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
11069
11070 // Not found as a standard register?
11071 if (!Res.second) {
11072 unsigned Size = Constraint.size();
11073 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
11074 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
11075 int RegNo;
11076 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
11077 if (!Failed && RegNo >= 0 && RegNo <= 31) {
11078 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
11079 // By default we'll emit v0-v31 for this unless there's a modifier where
11080 // we'll emit the correct register as well.
11081 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
11082 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
11083 Res.second = &AArch64::FPR64RegClass;
11084 } else {
11085 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
11086 Res.second = &AArch64::FPR128RegClass;
11087 }
11088 }
11089 }
11090 }
11091
11092 if (Res.second && !Subtarget->hasFPARMv8() &&
11093 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
11094 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
11095 return std::make_pair(0U, nullptr);
11096
11097 return Res;
11098}
11099
11101 llvm::Type *Ty,
11102 bool AllowUnknown) const {
11103 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
11104 return EVT(MVT::i64x8);
11105
11106 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
11107}
11108
11109/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
11110/// vector. If it is invalid, don't add anything to Ops.
11111void AArch64TargetLowering::LowerAsmOperandForConstraint(
11112 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
11113 SelectionDAG &DAG) const {
11114 SDValue Result;
11115
11116 // Currently only support length 1 constraints.
11117 if (Constraint.size() != 1)
11118 return;
11119
11120 char ConstraintLetter = Constraint[0];
11121 switch (ConstraintLetter) {
11122 default:
11123 break;
11124
11125 // This set of constraints deal with valid constants for various instructions.
11126 // Validate and return a target constant for them if we can.
11127 case 'z': {
11128 // 'z' maps to xzr or wzr so it needs an input of 0.
11129 if (!isNullConstant(Op))
11130 return;
11131
11132 if (Op.getValueType() == MVT::i64)
11133 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
11134 else
11135 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
11136 break;
11137 }
11138 case 'S':
11139 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
11140 // supported for PIC while "s" isn't, making "s" less useful. We implement
11141 // "S" but not "s".
11143 break;
11144
11145 case 'I':
11146 case 'J':
11147 case 'K':
11148 case 'L':
11149 case 'M':
11150 case 'N':
11151 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
11152 if (!C)
11153 return;
11154
11155 // Grab the value and do some validation.
11156 uint64_t CVal = C->getZExtValue();
11157 switch (ConstraintLetter) {
11158 // The I constraint applies only to simple ADD or SUB immediate operands:
11159 // i.e. 0 to 4095 with optional shift by 12
11160 // The J constraint applies only to ADD or SUB immediates that would be
11161 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
11162 // instruction [or vice versa], in other words -1 to -4095 with optional
11163 // left shift by 12.
11164 case 'I':
11165 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
11166 break;
11167 return;
11168 case 'J': {
11169 uint64_t NVal = -C->getSExtValue();
11170 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
11171 CVal = C->getSExtValue();
11172 break;
11173 }
11174 return;
11175 }
11176 // The K and L constraints apply *only* to logical immediates, including
11177 // what used to be the MOVI alias for ORR (though the MOVI alias has now
11178 // been removed and MOV should be used). So these constraints have to
11179 // distinguish between bit patterns that are valid 32-bit or 64-bit
11180 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
11181 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
11182 // versa.
11183 case 'K':
11184 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11185 break;
11186 return;
11187 case 'L':
11188 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11189 break;
11190 return;
11191 // The M and N constraints are a superset of K and L respectively, for use
11192 // with the MOV (immediate) alias. As well as the logical immediates they
11193 // also match 32 or 64-bit immediates that can be loaded either using a
11194 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
11195 // (M) or 64-bit 0x1234000000000000 (N) etc.
11196 // As a note some of this code is liberally stolen from the asm parser.
11197 case 'M': {
11198 if (!isUInt<32>(CVal))
11199 return;
11200 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11201 break;
11202 if ((CVal & 0xFFFF) == CVal)
11203 break;
11204 if ((CVal & 0xFFFF0000ULL) == CVal)
11205 break;
11206 uint64_t NCVal = ~(uint32_t)CVal;
11207 if ((NCVal & 0xFFFFULL) == NCVal)
11208 break;
11209 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11210 break;
11211 return;
11212 }
11213 case 'N': {
11214 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11215 break;
11216 if ((CVal & 0xFFFFULL) == CVal)
11217 break;
11218 if ((CVal & 0xFFFF0000ULL) == CVal)
11219 break;
11220 if ((CVal & 0xFFFF00000000ULL) == CVal)
11221 break;
11222 if ((CVal & 0xFFFF000000000000ULL) == CVal)
11223 break;
11224 uint64_t NCVal = ~CVal;
11225 if ((NCVal & 0xFFFFULL) == NCVal)
11226 break;
11227 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11228 break;
11229 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
11230 break;
11231 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
11232 break;
11233 return;
11234 }
11235 default:
11236 return;
11237 }
11238
11239 // All assembler immediates are 64-bit integers.
11240 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
11241 break;
11242 }
11243
11244 if (Result.getNode()) {
11245 Ops.push_back(Result);
11246 return;
11247 }
11248
11249 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
11250}
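
// Hypothetical examples (illustrative, not exhaustive) of constants accepted
// by the immediate constraints validated above:
//   'I' : 0..4095, optionally shifted left by 12   -> 4095, 0xfff000
//   'J' : negated ADD/SUB immediates               -> -1, -4095
//   'K' : 32-bit logical (bitmask) immediate       -> 0x0f0f0f0f, 0xaaaaaaaa
//   'L' : 64-bit logical (bitmask) immediate       -> 0x0f0f0f0f0f0f0f0f
//   'M' : single 32-bit MOVZ/MOVN-able immediate   -> 0x12340000, 0xffffedca
//   'N' : single 64-bit MOVZ/MOVN-able immediate   -> 0x1234000000000000
static inline long ExampleImmediateConstraints(long X) {
  asm("add %x0, %x0, %1" : "+r"(X) : "I"(4095));                   // 'I'
  asm("and %x0, %x0, %1" : "+r"(X) : "L"(0x0f0f0f0f0f0f0f0fULL));  // 'L'
  return X;
}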
11251
11252//===----------------------------------------------------------------------===//
11253// AArch64 Advanced SIMD Support
11254//===----------------------------------------------------------------------===//
11255
11256/// WidenVector - Given a value in the V64 register class, produce the
11257/// equivalent value in the V128 register class.
11259 EVT VT = V64Reg.getValueType();
11260 unsigned NarrowSize = VT.getVectorNumElements();
11261 MVT EltTy = VT.getVectorElementType().getSimpleVT();
11262 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
11263 SDLoc DL(V64Reg);
11264
11265 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
11266 V64Reg, DAG.getConstant(0, DL, MVT::i64));
11267}
11268
11269/// getExtFactor - Determine the adjustment factor for the position when
11270/// generating an "extract from vector registers" instruction.
11271static unsigned getExtFactor(SDValue &V) {
11272 EVT EltType = V.getValueType().getVectorElementType();
11273 return EltType.getSizeInBits() / 8;
11274}
11275
11276// Check if a vector is built from one vector via extracted elements of
11277// another together with an AND mask, ensuring that all elements fit
11278// within range. This can be reconstructed using AND and NEON's TBL1.
11280 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11281 SDLoc dl(Op);
11282 EVT VT = Op.getValueType();
11283 assert(!VT.isScalableVector() &&
11284 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11285
11286 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
11287 // directly to TBL1.
11288 if (VT != MVT::v16i8 && VT != MVT::v8i8)
11289 return SDValue();
11290
11291 unsigned NumElts = VT.getVectorNumElements();
11292 assert((NumElts == 8 || NumElts == 16) &&
11293 "Need to have exactly 8 or 16 elements in vector.");
11294
11295 SDValue SourceVec;
11296 SDValue MaskSourceVec;
11297 SmallVector<SDValue, 16> AndMaskConstants;
11298
11299 for (unsigned i = 0; i < NumElts; ++i) {
11300 SDValue V = Op.getOperand(i);
11301 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11302 return SDValue();
11303
11304 SDValue OperandSourceVec = V.getOperand(0);
11305 if (!SourceVec)
11306 SourceVec = OperandSourceVec;
11307 else if (SourceVec != OperandSourceVec)
11308 return SDValue();
11309
11310 // This only looks at shuffles with elements that are
11311 // a) truncated by a constant AND mask extracted from a mask vector, or
11312 // b) extracted directly from a mask vector.
11313 SDValue MaskSource = V.getOperand(1);
11314 if (MaskSource.getOpcode() == ISD::AND) {
11315 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
11316 return SDValue();
11317
11318 AndMaskConstants.push_back(MaskSource.getOperand(1));
11319 MaskSource = MaskSource->getOperand(0);
11320 } else if (!AndMaskConstants.empty()) {
11321 // Either all or no operands should have an AND mask.
11322 return SDValue();
11323 }
11324
11325 // An ANY_EXTEND may be inserted between the AND and the source vector
11326 // extraction. We don't care about that, so we can just skip it.
11327 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
11328 MaskSource = MaskSource.getOperand(0);
11329
11330 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11331 return SDValue();
11332
11333 SDValue MaskIdx = MaskSource.getOperand(1);
11334 if (!isa<ConstantSDNode>(MaskIdx) ||
11335 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
11336 return SDValue();
11337
11338 // We only apply this if all elements come from the same vector with the
11339 // same vector type.
11340 if (!MaskSourceVec) {
11341 MaskSourceVec = MaskSource->getOperand(0);
11342 if (MaskSourceVec.getValueType() != VT)
11343 return SDValue();
11344 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
11345 return SDValue();
11346 }
11347 }
11348
11349 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
11350 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
11351 // insert, we know that the index in the mask must be smaller than the number
11352 // of elements in the source, or we would have an out-of-bounds access.
11353 if (NumElts == 8)
11354 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
11355 DAG.getUNDEF(VT));
11356
11357 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
11358 if (!AndMaskConstants.empty())
11359 MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
11360 DAG.getBuildVector(VT, dl, AndMaskConstants));
11361
11362 return DAG.getNode(
11363 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11364 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
11365 MaskSourceVec);
11366}
11367
11368// Gather data to see if the operation can be modelled as a
11369// shuffle in combination with VEXTs.
11371 SelectionDAG &DAG) const {
11372 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11373 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
11374 SDLoc dl(Op);
11375 EVT VT = Op.getValueType();
11376 assert(!VT.isScalableVector() &&
11377 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11378 unsigned NumElts = VT.getVectorNumElements();
11379
11380 struct ShuffleSourceInfo {
11381 SDValue Vec;
11382 unsigned MinElt;
11383 unsigned MaxElt;
11384
11385 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
11386 // be compatible with the shuffle we intend to construct. As a result
11387 // ShuffleVec will be some sliding window into the original Vec.
11388 SDValue ShuffleVec;
11389
11390 // Code should guarantee that element i in Vec starts at element "WindowBase
11391 // + i * WindowScale in ShuffleVec".
11392 int WindowBase;
11393 int WindowScale;
11394
11395 ShuffleSourceInfo(SDValue Vec)
11396 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
11397 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
11398
11399 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
11400 };
11401
11402 // First gather all vectors used as an immediate source for this BUILD_VECTOR
11403 // node.
11404 SmallVector<ShuffleSourceInfo, 2> Sources;
11405 for (unsigned i = 0; i < NumElts; ++i) {
11406 SDValue V = Op.getOperand(i);
11407 if (V.isUndef())
11408 continue;
11409 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11410 !isa<ConstantSDNode>(V.getOperand(1)) ||
11411 V.getOperand(0).getValueType().isScalableVector()) {
11412 LLVM_DEBUG(
11413 dbgs() << "Reshuffle failed: "
11414 "a shuffle can only come from building a vector from "
11415 "various elements of other fixed-width vectors, provided "
11416 "their indices are constant\n");
11417 return SDValue();
11418 }
11419
11420 // Add this element source to the list if it's not already there.
11421 SDValue SourceVec = V.getOperand(0);
11422 auto Source = find(Sources, SourceVec);
11423 if (Source == Sources.end())
11424 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
11425
11426 // Update the minimum and maximum lane number seen.
11427 unsigned EltNo = V.getConstantOperandVal(1);
11428 Source->MinElt = std::min(Source->MinElt, EltNo);
11429 Source->MaxElt = std::max(Source->MaxElt, EltNo);
11430 }
11431
11432 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
11433 // better than moving to/from gpr registers for larger vectors.
11434 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
11435 // Construct a mask for the tbl. We may need to adjust the index for types
11436 // larger than i8.
11437 SmallVector<int, 16> Mask;
11438 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
11439 for (unsigned I = 0; I < NumElts; ++I) {
11440 SDValue V = Op.getOperand(I);
11441 if (V.isUndef()) {
11442 for (unsigned OF = 0; OF < OutputFactor; OF++)
11443 Mask.push_back(-1);
11444 continue;
11445 }
11446 // Set the Mask lanes adjusted for the size of the input and output
11447 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
11448 // output element, adjusted in their positions per input and output types.
11449 unsigned Lane = V.getConstantOperandVal(1);
11450 for (unsigned S = 0; S < Sources.size(); S++) {
11451 if (V.getOperand(0) == Sources[S].Vec) {
11452 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
11453 unsigned InputBase = 16 * S + Lane * InputSize / 8;
11454 for (unsigned OF = 0; OF < OutputFactor; OF++)
11455 Mask.push_back(InputBase + OF);
11456 break;
11457 }
11458 }
11459 }
11460
11461 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
11462 // v16i8, and the TBLMask
11463 SmallVector<SDValue, 16> TBLOperands;
11464 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
11465 ? Intrinsic::aarch64_neon_tbl3
11466 : Intrinsic::aarch64_neon_tbl4,
11467 dl, MVT::i32));
11468 for (unsigned i = 0; i < Sources.size(); i++) {
11469 SDValue Src = Sources[i].Vec;
11470 EVT SrcVT = Src.getValueType();
11471 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
11472 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
11473 "Expected a legally typed vector");
11474 if (SrcVT.is64BitVector())
11475 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
11476 DAG.getUNDEF(MVT::v8i8));
11477 TBLOperands.push_back(Src);
11478 }
11479
11480 SmallVector<SDValue, 16> TBLMask;
11481 for (unsigned i = 0; i < Mask.size(); i++)
11482 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
11483 assert((Mask.size() == 8 || Mask.size() == 16) &&
11484 "Expected a v8i8 or v16i8 Mask");
11485 TBLOperands.push_back(
11486 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
11487
11488 SDValue Shuffle =
11489 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
11490 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
11491 return DAG.getBitcast(VT, Shuffle);
11492 }
11493
11494 if (Sources.size() > 2) {
11495 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
11496 << "sensible when at most two source vectors are "
11497 << "involved\n");
11498 return SDValue();
11499 }
11500
11501 // Find out the smallest element size among result and two sources, and use
11502 // it as element size to build the shuffle_vector.
11503 EVT SmallestEltTy = VT.getVectorElementType();
11504 for (auto &Source : Sources) {
11505 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
11506 if (SrcEltTy.bitsLT(SmallestEltTy)) {
11507 SmallestEltTy = SrcEltTy;
11508 }
11509 }
11510 unsigned ResMultiplier =
11511 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11512 uint64_t VTSize = VT.getFixedSizeInBits();
11513 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
11514 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
11515
11516 // If the source vector is too wide or too narrow, we may nevertheless be able
11517 // to construct a compatible shuffle either by concatenating it with UNDEF or
11518 // extracting a suitable range of elements.
11519 for (auto &Src : Sources) {
11520 EVT SrcVT = Src.ShuffleVec.getValueType();
11521
11522 TypeSize SrcVTSize = SrcVT.getSizeInBits();
11523 if (SrcVTSize == TypeSize::getFixed(VTSize))
11524 continue;
11525
11526 // This stage of the search produces a source with the same element type as
11527 // the original, but with a total width matching the BUILD_VECTOR output.
11528 EVT EltVT = SrcVT.getVectorElementType();
11529 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
11530 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
11531
11532 if (SrcVTSize.getFixedValue() < VTSize) {
11533 assert(2 * SrcVTSize == VTSize);
11534 // We can pad out the smaller vector for free, so if it's part of a
11535 // shuffle...
11536 Src.ShuffleVec =
11537 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
11538 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
11539 continue;
11540 }
11541
11542 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
11543 LLVM_DEBUG(
11544 dbgs() << "Reshuffle failed: result vector too small to extract\n");
11545 return SDValue();
11546 }
11547
11548 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
11549 LLVM_DEBUG(
11550 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
11551 return SDValue();
11552 }
11553
11554 if (Src.MinElt >= NumSrcElts) {
11555 // The extraction can just take the second half
11556 Src.ShuffleVec =
11557 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11558 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11559 Src.WindowBase = -NumSrcElts;
11560 } else if (Src.MaxElt < NumSrcElts) {
11561 // The extraction can just take the first half
11562 Src.ShuffleVec =
11563 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11564 DAG.getConstant(0, dl, MVT::i64));
11565 } else {
11566 // An actual VEXT is needed
11567 SDValue VEXTSrc1 =
11568 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11569 DAG.getConstant(0, dl, MVT::i64));
11570 SDValue VEXTSrc2 =
11571 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11572 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11573 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
11574
11575 if (!SrcVT.is64BitVector()) {
11576 LLVM_DEBUG(
11577 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
11578 "for SVE vectors.");
11579 return SDValue();
11580 }
11581
11582 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
11583 VEXTSrc2,
11584 DAG.getConstant(Imm, dl, MVT::i32));
11585 Src.WindowBase = -Src.MinElt;
11586 }
11587 }
11588
11589 // Another possible incompatibility occurs from the vector element types. We
11590 // can fix this by bitcasting the source vectors to the same type we intend
11591 // for the shuffle.
11592 for (auto &Src : Sources) {
11593 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
11594 if (SrcEltTy == SmallestEltTy)
11595 continue;
11596 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
11597 if (DAG.getDataLayout().isBigEndian()) {
11598 Src.ShuffleVec =
11599 DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
11600 } else {
11601 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
11602 }
11603 Src.WindowScale =
11604 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11605 Src.WindowBase *= Src.WindowScale;
11606 }
11607
11608 // Final check before we try to actually produce a shuffle.
11609 LLVM_DEBUG(for (auto Src
11610 : Sources)
11611 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
11612
11613 // The stars all align, our next step is to produce the mask for the shuffle.
11614 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
11615 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
11616 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
11617 SDValue Entry = Op.getOperand(i);
11618 if (Entry.isUndef())
11619 continue;
11620
11621 auto Src = find(Sources, Entry.getOperand(0));
11622 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
11623
11624 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
11625 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
11626 // segment.
11627 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
11628 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
11629 VT.getScalarSizeInBits());
11630 int LanesDefined = BitsDefined / BitsPerShuffleLane;
11631
11632 // This source is expected to fill ResMultiplier lanes of the final shuffle,
11633 // starting at the appropriate offset.
11634 int *LaneMask = &Mask[i * ResMultiplier];
11635
11636 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
11637 ExtractBase += NumElts * (Src - Sources.begin());
11638 for (int j = 0; j < LanesDefined; ++j)
11639 LaneMask[j] = ExtractBase + j;
11640 }
11641
11642 // Final check before we try to produce nonsense...
11643 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
11644 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
11645 return SDValue();
11646 }
11647
11648 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
11649 for (unsigned i = 0; i < Sources.size(); ++i)
11650 ShuffleOps[i] = Sources[i].ShuffleVec;
11651
11652 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
11653 ShuffleOps[1], Mask);
11654 SDValue V;
11655 if (DAG.getDataLayout().isBigEndian()) {
11656 V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
11657 } else {
11658 V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
11659 }
11660
11661 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
11662 dbgs() << "Reshuffle, creating node: "; V.dump(););
11663
11664 return V;
11665}
11666
11667// check if an EXT instruction can handle the shuffle mask when the
11668// vector sources of the shuffle are the same.
11669static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
11670 unsigned NumElts = VT.getVectorNumElements();
11671
11672 // Assume that the first shuffle index is not UNDEF. Fail if it is.
11673 if (M[0] < 0)
11674 return false;
11675
11676 Imm = M[0];
11677
11678 // If this is a VEXT shuffle, the immediate value is the index of the first
11679 // element. The other shuffle indices must be the successive elements after
11680 // the first one.
11681 unsigned ExpectedElt = Imm;
11682 for (unsigned i = 1; i < NumElts; ++i) {
11683 // Increment the expected index. If it wraps around, just follow it
11684 // back to index zero and keep going.
11685 ++ExpectedElt;
11686 if (ExpectedElt == NumElts)
11687 ExpectedElt = 0;
11688
11689 if (M[i] < 0)
11690 continue; // ignore UNDEF indices
11691 if (ExpectedElt != static_cast<unsigned>(M[i]))
11692 return false;
11693 }
11694
11695 return true;
11696}
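
// Worked example (illustrative, not from the original source) for the check
// above: a v8i8 single-source shuffle with mask <3, 4, 5, 6, 7, 0, 1, 2>
// (possibly with some lanes undef, e.g. <3, -1, 5, 6, -1, 0, 1, 2>) is
// accepted with Imm = 3, i.e. "ext v0.8b, v1.8b, v1.8b, #3" rotates the
// vector left by three byte-sized elements.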
11697
11698// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
11699// v4i32s. This is really a truncate, which we can construct out of (legal)
11700// concats and truncate nodes.
11702 if (V.getValueType() != MVT::v16i8)
11703 return SDValue();
11704 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
11705
11706 for (unsigned X = 0; X < 4; X++) {
11707 // Check the first item in each group is an extract from lane 0 of a v4i32
11708 // or v4i16.
11709 SDValue BaseExt = V.getOperand(X * 4);
11710 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11711 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
11712 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
11713 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
11714 BaseExt.getConstantOperandVal(1) != 0)
11715 return SDValue();
11716 SDValue Base = BaseExt.getOperand(0);
11717 // And check the other items are extracts from the same vector.
11718 for (unsigned Y = 1; Y < 4; Y++) {
11719 SDValue Ext = V.getOperand(X * 4 + Y);
11720 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11721 Ext.getOperand(0) != Base ||
11722 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
11723 Ext.getConstantOperandVal(1) != Y)
11724 return SDValue();
11725 }
11726 }
11727
11728 // Turn the buildvector into a series of truncates and concats, which will
11729 // become uzip1's. Any v4i32s we found get truncated to v4i16, and pairs are
11730 // concatenated to produce 2 v8i16s. These are both truncated and
11731 // concatenated together.
11732 SDLoc DL(V);
11733 SDValue Trunc[4] = {
11734 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
11735 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
11736 for (SDValue &V : Trunc)
11737 if (V.getValueType() == MVT::v4i32)
11738 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
11739 SDValue Concat0 =
11740 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
11741 SDValue Concat1 =
11742 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
11743 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
11744 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
11745 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
11746}
11747
11748/// Check if a vector shuffle corresponds to a DUP instructions with a larger
11749/// element width than the vector lane type. If that is the case the function
11750/// returns true and writes the value of the DUP instruction lane operand into
11751/// DupLaneOp
11752static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
11753 unsigned &DupLaneOp) {
11754 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
11755 "Only possible block sizes for wide DUP are: 16, 32, 64");
11756
11757 if (BlockSize <= VT.getScalarSizeInBits())
11758 return false;
11759 if (BlockSize % VT.getScalarSizeInBits() != 0)
11760 return false;
11761 if (VT.getSizeInBits() % BlockSize != 0)
11762 return false;
11763
11764 size_t SingleVecNumElements = VT.getVectorNumElements();
11765 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
11766 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
11767
11768 // We are looking for masks like
11769 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
11770 // might be replaced by 'undefined'. BlockElts will eventually contain the
11771 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
11772 // for the above examples).
11773 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
11774 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
11775 for (size_t I = 0; I < NumEltsPerBlock; I++) {
11776 int Elt = M[BlockIndex * NumEltsPerBlock + I];
11777 if (Elt < 0)
11778 continue;
11779 // For now we don't support shuffles that use the second operand
11780 if ((unsigned)Elt >= SingleVecNumElements)
11781 return false;
11782 if (BlockElts[I] < 0)
11783 BlockElts[I] = Elt;
11784 else if (BlockElts[I] != Elt)
11785 return false;
11786 }
11787
11788 // We found a candidate block (possibly with some undefs). It must be a
11789 // sequence of consecutive integers starting with a value divisible by
11790 // NumEltsPerBlock with some values possibly replaced by undef-s.
11791
11792 // Find first non-undef element
11793 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
11794 assert(FirstRealEltIter != BlockElts.end() &&
11795 "Shuffle with all-undefs must have been caught by previous cases, "
11796 "e.g. isSplat()");
11797 if (FirstRealEltIter == BlockElts.end()) {
11798 DupLaneOp = 0;
11799 return true;
11800 }
11801
11802 // Index of FirstRealElt in BlockElts
11803 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
11804
11805 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
11806 return false;
11807 // BlockElts[0] must have the following value if it isn't undef:
11808 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
11809
11810 // Check the first element
11811 if (Elt0 % NumEltsPerBlock != 0)
11812 return false;
11813 // Check that the sequence indeed consists of consecutive integers (modulo
11814 // undefs)
11815 for (size_t I = 0; I < NumEltsPerBlock; I++)
11816 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
11817 return false;
11818
11819 DupLaneOp = Elt0 / NumEltsPerBlock;
11820 return true;
11821}
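
// Worked example (illustrative, not from the original source): for a v8i16
// shuffle with mask <2, 3, 2, 3, 2, 3, 2, 3> and BlockSize = 32, the
// duplicated 32-bit block is lane 1 of the source, so DupLaneOp = 1 and the
// shuffle can be emitted as a DUP of a single .s lane; undef lanes such as
// <2, -1, 2, 3, -1, 3, 2, 3> are still accepted.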
11822
11823// check if an EXT instruction can handle the shuffle mask when the
11824// vector sources of the shuffle are different.
11825static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
11826 unsigned &Imm) {
11827 // Look for the first non-undef element.
11828 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
11829
11830 // Benefit from APInt to handle overflow when calculating the expected element.
11831 unsigned NumElts = VT.getVectorNumElements();
11832 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
11833 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
11834 // The following shuffle indices must be the successive elements after the
11835 // first real element.
11836 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
11837 return Elt != ExpectedElt++ && Elt != -1;
11838 });
11839 if (FoundWrongElt)
11840 return false;
11841
11842 // The index of an EXT is the first element if it is not UNDEF.
11843 // Watch out for the beginning UNDEFs. The EXT index should be the expected
11844 // value of the first element. E.g.
11845 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
11846 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
11847 // ExpectedElt is the last mask index plus 1.
11848 Imm = ExpectedElt.getZExtValue();
11849
11850 // There are two different cases that require reversing the input vectors.
11851 // For example, for vector <4 x i32> we have the following cases,
11852 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
11853 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
11854 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
11855 // to reverse two input vectors.
11856 if (Imm < NumElts)
11857 ReverseEXT = true;
11858 else
11859 Imm -= NumElts;
11860
11861 return true;
11862}
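
// Worked example (illustrative, not from the original source): for two v4i32
// sources, mask <2, 3, 4, 5> gives ExpectedElt = 6 after the scan, so
// Imm = 6 - NumElts = 2 with ReverseEXT = false, and the shuffle lowers to
// EXT starting at element 2 of the first source (byte immediate 2 * 4 = 8).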
11863
11864/// isREVMask - Check if a vector shuffle corresponds to a REV
11865/// instruction with the specified blocksize. (The order of the elements
11866/// within each block of the vector is reversed.)
11867static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
11868 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64 ||
11869 BlockSize == 128) &&
11870 "Only possible block sizes for REV are: 16, 32, 64, 128");
11871
11872 unsigned EltSz = VT.getScalarSizeInBits();
11873 unsigned NumElts = VT.getVectorNumElements();
11874 unsigned BlockElts = M[0] + 1;
11875 // If the first shuffle index is UNDEF, be optimistic.
11876 if (M[0] < 0)
11877 BlockElts = BlockSize / EltSz;
11878
11879 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
11880 return false;
11881
11882 for (unsigned i = 0; i < NumElts; ++i) {
11883 if (M[i] < 0)
11884 continue; // ignore UNDEF indices
11885 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
11886 return false;
11887 }
11888
11889 return true;
11890}
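
// Worked example (illustrative, not from the original source): on v8i8 the
// mask <3, 2, 1, 0, 7, 6, 5, 4> reverses elements within each 32-bit block
// and is matched with BlockSize = 32 (REV32.8b), while <1, 0, 3, 2, 5, 4, 7, 6>
// is matched with BlockSize = 16 (REV16.8b).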
11891
11892static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11893 unsigned NumElts = VT.getVectorNumElements();
11894 if (NumElts % 2 != 0)
11895 return false;
11896 WhichResult = (M[0] == 0 ? 0 : 1);
11897 for (unsigned i = 0; i < NumElts; i += 2) {
11898 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11899 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
11900 return false;
11901 }
11902 return true;
11903}
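
// Worked example (illustrative, not from the original source): for two v4i32
// sources, mask <0, 4, 2, 6> matches with WhichResult = 0 (TRN1.4s) and mask
// <1, 5, 3, 7> matches with WhichResult = 1 (TRN2.4s).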
11904
11905/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
11906/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11907/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
11908static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11909 unsigned NumElts = VT.getVectorNumElements();
11910 if (NumElts % 2 != 0)
11911 return false;
11912 WhichResult = (M[0] == 0 ? 0 : 1);
11913 unsigned Idx = WhichResult * NumElts / 2;
11914 for (unsigned i = 0; i != NumElts; i += 2) {
11915 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
11916 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
11917 return false;
11918 Idx += 1;
11919 }
11920
11921 return true;
11922}
11923
11924/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
11925/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11926/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
11927static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11928 unsigned Half = VT.getVectorNumElements() / 2;
11929 WhichResult = (M[0] == 0 ? 0 : 1);
11930 for (unsigned j = 0; j != 2; ++j) {
11931 unsigned Idx = WhichResult;
11932 for (unsigned i = 0; i != Half; ++i) {
11933 int MIdx = M[i + j * Half];
11934 if (MIdx >= 0 && (unsigned)MIdx != Idx)
11935 return false;
11936 Idx += 2;
11937 }
11938 }
11939
11940 return true;
11941}
11942
11943/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
11944/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11945/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
11946static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11947 unsigned NumElts = VT.getVectorNumElements();
11948 if (NumElts % 2 != 0)
11949 return false;
11950 WhichResult = (M[0] == 0 ? 0 : 1);
11951 for (unsigned i = 0; i < NumElts; i += 2) {
11952 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11953 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
11954 return false;
11955 }
11956 return true;
11957}
11958
11959static bool isINSMask(ArrayRef<int> M, int NumInputElements,
11960 bool &DstIsLeft, int &Anomaly) {
11961 if (M.size() != static_cast<size_t>(NumInputElements))
11962 return false;
11963
11964 int NumLHSMatch = 0, NumRHSMatch = 0;
11965 int LastLHSMismatch = -1, LastRHSMismatch = -1;
11966
11967 for (int i = 0; i < NumInputElements; ++i) {
11968 if (M[i] == -1) {
11969 ++NumLHSMatch;
11970 ++NumRHSMatch;
11971 continue;
11972 }
11973
11974 if (M[i] == i)
11975 ++NumLHSMatch;
11976 else
11977 LastLHSMismatch = i;
11978
11979 if (M[i] == i + NumInputElements)
11980 ++NumRHSMatch;
11981 else
11982 LastRHSMismatch = i;
11983 }
11984
11985 if (NumLHSMatch == NumInputElements - 1) {
11986 DstIsLeft = true;
11987 Anomaly = LastLHSMismatch;
11988 return true;
11989 } else if (NumRHSMatch == NumInputElements - 1) {
11990 DstIsLeft = false;
11991 Anomaly = LastRHSMismatch;
11992 return true;
11993 }
11994
11995 return false;
11996}
11997
11998static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
11999 if (VT.getSizeInBits() != 128)
12000 return false;
12001
12002 unsigned NumElts = VT.getVectorNumElements();
12003
12004 for (int I = 0, E = NumElts / 2; I != E; I++) {
12005 if (Mask[I] != I)
12006 return false;
12007 }
12008
12009 int Offset = NumElts / 2;
12010 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
12011 if (Mask[I] != I + SplitLHS * Offset)
12012 return false;
12013 }
12014
12015 return true;
12016}
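
// Worked example (illustrative, not from the original source): with two
// 128-bit v4i32 inputs and SplitLHS = true, mask <0, 1, 4, 5> selects the low
// half of each input, so the caller below can rebuild the shuffle as two
// EXTRACT_SUBVECTORs followed by a single CONCAT_VECTORS.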
12017
12018 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
12019 SDLoc DL(Op);
12020 EVT VT = Op.getValueType();
12021 SDValue V0 = Op.getOperand(0);
12022 SDValue V1 = Op.getOperand(1);
12023 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12024
12025 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
12026 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
12027 return SDValue();
12028
12029 bool SplitV0 = V0.getValueSizeInBits() == 128;
12030
12031 if (!isConcatMask(Mask, VT, SplitV0))
12032 return SDValue();
12033
12034 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
12035 if (SplitV0) {
12036 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
12037 DAG.getConstant(0, DL, MVT::i64));
12038 }
12039 if (V1.getValueSizeInBits() == 128) {
12040 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
12041 DAG.getConstant(0, DL, MVT::i64));
12042 }
12043 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
12044}
12045
12046/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
12047/// the specified operations to build the shuffle. ID is the perfect-shuffle
12048 /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect-shuffle
12049 /// table entry and LHS/RHS are the immediate inputs for this stage of the
12050 /// shuffle.
12051 static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
12052 SDValue V2, unsigned PFEntry, SDValue LHS,
12053 SDValue RHS, SelectionDAG &DAG,
12054 const SDLoc &dl) {
12055 unsigned OpNum = (PFEntry >> 26) & 0x0F;
12056 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
12057 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
12058
12059 enum {
12060 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
12061 OP_VREV,
12062 OP_VDUP0,
12063 OP_VDUP1,
12064 OP_VDUP2,
12065 OP_VDUP3,
12066 OP_VEXT1,
12067 OP_VEXT2,
12068 OP_VEXT3,
12069 OP_VUZPL, // VUZP, left result
12070 OP_VUZPR, // VUZP, right result
12071 OP_VZIPL, // VZIP, left result
12072 OP_VZIPR, // VZIP, right result
12073 OP_VTRNL, // VTRN, left result
12074 OP_VTRNR, // VTRN, right result
12075 OP_MOVLANE // Move lane. RHSID is the lane to move into
12076 };
12077
12078 if (OpNum == OP_COPY) {
12079 if (LHSID == (1 * 9 + 2) * 9 + 3)
12080 return LHS;
12081 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
12082 return RHS;
12083 }
12084
12085 if (OpNum == OP_MOVLANE) {
12086 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
12087 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
12088 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
12089 Elt = 3 - Elt;
12090 while (Elt > 0) {
12091 ID /= 9;
12092 Elt--;
12093 }
12094 return (ID % 9 == 8) ? -1 : ID % 9;
12095 };
12096
12097 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
12098 // get the lane to move from the PFID, which is always from the
12099 // original vectors (V1 or V2).
12100 SDValue OpLHS = GeneratePerfectShuffle(
12101 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
12102 EVT VT = OpLHS.getValueType();
12103 assert(RHSID < 8 && "Expected a lane index for RHSID!");
12104 unsigned ExtLane = 0;
12105 SDValue Input;
12106
12107 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
12108 // convert into a higher type.
12109 if (RHSID & 0x4) {
12110 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
12111 if (MaskElt == -1)
12112 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
12113 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12114 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
12115 Input = MaskElt < 2 ? V1 : V2;
12116 if (VT.getScalarSizeInBits() == 16) {
12117 Input = DAG.getBitcast(MVT::v2f32, Input);
12118 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
12119 } else {
12120 assert(VT.getScalarSizeInBits() == 32 &&
12121 "Expected 16 or 32 bit shuffle elements");
12122 Input = DAG.getBitcast(MVT::v2f64, Input);
12123 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
12124 }
12125 } else {
12126 int MaskElt = getPFIDLane(ID, RHSID);
12127 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12128 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
12129 Input = MaskElt < 4 ? V1 : V2;
12130 // Be careful about creating illegal types. Use f16 instead of i16.
12131 if (VT == MVT::v4i16) {
12132 Input = DAG.getBitcast(MVT::v4f16, Input);
12133 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
12134 }
12135 }
12136 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12137 Input.getValueType().getVectorElementType(),
12138 Input, DAG.getVectorIdxConstant(ExtLane, dl));
12139 SDValue Ins =
12140 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
12141 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
12142 return DAG.getBitcast(VT, Ins);
12143 }
12144
12145 SDValue OpLHS, OpRHS;
12146 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
12147 RHS, DAG, dl);
12148 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
12149 RHS, DAG, dl);
12150 EVT VT = OpLHS.getValueType();
12151
12152 switch (OpNum) {
12153 default:
12154 llvm_unreachable("Unknown shuffle opcode!");
12155 case OP_VREV:
12156 // VREV divides the vector in half and swaps within the half.
12157 if (VT.getVectorElementType() == MVT::i32 ||
12158 VT.getVectorElementType() == MVT::f32)
12159 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
12160 // vrev <4 x i16> -> REV32
12161 if (VT.getVectorElementType() == MVT::i16 ||
12162 VT.getVectorElementType() == MVT::f16 ||
12163 VT.getVectorElementType() == MVT::bf16)
12164 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
12165 // vrev <4 x i8> -> REV16
12166 assert(VT.getVectorElementType() == MVT::i8);
12167 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
12168 case OP_VDUP0:
12169 case OP_VDUP1:
12170 case OP_VDUP2:
12171 case OP_VDUP3: {
12172 EVT EltTy = VT.getVectorElementType();
12173 unsigned Opcode;
12174 if (EltTy == MVT::i8)
12175 Opcode = AArch64ISD::DUPLANE8;
12176 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
12177 Opcode = AArch64ISD::DUPLANE16;
12178 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
12179 Opcode = AArch64ISD::DUPLANE32;
12180 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
12181 Opcode = AArch64ISD::DUPLANE64;
12182 else
12183 llvm_unreachable("Invalid vector element type?");
12184
12185 if (VT.getSizeInBits() == 64)
12186 OpLHS = WidenVector(OpLHS, DAG);
12187 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
12188 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
12189 }
12190 case OP_VEXT1:
12191 case OP_VEXT2:
12192 case OP_VEXT3: {
12193 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
12194 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
12195 DAG.getConstant(Imm, dl, MVT::i32));
12196 }
12197 case OP_VUZPL:
12198 return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
12199 case OP_VUZPR:
12200 return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
12201 case OP_VZIPL:
12202 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
12203 case OP_VZIPR:
12204 return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
12205 case OP_VTRNL:
12206 return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
12207 case OP_VTRNR:
12208 return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
12209 }
12210}
12211
12212static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
12213 SelectionDAG &DAG) {
12214 // Check to see if we can use the TBL instruction.
12215 SDValue V1 = Op.getOperand(0);
12216 SDValue V2 = Op.getOperand(1);
12217 SDLoc DL(Op);
12218
12219 EVT EltVT = Op.getValueType().getVectorElementType();
12220 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
12221
12222 bool Swap = false;
12223 if (V1.isUndef() || isZerosVector(V1.getNode())) {
12224 std::swap(V1, V2);
12225 Swap = true;
12226 }
12227
12228 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
12229 // out of range values with 0s. We do need to make sure that any out-of-range
12230 // values are really out-of-range for a v16i8 vector.
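  // For example, a v8i8 shuffle with mask <0, 2, 4, 6, -1, -1, -1, -1> and an
  // undef V2 becomes a single TBL1 whose byte indices are
  // {0, 2, 4, 6, 255, 255, 255, 255}; the 255 entries are out of range for the
  // 16-byte table and therefore read back as zero.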
12231 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
12232 MVT IndexVT = MVT::v8i8;
12233 unsigned IndexLen = 8;
12234 if (Op.getValueSizeInBits() == 128) {
12235 IndexVT = MVT::v16i8;
12236 IndexLen = 16;
12237 }
12238
12239 SmallVector<SDValue, 8> TBLMask;
12240 for (int Val : ShuffleMask) {
12241 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
12242 unsigned Offset = Byte + Val * BytesPerElt;
12243 if (Swap)
12244 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
12245 if (IsUndefOrZero && Offset >= IndexLen)
12246 Offset = 255;
12247 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
12248 }
12249 }
12250
12251 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
12252 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
12253
12254 SDValue Shuffle;
12255 if (IsUndefOrZero) {
12256 if (IndexLen == 8)
12257 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
12258 Shuffle = DAG.getNode(
12259 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12260 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12261 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12262 } else {
12263 if (IndexLen == 8) {
12264 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
12265 Shuffle = DAG.getNode(
12266 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12267 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12268 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12269 } else {
12270 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
12271 // cannot currently represent the register constraints on the input
12272 // table registers.
12273 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
12274 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
12275 // IndexLen));
12276 Shuffle = DAG.getNode(
12277 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12278 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
12279 V2Cst,
12280 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12281 }
12282 }
12283 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
12284}
12285
12286static unsigned getDUPLANEOp(EVT EltType) {
12287 if (EltType == MVT::i8)
12288 return AArch64ISD::DUPLANE8;
12289 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
12290 return AArch64ISD::DUPLANE16;
12291 if (EltType == MVT::i32 || EltType == MVT::f32)
12292 return AArch64ISD::DUPLANE32;
12293 if (EltType == MVT::i64 || EltType == MVT::f64)
12294 return AArch64ISD::DUPLANE64;
12295
12296 llvm_unreachable("Invalid vector element type?");
12297}
12298
12299static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
12300 unsigned Opcode, SelectionDAG &DAG) {
12301 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
12302 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
12303 // Match: dup (bitcast (extract_subv X, C)), LaneC
12304 if (BitCast.getOpcode() != ISD::BITCAST ||
12305 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
12306 return false;
12307
12308 // The extract index must align in the destination type. That may not
12309 // happen if the bitcast is from narrow to wide type.
12310 SDValue Extract = BitCast.getOperand(0);
12311 unsigned ExtIdx = Extract.getConstantOperandVal(1);
12312 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
12313 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
12314 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
12315 if (ExtIdxInBits % CastedEltBitWidth != 0)
12316 return false;
12317
12318 // Can't handle cases where vector size is not 128-bit
12319 if (!Extract.getOperand(0).getValueType().is128BitVector())
12320 return false;
12321
12322 // Update the lane value by offsetting with the scaled extract index.
12323 LaneC += ExtIdxInBits / CastedEltBitWidth;
12324
12325 // Determine the casted vector type of the wide vector input.
12326 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
12327 // Examples:
12328 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
12329 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
12330 unsigned SrcVecNumElts =
12331 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
12332 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
12333 SrcVecNumElts);
12334 return true;
12335 };
12336 MVT CastVT;
12337 if (getScaledOffsetDup(V, Lane, CastVT)) {
12338 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
12339 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12340 V.getOperand(0).getValueType().is128BitVector()) {
12341 // The lane is incremented by the index of the extract.
12342 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
12343 Lane += V.getConstantOperandVal(1);
12344 V = V.getOperand(0);
12345 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
12346 // The lane is decremented if we are splatting from the 2nd operand.
12347 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
12348 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
12349 Lane -= Idx * VT.getVectorNumElements() / 2;
12350 V = WidenVector(V.getOperand(Idx), DAG);
12351 } else if (VT.getSizeInBits() == 64) {
12352 // Widen the operand to 128-bit register with undef.
12353 V = WidenVector(V, DAG);
12354 }
12355 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
12356}
12357
12358// Return true if we can derive a new, half-length shuffle mask from the
12359// parameter mask: every two adjacent mask values must be consecutive, and
12360// each pair must start at an even value.
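// For example, on a v8i16 shuffle the mask <2, 3, 0, 1, 6, 7, -1, -1> pairs up
// into the v4i32 mask <1, 0, 3, -1>.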
12361static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
12362 SmallVectorImpl<int> &NewMask) {
12363 unsigned NumElts = VT.getVectorNumElements();
12364 if (NumElts % 2 != 0)
12365 return false;
12366
12367 NewMask.clear();
12368 for (unsigned i = 0; i < NumElts; i += 2) {
12369 int M0 = M[i];
12370 int M1 = M[i + 1];
12371
12372 // If both elements are undef, new mask is undef too.
12373 if (M0 == -1 && M1 == -1) {
12374 NewMask.push_back(-1);
12375 continue;
12376 }
12377
12378 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
12379 NewMask.push_back(M1 / 2);
12380 continue;
12381 }
12382
12383 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
12384 NewMask.push_back(M0 / 2);
12385 continue;
12386 }
12387
12388 NewMask.clear();
12389 return false;
12390 }
12391
12392 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
12393 return true;
12394}
12395
12396// Try to widen element type to get a new mask value for a better permutation
12397// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
12398// UZP1/2, TRN1/2, REV, INS, etc.
12399// For example:
12400// shufflevector <4 x i32> %a, <4 x i32> %b,
12401// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
12402// is equivalent to:
12403// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
12404// Finally, we can get:
12405// mov v0.d[0], v1.d[1]
12406static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
12407 SDLoc DL(Op);
12408 EVT VT = Op.getValueType();
12409 EVT ScalarVT = VT.getVectorElementType();
12410 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
12411 SDValue V0 = Op.getOperand(0);
12412 SDValue V1 = Op.getOperand(1);
12413 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12414
12415 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
12416 // We need to make sure the wider element type is legal. Thus, ElementSize
12417 // should be not larger than 32 bits, and i1 type should also be excluded.
12418 if (ElementSize > 32 || ElementSize == 1)
12419 return SDValue();
12420
12421 SmallVector<int, 8> NewMask;
12422 if (isWideTypeMask(Mask, VT, NewMask)) {
12423 MVT NewEltVT = VT.isFloatingPoint()
12424 ? MVT::getFloatingPointVT(ElementSize * 2)
12425 : MVT::getIntegerVT(ElementSize * 2);
12426 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
12427 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
12428 V0 = DAG.getBitcast(NewVT, V0);
12429 V1 = DAG.getBitcast(NewVT, V1);
12430 return DAG.getBitcast(VT,
12431 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
12432 }
12433 }
12434
12435 return SDValue();
12436}
12437
12438// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
12439static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
12440 ArrayRef<int> ShuffleMask,
12441 SelectionDAG &DAG) {
12442 SDValue Tbl1 = Op->getOperand(0);
12443 SDValue Tbl2 = Op->getOperand(1);
12444 SDLoc dl(Op);
12445 SDValue Tbl2ID =
12446 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
12447
12448 EVT VT = Op.getValueType();
12449 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12450 Tbl1->getOperand(0) != Tbl2ID ||
12451 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12452 Tbl2->getOperand(0) != Tbl2ID)
12453 return SDValue();
12454
12455 if (Tbl1->getValueType(0) != MVT::v16i8 ||
12456 Tbl2->getValueType(0) != MVT::v16i8)
12457 return SDValue();
12458
12459 SDValue Mask1 = Tbl1->getOperand(3);
12460 SDValue Mask2 = Tbl2->getOperand(3);
12461 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
12462 for (unsigned I = 0; I < 16; I++) {
12463 if (ShuffleMask[I] < 16)
12464 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
12465 else {
12466 auto *C =
12467 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
12468 if (!C)
12469 return SDValue();
12470 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
12471 }
12472 }
12473
12474 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
12475 SDValue ID =
12476 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
12477
12478 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
12479 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
12480 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
12481}
12482
12483// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
12484// but we don't have an appropriate instruction,
12485// so custom-lower it as ZIP1-with-zeros.
12486SDValue
12487AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
12488 SelectionDAG &DAG) const {
12489 SDLoc dl(Op);
12490 EVT VT = Op.getValueType();
12491 SDValue SrcOp = Op.getOperand(0);
12492 EVT SrcVT = SrcOp.getValueType();
12493 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
12494 "Unexpected extension factor.");
12495 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
12496 // FIXME: support multi-step zipping?
12497 if (Scale != 2)
12498 return SDValue();
12499 SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
12500 return DAG.getBitcast(VT,
12501 DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
12502}
12503
12504SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
12505 SelectionDAG &DAG) const {
12506 SDLoc dl(Op);
12507 EVT VT = Op.getValueType();
12508
12509 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
12510
12511 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12512 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
12513
12514 // Convert shuffles that are directly supported on NEON to target-specific
12515 // DAG nodes, instead of keeping them as shuffles and matching them again
12516 // during code selection. This is more efficient and avoids the possibility
12517 // of inconsistencies between legalization and selection.
12518 ArrayRef<int> ShuffleMask = SVN->getMask();
12519
12520 SDValue V1 = Op.getOperand(0);
12521 SDValue V2 = Op.getOperand(1);
12522
12523 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
12524 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
12525 "Unexpected VECTOR_SHUFFLE mask size!");
12526
12527 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
12528 return Res;
12529
12530 if (SVN->isSplat()) {
12531 int Lane = SVN->getSplatIndex();
12532 // If this is undef splat, generate it via "just" vdup, if possible.
12533 if (Lane == -1)
12534 Lane = 0;
12535
12536 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
12537 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
12538 V1.getOperand(0));
12539 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
12540 // constant. If so, we can just reference the lane's definition directly.
12541 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
12542 !isa<ConstantSDNode>(V1.getOperand(Lane)))
12543 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
12544
12545 // Otherwise, duplicate from the lane of the input vector.
12546 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
12547 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
12548 }
12549
12550 // Check if the mask matches a DUP for a wider element
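  // For example, the v8i16 mask <2, 3, 2, 3, 2, 3, 2, 3> is a splat of 32-bit
  // lane 1, so it is lowered as DUPLANE32 on a v4i32 bitcast of the source.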
12551 for (unsigned LaneSize : {64U, 32U, 16U}) {
12552 unsigned Lane = 0;
12553 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
12554 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
12555 : LaneSize == 32 ? AArch64ISD::DUPLANE32
12556 : AArch64ISD::DUPLANE16;
12557 // Cast V1 to an integer vector with required lane size
12558 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
12559 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
12560 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
12561 V1 = DAG.getBitcast(NewVecTy, V1);
12562 // Construct the DUP instruction
12563 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
12564 // Cast back to the original type
12565 return DAG.getBitcast(VT, V1);
12566 }
12567 }
12568
12569 if (isREVMask(ShuffleMask, VT, 64))
12570 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
12571 if (isREVMask(ShuffleMask, VT, 32))
12572 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
12573 if (isREVMask(ShuffleMask, VT, 16))
12574 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
12575
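  // A full reverse of a 128-bit vector with 16-bit or 8-bit elements is REV64
  // (reverse within each 64-bit half) followed by EXT #8 (swap the two halves).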
12576 if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
12577 (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
12578 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
12579 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
12580 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
12581 DAG.getConstant(8, dl, MVT::i32));
12582 }
12583
12584 bool ReverseEXT = false;
12585 unsigned Imm;
12586 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
12587 if (ReverseEXT)
12588 std::swap(V1, V2);
12589 Imm *= getExtFactor(V1);
12590 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
12591 DAG.getConstant(Imm, dl, MVT::i32));
12592 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
12593 Imm *= getExtFactor(V1);
12594 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
12595 DAG.getConstant(Imm, dl, MVT::i32));
12596 }
12597
12598 unsigned WhichResult;
12599 if (isZIPMask(ShuffleMask, VT, WhichResult)) {
12600 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12601 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12602 }
12603 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
12604 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12605 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12606 }
12607 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
12608 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12609 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12610 }
12611
12612 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12613 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12614 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12615 }
12616 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12617 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12618 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12619 }
12620 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12621 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12622 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12623 }
12624
12625 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
12626 return Concat;
12627
12628 bool DstIsLeft;
12629 int Anomaly;
12630 int NumInputElements = V1.getValueType().getVectorNumElements();
12631 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
12632 SDValue DstVec = DstIsLeft ? V1 : V2;
12633 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
12634
12635 SDValue SrcVec = V1;
12636 int SrcLane = ShuffleMask[Anomaly];
12637 if (SrcLane >= NumInputElements) {
12638 SrcVec = V2;
12639 SrcLane -= VT.getVectorNumElements();
12640 }
12641 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
12642
12643 EVT ScalarVT = VT.getVectorElementType();
12644
12645 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
12646 ScalarVT = MVT::i32;
12647
12648 return DAG.getNode(
12649 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
12650 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
12651 DstLaneV);
12652 }
12653
12654 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
12655 return NewSD;
12656
12657 // If the shuffle is not directly supported and it has 4 elements, use
12658 // the PerfectShuffle-generated table to synthesize it from other shuffles.
12659 unsigned NumElts = VT.getVectorNumElements();
12660 if (NumElts == 4) {
12661 unsigned PFIndexes[4];
12662 for (unsigned i = 0; i != 4; ++i) {
12663 if (ShuffleMask[i] < 0)
12664 PFIndexes[i] = 8;
12665 else
12666 PFIndexes[i] = ShuffleMask[i];
12667 }
12668
12669 // Compute the index in the perfect shuffle table.
12670 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
12671 PFIndexes[2] * 9 + PFIndexes[3];
12672 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
12673 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
12674 dl);
12675 }
12676
12677 return GenerateTBL(Op, ShuffleMask, DAG);
12678}
12679
12680SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
12681 SelectionDAG &DAG) const {
12682 EVT VT = Op.getValueType();
12683
12684 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12685 return LowerToScalableOp(Op, DAG);
12686
12687 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
12688 "Unexpected vector type!");
12689
12690 // We can handle the constant cases during isel.
12691 if (isa<ConstantSDNode>(Op.getOperand(0)))
12692 return Op;
12693
12694 // There isn't a natural way to handle the general i1 case, so we use some
12695 // trickery with whilelo.
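  // The splat value is sign-extended from i1, so a true input becomes ~0 and
  // whilelo(0, ~0) activates every predicate lane, while a false input yields
  // whilelo(0, 0), which activates none.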
12696 SDLoc DL(Op);
12697 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
12698 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
12699 DAG.getValueType(MVT::i1));
12700 SDValue ID =
12701 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
12702 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12703 if (VT == MVT::nxv1i1)
12704 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
12705 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
12706 Zero, SplatVal),
12707 Zero);
12708 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
12709}
12710
12711SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
12712 SelectionDAG &DAG) const {
12713 SDLoc DL(Op);
12714
12715 EVT VT = Op.getValueType();
12716 if (!isTypeLegal(VT) || !VT.isScalableVector())
12717 return SDValue();
12718
12719 // Current lowering only supports the SVE-ACLE types.
12720 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
12721 return SDValue();
12722
12723 // The DUPQ operation is independent of element type so normalise to i64s.
12724 SDValue Idx128 = Op.getOperand(2);
12725
12726 // DUPQ can be used when idx is in range.
12727 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
12728 if (CIdx && (CIdx->getZExtValue() <= 3)) {
12729 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
12730 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
12731 }
12732
12733 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
12734
12735 // The ACLE says this must produce the same result as:
12736 // svtbl(data, svadd_x(svptrue_b64(),
12737 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
12738 // index * 2))
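  // For example, with index 1 the mask built below is <2, 3, 2, 3, ...>, i.e.
  // the two i64 elements of the second 128-bit quadword repeated across the
  // whole vector.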
12739 SDValue One = DAG.getConstant(1, DL, MVT::i64);
12740 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
12741
12742 // create the vector 0,1,0,1,...
12743 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
12744 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
12745
12746 // create the vector idx64,idx64+1,idx64,idx64+1,...
12747 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
12748 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
12749 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
12750
12751 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
12752 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
12753 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
12754}
12755
12756
12757static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
12758 APInt &UndefBits) {
12759 EVT VT = BVN->getValueType(0);
12760 APInt SplatBits, SplatUndef;
12761 unsigned SplatBitSize;
12762 bool HasAnyUndefs;
12763 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
12764 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
12765
12766 for (unsigned i = 0; i < NumSplats; ++i) {
12767 CnstBits <<= SplatBitSize;
12768 UndefBits <<= SplatBitSize;
12769 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
12770 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
12771 }
12772
12773 return true;
12774 }
12775
12776 return false;
12777}
12778
12779// Try 64-bit splatted SIMD immediate.
12780static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12781 const APInt &Bits) {
12782 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12783 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12784 EVT VT = Op.getValueType();
12785 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
12786
12787 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
12788 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
12789
12790 SDLoc dl(Op);
12791 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12792 DAG.getConstant(Value, dl, MVT::i32));
12793 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12794 }
12795 }
12796
12797 return SDValue();
12798}
12799
12800// Try 32-bit splatted SIMD immediate.
12801static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12802 const APInt &Bits,
12803 const SDValue *LHS = nullptr) {
12804 EVT VT = Op.getValueType();
12805 if (VT.isFixedLengthVector() &&
12806 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12807 return SDValue();
12808
12809 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12810 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12811 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12812 bool isAdvSIMDModImm = false;
12813 uint64_t Shift;
12814
12815 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
12816 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
12817 Shift = 0;
12818 }
12819 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
12820 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
12821 Shift = 8;
12822 }
12823 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
12824 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
12825 Shift = 16;
12826 }
12827 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
12828 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
12829 Shift = 24;
12830 }
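    // Types 1-4 place an 8-bit immediate in byte 0, 1, 2 or 3 of each 32-bit
    // element, i.e. MOVI (shifted immediate) with LSL #0, #8, #16 or #24.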
12831
12832 if (isAdvSIMDModImm) {
12833 SDLoc dl(Op);
12834 SDValue Mov;
12835
12836 if (LHS)
12837 Mov = DAG.getNode(NewOp, dl, MovTy,
12838 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12839 DAG.getConstant(Value, dl, MVT::i32),
12840 DAG.getConstant(Shift, dl, MVT::i32));
12841 else
12842 Mov = DAG.getNode(NewOp, dl, MovTy,
12843 DAG.getConstant(Value, dl, MVT::i32),
12844 DAG.getConstant(Shift, dl, MVT::i32));
12845
12846 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12847 }
12848 }
12849
12850 return SDValue();
12851}
12852
12853// Try 16-bit splatted SIMD immediate.
12854static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12855 const APInt &Bits,
12856 const SDValue *LHS = nullptr) {
12857 EVT VT = Op.getValueType();
12858 if (VT.isFixedLengthVector() &&
12859 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12860 return SDValue();
12861
12862 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12863 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12864 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
12865 bool isAdvSIMDModImm = false;
12866 uint64_t Shift;
12867
12868 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
12869 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
12870 Shift = 0;
12871 }
12872 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
12873 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
12874 Shift = 8;
12875 }
12876
12877 if (isAdvSIMDModImm) {
12878 SDLoc dl(Op);
12879 SDValue Mov;
12880
12881 if (LHS)
12882 Mov = DAG.getNode(NewOp, dl, MovTy,
12883 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12884 DAG.getConstant(Value, dl, MVT::i32),
12885 DAG.getConstant(Shift, dl, MVT::i32));
12886 else
12887 Mov = DAG.getNode(NewOp, dl, MovTy,
12888 DAG.getConstant(Value, dl, MVT::i32),
12889 DAG.getConstant(Shift, dl, MVT::i32));
12890
12891 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12892 }
12893 }
12894
12895 return SDValue();
12896}
12897
12898// Try 32-bit splatted SIMD immediate with shifted ones.
12899static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
12900 SelectionDAG &DAG, const APInt &Bits) {
12901 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12902 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12903 EVT VT = Op.getValueType();
12904 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12905 bool isAdvSIMDModImm = false;
12906 uint64_t Shift;
12907
12908 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
12909 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
12910 Shift = 264;
12911 }
12912 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
12913 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
12914 Shift = 272;
12915 }
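    // The Shift values 264 and 272 are 0x100 | 8 and 0x100 | 16, selecting the
    // "shifted ones" forms MSL #8 and MSL #16 rather than an LSL amount.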
12916
12917 if (isAdvSIMDModImm) {
12918 SDLoc dl(Op);
12919 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12920 DAG.getConstant(Value, dl, MVT::i32),
12921 DAG.getConstant(Shift, dl, MVT::i32));
12922 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12923 }
12924 }
12925
12926 return SDValue();
12927}
12928
12929// Try 8-bit splatted SIMD immediate.
12930static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12931 const APInt &Bits) {
12932 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12933 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12934 EVT VT = Op.getValueType();
12935 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
12936
12937 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
12938 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
12939
12940 SDLoc dl(Op);
12941 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12942 DAG.getConstant(Value, dl, MVT::i32));
12943 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12944 }
12945 }
12946
12947 return SDValue();
12948}
12949
12950// Try FP splatted SIMD immediate.
12951static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12952 const APInt &Bits) {
12953 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12954 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12955 EVT VT = Op.getValueType();
12956 bool isWide = (VT.getSizeInBits() == 128);
12957 MVT MovTy;
12958 bool isAdvSIMDModImm = false;
12959
12960 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
12961 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
12962 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
12963 }
12964 else if (isWide &&
12965 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
12966 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
12967 MovTy = MVT::v2f64;
12968 }
12969
12970 if (isAdvSIMDModImm) {
12971 SDLoc dl(Op);
12972 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12973 DAG.getConstant(Value, dl, MVT::i32));
12974 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12975 }
12976 }
12977
12978 return SDValue();
12979}
12980
12981// Specialized code to quickly find if PotentialBVec is a BuildVector that
12982// consists of only the same constant int value, returned in reference arg
12983// ConstVal
12984static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
12985 uint64_t &ConstVal) {
12986 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
12987 if (!Bvec)
12988 return false;
12989 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
12990 if (!FirstElt)
12991 return false;
12992 EVT VT = Bvec->getValueType(0);
12993 unsigned NumElts = VT.getVectorNumElements();
12994 for (unsigned i = 1; i < NumElts; ++i)
12995 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
12996 return false;
12997 ConstVal = FirstElt->getZExtValue();
12998 return true;
12999}
13000
13001static bool isAllInactivePredicate(SDValue N) {
13002 // Look through cast.
13003 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
13004 N = N.getOperand(0);
13005
13006 return ISD::isConstantSplatVectorAllZeros(N.getNode());
13007}
13008
13009static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
13010 unsigned NumElts = N.getValueType().getVectorMinNumElements();
13011
13012 // Look through cast.
13013 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
13014 N = N.getOperand(0);
13015 // When reinterpreting from a type with fewer elements the "new" elements
13016 // are not active, so bail if they're likely to be used.
13017 if (N.getValueType().getVectorMinNumElements() < NumElts)
13018 return false;
13019 }
13020
13021 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
13022 return true;
13023
13024 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
13025 // or smaller than the implicit element type represented by N.
13026 // NOTE: A larger element count implies a smaller element type.
13027 if (N.getOpcode() == AArch64ISD::PTRUE &&
13028 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
13029 return N.getValueType().getVectorMinNumElements() >= NumElts;
13030
13031 // If we're compiling for a specific vector-length, we can check if the
13032 // pattern's VL equals that of the scalable vector at runtime.
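  // For example, when compiling with a fixed 256-bit SVE vector length, VScale
  // is 2, so a "ptrue p0.h, vl16" used as an nxv8i1 predicate (NumElts == 8)
  // covers exactly 8 * 2 lanes and is treated as all active.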
13033 if (N.getOpcode() == AArch64ISD::PTRUE) {
13034 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
13035 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
13036 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
13037 if (MaxSVESize && MinSVESize == MaxSVESize) {
13038 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
13039 unsigned PatNumElts =
13040 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
13041 return PatNumElts == (NumElts * VScale);
13042 }
13043 }
13044
13045 return false;
13046}
13047
13048// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
13049// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
13050// BUILD_VECTORs with constant element C1, C2 is a constant, and:
13051// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
13052// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
13053// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
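// For example, on v4i32, (or (and X, <0x000000ff x4>), (AArch64ISD::VSHL Y, 8))
// becomes (VSLI X, Y, 8): the AND keeps only the low 8 bits of each lane of X,
// which are exactly the bits the shifted-in Y leaves untouched.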
13054static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
13055 EVT VT = N->getValueType(0);
13056
13057 if (!VT.isVector())
13058 return SDValue();
13059
13060 SDLoc DL(N);
13061
13062 SDValue And;
13063 SDValue Shift;
13064
13065 SDValue FirstOp = N->getOperand(0);
13066 unsigned FirstOpc = FirstOp.getOpcode();
13067 SDValue SecondOp = N->getOperand(1);
13068 unsigned SecondOpc = SecondOp.getOpcode();
13069
13070 // Is one of the operands an AND or a BICi? The AND may have been optimised to
13071 // a BICi in order to use an immediate instead of a register.
13072 // Is the other operand an shl or lshr? This will have been turned into:
13073 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
13074 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
13075 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
13076 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
13077 SecondOpc == AArch64ISD::SHL_PRED ||
13078 SecondOpc == AArch64ISD::SRL_PRED)) {
13079 And = FirstOp;
13080 Shift = SecondOp;
13081
13082 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
13083 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
13084 FirstOpc == AArch64ISD::SHL_PRED ||
13085 FirstOpc == AArch64ISD::SRL_PRED)) {
13086 And = SecondOp;
13087 Shift = FirstOp;
13088 } else
13089 return SDValue();
13090
13091 bool IsAnd = And.getOpcode() == ISD::AND;
13092 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
13093 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13094 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
13095 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13096
13097 // Is the shift amount constant and are all lanes active?
13098 uint64_t C2;
13099 if (ShiftHasPredOp) {
13100 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
13101 return SDValue();
13102 APInt C;
13103 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
13104 return SDValue();
13105 C2 = C.getZExtValue();
13106 } else if (ConstantSDNode *C2node =
13107 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
13108 C2 = C2node->getZExtValue();
13109 else
13110 return SDValue();
13111
13112 APInt C1AsAPInt;
13113 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
13114 if (IsAnd) {
13115 // Is the and mask vector all constant?
13116 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
13117 return SDValue();
13118 } else {
13119 // Reconstruct the corresponding AND immediate from the two BICi immediates.
13120 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
13121 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
13122 assert(C1nodeImm && C1nodeShift);
13123 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
13124 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
13125 }
13126
13127 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
13128 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
13129 // how much one can shift elements of a particular size?
13130 if (C2 > ElemSizeInBits)
13131 return SDValue();
13132
13133 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
13134 : APInt::getLowBitsSet(ElemSizeInBits, C2);
13135 if (C1AsAPInt != RequiredC1)
13136 return SDValue();
13137
13138 SDValue X = And.getOperand(0);
13139 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
13140 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
13141 : Shift.getOperand(1);
13142
13143 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
13144 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
13145
13146 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
13147 LLVM_DEBUG(N->dump(&DAG));
13148 LLVM_DEBUG(dbgs() << "into: \n");
13149 LLVM_DEBUG(ResultSLI->dump(&DAG));
13150
13151 ++NumShiftInserts;
13152 return ResultSLI;
13153}
13154
13155SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
13156 SelectionDAG &DAG) const {
13157 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13158 !Subtarget->isNeonAvailable()))
13159 return LowerToScalableOp(Op, DAG);
13160
13161 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
13162 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
13163 return Res;
13164
13165 EVT VT = Op.getValueType();
13166 if (VT.isScalableVector())
13167 return Op;
13168
13169 SDValue LHS = Op.getOperand(0);
13170 BuildVectorSDNode *BVN =
13171 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
13172 if (!BVN) {
13173 // OR commutes, so try swapping the operands.
13174 LHS = Op.getOperand(1);
13175 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
13176 }
13177 if (!BVN)
13178 return Op;
13179
13180 APInt DefBits(VT.getSizeInBits(), 0);
13181 APInt UndefBits(VT.getSizeInBits(), 0);
13182 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13183 SDValue NewOp;
13184
13185 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13186 DefBits, &LHS)) ||
13187 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13188 DefBits, &LHS)))
13189 return NewOp;
13190
13191 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13192 UndefBits, &LHS)) ||
13193 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13194 UndefBits, &LHS)))
13195 return NewOp;
13196 }
13197
13198 // We can always fall back to a non-immediate OR.
13199 return Op;
13200}
13201
13202// Normalize the operands of BUILD_VECTOR. The value of constant operands will
13203// be truncated to fit element width.
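// For example, a v4i16 BUILD_VECTOR whose i32 operand is 0x1ffff is rebuilt
// with that operand truncated to 0xffff.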
13204static SDValue NormalizeBuildVector(SDValue Op,
13205 SelectionDAG &DAG) {
13206 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13207 SDLoc dl(Op);
13208 EVT VT = Op.getValueType();
13209 EVT EltTy = VT.getVectorElementType();
13210
13211 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
13212 return Op;
13213
13214 SmallVector<SDValue, 16> Ops;
13215 for (SDValue Lane : Op->ops()) {
13216 // For integer vectors, type legalization would have promoted the
13217 // operands already. Otherwise, if Op is a floating-point splat
13218 // (with operands cast to integers), then the only possibilities
13219 // are constants and UNDEFs.
13220 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
13221 APInt LowBits(EltTy.getSizeInBits(),
13222 CstLane->getZExtValue());
13223 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
13224 } else if (Lane.getNode()->isUndef()) {
13225 Lane = DAG.getUNDEF(MVT::i32);
13226 } else {
13227 assert(Lane.getValueType() == MVT::i32 &&
13228 "Unexpected BUILD_VECTOR operand type");
13229 }
13230 Ops.push_back(Lane);
13231 }
13232 return DAG.getBuildVector(VT, dl, Ops);
13233}
13234
13235static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
13236 const AArch64Subtarget *ST) {
13237 EVT VT = Op.getValueType();
13238 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
13239 "Expected a legal NEON vector");
13240
13241 APInt DefBits(VT.getSizeInBits(), 0);
13242 APInt UndefBits(VT.getSizeInBits(), 0);
13243 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13244 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13245 auto TryMOVIWithBits = [&](APInt DefBits) {
13246 SDValue NewOp;
13247 if ((NewOp =
13248 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
13249 (NewOp =
13250 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13251 (NewOp =
13252 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
13253 (NewOp =
13254 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13255 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
13256 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
13257 return NewOp;
13258
13259 APInt NotDefBits = ~DefBits;
13260 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
13261 NotDefBits)) ||
13262 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
13263 NotDefBits)) ||
13264 (NewOp =
13265 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
13266 return NewOp;
13267 return SDValue();
13268 };
13269 if (SDValue R = TryMOVIWithBits(DefBits))
13270 return R;
13271 if (SDValue R = TryMOVIWithBits(UndefBits))
13272 return R;
13273
13274 // See if a fneg of the constant can be materialized with a MOVI, etc
13275 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
13276 // FNegate each sub-element of the constant
13277 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
13278 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
13279 .zext(VT.getSizeInBits());
13280 APInt NegBits(VT.getSizeInBits(), 0);
13281 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
13282 for (unsigned i = 0; i < NumElts; i++)
13283 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
13284 NegBits = DefBits ^ NegBits;
13285
13286 // Try to create the new constants with MOVI, and if so generate a fneg
13287 // for it.
13288 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
13289 SDLoc DL(Op);
13290 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
13291 return DAG.getNode(
13292 AArch64ISD::NVCAST, DL, VT,
13293 DAG.getNode(ISD::FNEG, DL, VFVT,
13294 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
13295 }
13296 return SDValue();
13297 };
13298 SDValue R;
13299 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
13300 (R = TryWithFNeg(DefBits, MVT::f64)) ||
13301 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
13302 return R;
13303 }
13304
13305 return SDValue();
13306}
13307
13308SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
13309 SelectionDAG &DAG) const {
13310 EVT VT = Op.getValueType();
13311
13312 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
13313 if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
13314 SDLoc DL(Op);
13315 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
13316 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
13317 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
13318 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
13319 return convertFromScalableVector(DAG, Op.getValueType(), Seq);
13320 }
13321
13322 // Revert to common legalisation for all other variants.
13323 return SDValue();
13324 }
13325
13326 // Try to build a simple constant vector.
13327 Op = NormalizeBuildVector(Op, DAG);
13328 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS), if so,
13329 // abort.
13330 if (Op.getOpcode() != ISD::BUILD_VECTOR)
13331 return SDValue();
13332
13333 // Certain vector constants, used to express things like logical NOT and
13334 // arithmetic NEG, are passed through unmodified. This allows special
13335 // patterns for these operations to match, which will lower these constants
13336 // to whatever is proven necessary.
13337 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13338 if (BVN->isConstant()) {
13339 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
13340 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
13341 APInt Val(BitSize,
13342 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
13343 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
13344 return Op;
13345 }
13346 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
13347 if (Const->isZero() && !Const->isNegative())
13348 return Op;
13349 }
13350
13351 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
13352 return V;
13353
13354 // Scan through the operands to find some interesting properties we can
13355 // exploit:
13356 // 1) If only one value is used, we can use a DUP, or
13357 // 2) if only the low element is not undef, we can just insert that, or
13358 // 3) if only one constant value is used (w/ some non-constant lanes),
13359 // we can splat the constant value into the whole vector then fill
13360 // in the non-constant lanes.
13361 // 4) FIXME: If different constant values are used, but we can intelligently
13362 // select the values we'll be overwriting for the non-constant
13363 // lanes such that we can directly materialize the vector
13364 // some other way (MOVI, e.g.), we can be sneaky.
13365 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
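  // For example (case 5), a v4i32 BUILD_VECTOR of lanes 0, 2, 4 and 6 of a
  // single v8i32 source is the "Even" pattern and becomes UZP1 of the low and
  // high halves of that source.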
13366 SDLoc dl(Op);
13367 unsigned NumElts = VT.getVectorNumElements();
13368 bool isOnlyLowElement = true;
13369 bool usesOnlyOneValue = true;
13370 bool usesOnlyOneConstantValue = true;
13371 bool isConstant = true;
13372 bool AllLanesExtractElt = true;
13373 unsigned NumConstantLanes = 0;
13374 unsigned NumDifferentLanes = 0;
13375 unsigned NumUndefLanes = 0;
13376 SDValue Value;
13377 SDValue ConstantValue;
13378 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
13379 unsigned ConsecutiveValCount = 0;
13380 SDValue PrevVal;
13381 for (unsigned i = 0; i < NumElts; ++i) {
13382 SDValue V = Op.getOperand(i);
13383 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13384 AllLanesExtractElt = false;
13385 if (V.isUndef()) {
13386 ++NumUndefLanes;
13387 continue;
13388 }
13389 if (i > 0)
13390 isOnlyLowElement = false;
13391 if (!isIntOrFPConstant(V))
13392 isConstant = false;
13393
13394 if (isIntOrFPConstant(V)) {
13395 ++NumConstantLanes;
13396 if (!ConstantValue.getNode())
13397 ConstantValue = V;
13398 else if (ConstantValue != V)
13399 usesOnlyOneConstantValue = false;
13400 }
13401
13402 if (!Value.getNode())
13403 Value = V;
13404 else if (V != Value) {
13405 usesOnlyOneValue = false;
13406 ++NumDifferentLanes;
13407 }
13408
13409 if (PrevVal != V) {
13410 ConsecutiveValCount = 0;
13411 PrevVal = V;
13412 }
13413
13414 // Keep the different values and their last consecutive counts. For example,
13415 //
13416 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13417 // t24, t24, t24, t24, t24, t24, t24, t24
13418 // t23 = consecutive count 8
13419 // t24 = consecutive count 8
13420 // ------------------------------------------------------------------
13421 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
13422 // t24, t24, t24, t24, t24, t24, t24, t24
13423 // t23 = consecutive count 5
13424 // t24 = consecutive count 9
13425 DifferentValueMap[V] = ++ConsecutiveValCount;
13426 }
13427
13428 if (!Value.getNode()) {
13429 LLVM_DEBUG(
13430 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
13431 return DAG.getUNDEF(VT);
13432 }
13433
13434 // Convert BUILD_VECTOR where all elements but the lowest are undef into
13435 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
13436 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
13437 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
13438 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
13439 "SCALAR_TO_VECTOR node\n");
13440 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
13441 }
13442
13443 if (AllLanesExtractElt) {
13444 SDNode *Vector = nullptr;
13445 bool Even = false;
13446 bool Odd = false;
13447 // Check whether the extract elements match the Even pattern <0,2,4,...> or
13448 // the Odd pattern <1,3,5,...>.
13449 for (unsigned i = 0; i < NumElts; ++i) {
13450 SDValue V = Op.getOperand(i);
13451 const SDNode *N = V.getNode();
13452 if (!isa<ConstantSDNode>(N->getOperand(1))) {
13453 Even = false;
13454 Odd = false;
13455 break;
13456 }
13457 SDValue N0 = N->getOperand(0);
13458
13459 // All elements are extracted from the same vector.
13460 if (!Vector) {
13461 Vector = N0.getNode();
13462 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
13463 // BUILD_VECTOR.
13464 if (VT.getVectorElementType() !=
13465 N0.getValueType().getVectorElementType())
13466 break;
13467 } else if (Vector != N0.getNode()) {
13468 Odd = false;
13469 Even = false;
13470 break;
13471 }
13472
13473 // Extracted values are either at Even indices <0,2,4,...> or at Odd
13474 // indices <1,3,5,...>.
13475 uint64_t Val = N->getConstantOperandVal(1);
13476 if (Val == 2 * i) {
13477 Even = true;
13478 continue;
13479 }
13480 if (Val - 1 == 2 * i) {
13481 Odd = true;
13482 continue;
13483 }
13484
13485 // Something does not match: abort.
13486 Odd = false;
13487 Even = false;
13488 break;
13489 }
13490 if (Even || Odd) {
13491 SDValue LHS =
13492 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13493 DAG.getConstant(0, dl, MVT::i64));
13494 SDValue RHS =
13495 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13496 DAG.getConstant(NumElts, dl, MVT::i64));
13497
13498 if (Even && !Odd)
13499 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
13500 RHS);
13501 if (Odd && !Even)
13502 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
13503 RHS);
13504 }
13505 }
13506
13507 // Use DUP for non-constant splats. For f32 constant splats, reduce to
13508 // i32 and try again.
13509 if (usesOnlyOneValue) {
13510 if (!isConstant) {
13511 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13512 Value.getValueType() != VT) {
13513 LLVM_DEBUG(
13514 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
13515 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
13516 }
13517
13518 // This is actually a DUPLANExx operation, which keeps everything vectory.
13519
13520 SDValue Lane = Value.getOperand(1);
13521 Value = Value.getOperand(0);
13522 if (Value.getValueSizeInBits() == 64) {
13523 LLVM_DEBUG(
13524 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
13525 "widening it\n");
13526 Value = WidenVector(Value, DAG);
13527 }
13528
13529 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
13530 return DAG.getNode(Opcode, dl, VT, Value, Lane);
13531 }
13532
13533 if (VT.getVectorElementType().isFloatingPoint()) {
13534 SmallVector<SDValue, 8> Ops;
13535 EVT EltTy = VT.getVectorElementType();
13536 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
13537 EltTy == MVT::f64) && "Unsupported floating-point vector type");
13538 LLVM_DEBUG(
13539 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
13540 "BITCASTS, and try again\n");
13541 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
13542 for (unsigned i = 0; i < NumElts; ++i)
13543 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
13544 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
13545 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
13546 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
13547 Val.dump(););
13548 Val = LowerBUILD_VECTOR(Val, DAG);
13549 if (Val.getNode())
13550 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
13551 }
13552 }
13553
13554 // If we need to insert a small number of different non-constant elements and
13555 // the vector width is sufficiently large, prefer using DUP with the common
13556 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
13557 // skip the constant lane handling below.
13558 bool PreferDUPAndInsert =
13559 !isConstant && NumDifferentLanes >= 1 &&
13560 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
13561 NumDifferentLanes >= NumConstantLanes;
13562
13563 // If there was only one constant value used and for more than one lane,
13564 // start by splatting that value, then replace the non-constant lanes. This
13565 // is better than the default, which will perform a separate initialization
13566 // for each lane.
13567 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
13568 // Firstly, try to materialize the splat constant.
13569 SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue);
13570 unsigned BitSize = VT.getScalarSizeInBits();
13571 APInt ConstantValueAPInt(1, 0);
13572 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
13573 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
13574 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
13575 !ConstantValueAPInt.isAllOnes()) {
13576 Val = ConstantBuildVector(Val, DAG, Subtarget);
13577 if (!Val)
13578 // Otherwise, materialize the constant and splat it.
13579 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
13580 }
13581
13582 // Now insert the non-constant lanes.
13583 for (unsigned i = 0; i < NumElts; ++i) {
13584 SDValue V = Op.getOperand(i);
13585 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13586 if (!isIntOrFPConstant(V))
13587 // Note that type legalization likely mucked about with the VT of the
13588 // source operand, so we may have to convert it here before inserting.
13589 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
13590 }
13591 return Val;
13592 }
13593
13594 // This will generate a load from the constant pool.
13595 if (isConstant) {
13596 LLVM_DEBUG(
13597 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
13598 "expansion\n");
13599 return SDValue();
13600 }
13601
13602 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13603 // v4i32s. This is really a truncate, which we can construct out of (legal)
13604 // concats and truncate nodes.
13605 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
13606 return M;
13607
13608 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
13609 if (NumElts >= 4) {
13610 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
13611 return Shuffle;
13612
13613 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
13614 return Shuffle;
13615 }
13616
13617 if (PreferDUPAndInsert) {
13618 // First, build a constant vector with the common element.
13619 SmallVector<SDValue, 8> Ops(NumElts, Value);
13620 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
13621 // Next, insert the elements that do not match the common value.
13622 for (unsigned I = 0; I < NumElts; ++I)
13623 if (Op.getOperand(I) != Value)
13624 NewVector =
13625 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
13626 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
13627
13628 return NewVector;
13629 }
13630
13631 // If vector consists of two different values, try to generate two DUPs and
13632 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
13633 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
13634 SmallVector<SDValue, 2> Vals;
13635 // Check whether each value occupies a consecutive run of half the vector
13636 // elements. In this case, we can use CONCAT_VECTORS. For example,
13637 //
13638 // canUseVECTOR_CONCAT = true;
13639 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13640 // t24, t24, t24, t24, t24, t24, t24, t24
13641 //
13642 // canUseVECTOR_CONCAT = false;
13643 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
13644 // t24, t24, t24, t24, t24, t24, t24, t24
13645 bool canUseVECTOR_CONCAT = true;
13646 for (auto Pair : DifferentValueMap) {
13647 // Check that each distinct value occupies exactly NumElts / 2 lanes.
13648 if (Pair.second != NumElts / 2)
13649 canUseVECTOR_CONCAT = false;
13650 Vals.push_back(Pair.first);
13651 }
13652
13653 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
13654 // CONCAT_VECTORs. For example,
13655 //
13656 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
13657 // t24, t24, t24, t24, t24, t24, t24, t24
13658 // ==>
13659 // t26: v8i8 = AArch64ISD::DUP t23
13660 // t28: v8i8 = AArch64ISD::DUP t24
13661 // t29: v16i8 = concat_vectors t26, t28
13662 if (canUseVECTOR_CONCAT) {
13663 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13664 if (isTypeLegal(SubVT) && SubVT.isVector() &&
13665 SubVT.getVectorNumElements() >= 2) {
13666 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
13667 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
13668 SDValue DUP1 =
13669 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
13670 SDValue DUP2 =
13671 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
13672 SDValue CONCAT_VECTORS =
13673 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
13674 return CONCAT_VECTORS;
13675 }
13676 }
13677
13678 // Let's try to generate VECTOR_SHUFFLE. For example,
13679 //
13680 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
13681 // ==>
13682 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
13683 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
13684 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
13685 if (NumElts >= 8) {
13686 SmallVector<int, 16> MaskVec;
13687 // Build mask for VECTOR_SHUFFLE.
13688 SDValue FirstLaneVal = Op.getOperand(0);
13689 for (unsigned i = 0; i < NumElts; ++i) {
13690 SDValue Val = Op.getOperand(i);
13691 if (FirstLaneVal == Val)
13692 MaskVec.push_back(i);
13693 else
13694 MaskVec.push_back(i + NumElts);
13695 }
13696
13697 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
13698 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
13699 SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1);
13700 SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2);
13701 SDValue VECTOR_SHUFFLE =
13702 DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec);
13703 return VECTOR_SHUFFLE;
13704 }
13705 }
13706
13707 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
13708 // know the default expansion would otherwise fall back on something even
13709 // worse. For a vector with one or two non-undef values, that's
13710 // scalar_to_vector for the elements followed by a shuffle (provided the
13711 // shuffle is valid for the target) and materialization element by element
13712 // on the stack followed by a load for everything else.
13713 if (!isConstant && !usesOnlyOneValue) {
13714 LLVM_DEBUG(
13715 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
13716 "of INSERT_VECTOR_ELT\n");
13717
13718 SDValue Vec = DAG.getUNDEF(VT);
13719 SDValue Op0 = Op.getOperand(0);
13720 unsigned i = 0;
13721
13722 // Use SCALAR_TO_VECTOR for lane zero to
13723 // a) Avoid a RMW dependency on the full vector register, and
13724 // b) Allow the register coalescer to fold away the copy if the
13725 // value is already in an S or D register, and we're forced to emit an
13726 // INSERT_SUBREG that we can't fold anywhere.
13727 //
13728 // We also allow types like i8 and i16 which are illegal scalar but legal
13729 // vector element types. After type-legalization the inserted value is
13730 // extended (i32) and it is safe to cast them to the vector type by ignoring
13731 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
13732 if (!Op0.isUndef()) {
13733 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
13734 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
13735 ++i;
13736 }
13737 LLVM_DEBUG(if (i < NumElts) dbgs()
13738 << "Creating nodes for the other vector elements:\n";);
13739 for (; i < NumElts; ++i) {
13740 SDValue V = Op.getOperand(i);
13741 if (V.isUndef())
13742 continue;
13743 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13744 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
13745 }
13746 return Vec;
13747 }
13748
13749 LLVM_DEBUG(
13750 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
13751 "better alternative\n");
13752 return SDValue();
13753}
13754
13755SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
13756 SelectionDAG &DAG) const {
13757 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13758 !Subtarget->isNeonAvailable()))
13759 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
13760
13761 assert(Op.getValueType().isScalableVector() &&
13762 isTypeLegal(Op.getValueType()) &&
13763 "Expected legal scalable vector type!");
13764
13765 if (isTypeLegal(Op.getOperand(0).getValueType())) {
13766 unsigned NumOperands = Op->getNumOperands();
13767 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
13768 "Unexpected number of operands in CONCAT_VECTORS");
13769
13770 if (NumOperands == 2)
13771 return Op;
13772
13773 // Concat each pair of subvectors and pack into the lower half of the array.
13774 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
13775 while (ConcatOps.size() > 1) {
13776 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
13777 SDValue V1 = ConcatOps[I];
13778 SDValue V2 = ConcatOps[I + 1];
13779 EVT SubVT = V1.getValueType();
13780 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
13781 ConcatOps[I / 2] =
13782 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
13783 }
13784 ConcatOps.resize(ConcatOps.size() / 2);
13785 }
13786 return ConcatOps[0];
13787 }
13788
13789 return SDValue();
13790}
13791
13792SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13793 SelectionDAG &DAG) const {
13794 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
13795
13796 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13797 !Subtarget->isNeonAvailable()))
13798 return LowerFixedLengthInsertVectorElt(Op, DAG);
13799
13800 EVT VT = Op.getOperand(0).getValueType();
13801
13802 if (VT.getScalarType() == MVT::i1) {
13803 EVT VectorVT = getPromotedVTForPredicate(VT);
13804 SDLoc DL(Op);
13805 SDValue ExtendedVector =
13806 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
13807 SDValue ExtendedValue =
13808 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
13809 VectorVT.getScalarType().getSizeInBits() < 32
13810 ? MVT::i32
13811 : VectorVT.getScalarType());
13812 ExtendedVector =
13813 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
13814 ExtendedValue, Op.getOperand(2));
13815 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
13816 }
13817
13818 // Check for non-constant or out of range lane.
13819 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
13820 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13821 return SDValue();
13822
13823 return Op;
13824}
13825
13826SDValue
13827AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13828 SelectionDAG &DAG) const {
13829 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
13830 EVT VT = Op.getOperand(0).getValueType();
13831
13832 if (VT.getScalarType() == MVT::i1) {
13833 // We can't directly extract from an SVE predicate; extend it first.
13834 // (This isn't the only possible lowering, but it's straightforward.)
13835 EVT VectorVT = getPromotedVTForPredicate(VT);
13836 SDLoc DL(Op);
13837 SDValue Extend =
13838 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
13839 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
13840 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
13841 Extend, Op.getOperand(1));
13842 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
13843 }
13844
13845 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13846 return LowerFixedLengthExtractVectorElt(Op, DAG);
13847
13848 // Check for non-constant or out of range lane.
13849 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
13850 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13851 return SDValue();
13852
13853 // Insertion/extraction are legal for V128 types.
13854 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
13855 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
13856 VT == MVT::v8f16 || VT == MVT::v8bf16)
13857 return Op;
13858
13859 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
13860 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
13861 VT != MVT::v4bf16)
13862 return SDValue();
13863
13864 // For V64 types, we perform extraction by expanding the value
13865 // to a V128 type and performing the extraction on that.
13866 SDLoc DL(Op);
13867 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
13868 EVT WideTy = WideVec.getValueType();
13869
13870 EVT ExtrTy = WideTy.getVectorElementType();
13871 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
13872 ExtrTy = MVT::i32;
13873
13874 // For extractions, we just return the result directly.
13875 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
13876 Op.getOperand(1));
13877}
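// [Editorial example, assuming the usual NEON lowering] Extracting lane 2 of a
// v4i16 value takes the widening path above: the operand is widened to v8i16
// (upper lanes undef), the extract is re-issued on the v8i16 value, and the
// result type is promoted from i16 to i32:
//   t1: i32 = extract_vector_elt (v8i16 widened), Constant:i64<2>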
13878
13879SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
13880 SelectionDAG &DAG) const {
13881 assert(Op.getValueType().isFixedLengthVector() &&
13882 "Only cases that extract a fixed length vector are supported!");
13883
13884 EVT InVT = Op.getOperand(0).getValueType();
13885 unsigned Idx = Op.getConstantOperandVal(1);
13886 unsigned Size = Op.getValueSizeInBits();
13887
13888 // If we don't have legal types yet, do nothing
13889 if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
13890 return SDValue();
13891
13892 if (InVT.isScalableVector()) {
13893 // This will be matched by custom code during ISelDAGToDAG.
13894 if (Idx == 0 && isPackedVectorType(InVT, DAG))
13895 return Op;
13896
13897 return SDValue();
13898 }
13899
13900 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
13901 if (Idx == 0 && InVT.getSizeInBits() <= 128)
13902 return Op;
13903
13904 // If this is extracting the upper 64-bits of a 128-bit vector, we match
13905 // that directly.
13906 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
13907 InVT.getSizeInBits() == 128 && Subtarget->isNeonAvailable())
13908 return Op;
13909
13910 if (useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
13911 SDLoc DL(Op);
13912
13913 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
13914 SDValue NewInVec =
13915 convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
13916
13917 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
13918 NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
13919 return convertFromScalableVector(DAG, Op.getValueType(), Splice);
13920 }
13921
13922 return SDValue();
13923}
13924
13925SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
13926 SelectionDAG &DAG) const {
13927 assert(Op.getValueType().isScalableVector() &&
13928 "Only expect to lower inserts into scalable vectors!");
13929
13930 EVT InVT = Op.getOperand(1).getValueType();
13931 unsigned Idx = Op.getConstantOperandVal(2);
13932
13933 SDValue Vec0 = Op.getOperand(0);
13934 SDValue Vec1 = Op.getOperand(1);
13935 SDLoc DL(Op);
13936 EVT VT = Op.getValueType();
13937
13938 if (InVT.isScalableVector()) {
13939 if (!isTypeLegal(VT))
13940 return SDValue();
13941
13942 // Break down insert_subvector into simpler parts.
13943 if (VT.getVectorElementType() == MVT::i1) {
13944 unsigned NumElts = VT.getVectorMinNumElements();
13945 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13946
13947 SDValue Lo, Hi;
13948 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13949 DAG.getVectorIdxConstant(0, DL));
13950 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13951 DAG.getVectorIdxConstant(NumElts / 2, DL));
13952 if (Idx < (NumElts / 2))
13953 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
13954 DAG.getVectorIdxConstant(Idx, DL));
13955 else
13956 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
13957 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
13958
13959 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
13960 }
13961
13962 // Ensure the subvector is half the size of the main vector.
13963 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
13964 return SDValue();
13965
13966 // Here narrow and wide refer to the vector element types. After "casting"
13967 // both vectors must have the same bit length and so because the subvector
13968 // has fewer elements, those elements need to be bigger.
13969 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
13970 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
13971
13972 // NOP cast operands to the largest legal vector of the same element count.
13973 if (VT.isFloatingPoint()) {
13974 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
13975 Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
13976 } else {
13977 // Legal integer vectors are already their largest so Vec0 is fine as is.
13978 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
13979 }
13980
13981 // To replace the top/bottom half of vector V with vector SubV we widen the
13982 // preserved half of V, concatenate this to SubV (the order depending on the
13983 // half being replaced) and then narrow the result.
13984 SDValue Narrow;
13985 if (Idx == 0) {
13986 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
13987 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
13988 } else {
13989 assert(Idx == InVT.getVectorMinNumElements() &&
13990 "Invalid subvector index!");
13991 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
13992 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
13993 }
13994
13995 return getSVESafeBitCast(VT, Narrow, DAG);
13996 }
13997
13998 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
13999 // This will be matched by custom code during ISelDAGToDAG.
14000 if (Vec0.isUndef())
14001 return Op;
14002
14003 std::optional<unsigned> PredPattern =
14004 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
14005 auto PredTy = VT.changeVectorElementType(MVT::i1);
14006 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
14007 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
14008 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
14009 }
14010
14011 return SDValue();
14012}
14013
14014static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
14015 if (Op.getOpcode() != AArch64ISD::DUP &&
14016 Op.getOpcode() != ISD::SPLAT_VECTOR &&
14017 Op.getOpcode() != ISD::BUILD_VECTOR)
14018 return false;
14019
14020 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
14021 !isAllConstantBuildVector(Op, SplatVal))
14022 return false;
14023
14024 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
14025 !isa<ConstantSDNode>(Op->getOperand(0)))
14026 return false;
14027
14028 SplatVal = Op->getConstantOperandVal(0);
14029 if (Op.getValueType().getVectorElementType() != MVT::i64)
14030 SplatVal = (int32_t)SplatVal;
14031
14032 Negated = false;
14033 if (isPowerOf2_64(SplatVal))
14034 return true;
14035
14036 Negated = true;
14037 if (isPowerOf2_64(-SplatVal)) {
14038 SplatVal = -SplatVal;
14039 return true;
14040 }
14041
14042 return false;
14043}
14044
14045SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
14046 EVT VT = Op.getValueType();
14047 SDLoc dl(Op);
14048
14049 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
14050 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
14051
14052 assert(VT.isScalableVector() && "Expected a scalable vector.");
14053
14054 bool Signed = Op.getOpcode() == ISD::SDIV;
14055 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
14056
14057 bool Negated;
14058 uint64_t SplatVal;
14059 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
14060 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
14061 SDValue Res =
14062 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
14063 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
14064 if (Negated)
14065 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
14066
14067 return Res;
14068 }
14069
14070 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
14071 return LowerToPredicatedOp(Op, DAG, PredOpcode);
14072
14073 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
14074 // operations, and truncate the result.
14075 EVT WidenedVT;
14076 if (VT == MVT::nxv16i8)
14077 WidenedVT = MVT::nxv8i16;
14078 else if (VT == MVT::nxv8i16)
14079 WidenedVT = MVT::nxv4i32;
14080 else
14081 llvm_unreachable("Unexpected Custom DIV operation");
14082
14083 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
14084 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
14085 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
14086 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
14087 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
14088 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
14089 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
14090 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
14091 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
14092}
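// [Editorial note, illustrative sketch not in the upstream source] For an
// nxv16i8 sdiv the code above sign-unpacks both operands into nxv8i16 halves,
// emits two nxv8i16 sdivs (which are widened once more to nxv4i32 when they
// are themselves lowered), and re-packs the truncated results with UZP1.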
14093
14094bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
14095 // Currently no fixed length shuffles that require SVE are legal.
14096 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14097 return false;
14098
14099 if (VT.getVectorNumElements() == 4 &&
14100 (VT.is128BitVector() || VT.is64BitVector())) {
14101 unsigned Cost = getPerfectShuffleCost(M);
14102 if (Cost <= 1)
14103 return true;
14104 }
14105
14106 bool DummyBool;
14107 int DummyInt;
14108 unsigned DummyUnsigned;
14109
14110 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
14111 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
14112 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
14113 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
14114 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
14115 isZIPMask(M, VT, DummyUnsigned) ||
14116 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
14117 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
14118 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
14119 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
14120 isConcatMask(M, VT, VT.getSizeInBits() == 128));
14121}
14122
14123bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
14124 EVT VT) const {
14125 // Just delegate to the generic legality, clear masks aren't special.
14126 return isShuffleMaskLegal(M, VT);
14127}
14128
14129/// getVShiftImm - Check if this is a valid build_vector for the immediate
14130/// operand of a vector shift operation, where all the elements of the
14131/// build_vector must have the same constant integer value.
14132static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
14133 // Ignore bit_converts.
14134 while (Op.getOpcode() == ISD::BITCAST)
14135 Op = Op.getOperand(0);
14136 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
14137 APInt SplatBits, SplatUndef;
14138 unsigned SplatBitSize;
14139 bool HasAnyUndefs;
14140 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
14141 HasAnyUndefs, ElementBits) ||
14142 SplatBitSize > ElementBits)
14143 return false;
14144 Cnt = SplatBits.getSExtValue();
14145 return true;
14146}
14147
14148/// isVShiftLImm - Check if this is a valid build_vector for the immediate
14149/// operand of a vector shift left operation. That value must be in the range:
14150/// 0 <= Value < ElementBits for a left shift; or
14151/// 0 <= Value <= ElementBits for a long left shift.
14152static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
14153 assert(VT.isVector() && "vector shift count is not a vector type");
14154 int64_t ElementBits = VT.getScalarSizeInBits();
14155 if (!getVShiftImm(Op, ElementBits, Cnt))
14156 return false;
14157 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
14158}
14159
14160/// isVShiftRImm - Check if this is a valid build_vector for the immediate
14161/// operand of a vector shift right operation. The value must be in the range:
14162/// 1 <= Value <= ElementBits for a right shift; or
14163static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
14164 assert(VT.isVector() && "vector shift count is not a vector type");
14165 int64_t ElementBits = VT.getScalarSizeInBits();
14166 if (!getVShiftImm(Op, ElementBits, Cnt))
14167 return false;
14168 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
14169}
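// [Editorial example, not part of the upstream source] For v4i32 these helpers
// accept splat shift amounts of 0..31 for a plain left shift, 0..32 for a long
// left shift (e.g. SHLL), 1..32 for a right shift, and 1..16 for a narrowing
// right shift, matching the immediate ranges encodable by the NEON shift
// instructions.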
14170
14171SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
14172 SelectionDAG &DAG) const {
14173 EVT VT = Op.getValueType();
14174
14175 if (VT.getScalarType() == MVT::i1) {
14176 // Lower i1 truncate to `(x & 1) != 0`.
14177 SDLoc dl(Op);
14178 EVT OpVT = Op.getOperand(0).getValueType();
14179 SDValue Zero = DAG.getConstant(0, dl, OpVT);
14180 SDValue One = DAG.getConstant(1, dl, OpVT);
14181 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
14182 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
14183 }
14184
14185 if (!VT.isVector() || VT.isScalableVector())
14186 return SDValue();
14187
14188 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14189 !Subtarget->isNeonAvailable()))
14190 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
14191
14192 return SDValue();
14193}
14194
14195// Check if we can lower this SRL to a rounding shift instruction. ResVT is
14196// possibly a truncated type, it tells how many bits of the value are to be
14197// used.
14198static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
14199 SelectionDAG &DAG,
14200 unsigned &ShiftValue,
14201 SDValue &RShOperand) {
14202 if (Shift->getOpcode() != ISD::SRL)
14203 return false;
14204
14205 EVT VT = Shift.getValueType();
14206 assert(VT.isScalableVT());
14207
14208 auto ShiftOp1 =
14209 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
14210 if (!ShiftOp1)
14211 return false;
14212
14213 ShiftValue = ShiftOp1->getZExtValue();
14214 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
14215 return false;
14216
14217 SDValue Add = Shift->getOperand(0);
14218 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
14219 return false;
14220
14221 assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
14222 "ResVT must be truncated or same type as the shift.");
14223 // Check if an overflow can lead to incorrect results.
14224 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
14225 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
14226 return false;
14227
14228 auto AddOp1 =
14229 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
14230 if (!AddOp1)
14231 return false;
14232 uint64_t AddValue = AddOp1->getZExtValue();
14233 if (AddValue != 1ULL << (ShiftValue - 1))
14234 return false;
14235
14236 RShOperand = Add->getOperand(0);
14237 return true;
14238}
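// [Editorial example, not part of the upstream source] This recognises the
// usual rounding idiom: with ShiftValue == 4 the accepted DAG is
//   srl (add X, splat(8)), splat(4)    // 8 == 1 << (4 - 1)
// which can be emitted as a single rounding shift right (URSHR) of X by 4,
// provided the add cannot overflow into the bits that survive truncation.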
14239
14240SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
14241 SelectionDAG &DAG) const {
14242 EVT VT = Op.getValueType();
14243 SDLoc DL(Op);
14244 int64_t Cnt;
14245
14246 if (!Op.getOperand(1).getValueType().isVector())
14247 return Op;
14248 unsigned EltSize = VT.getScalarSizeInBits();
14249
14250 switch (Op.getOpcode()) {
14251 case ISD::SHL:
14252 if (VT.isScalableVector() ||
14253 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14254 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
14255
14256 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
14257 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
14258 DAG.getConstant(Cnt, DL, MVT::i32));
14259 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14260 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
14261 MVT::i32),
14262 Op.getOperand(0), Op.getOperand(1));
14263 case ISD::SRA:
14264 case ISD::SRL:
14265 if (VT.isScalableVector() && Subtarget->hasSVE2orSME()) {
14266 SDValue RShOperand;
14267 unsigned ShiftValue;
14268 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
14269 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
14270 getPredicateForVector(DAG, DL, VT), RShOperand,
14271 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
14272 }
14273
14274 if (VT.isScalableVector() ||
14275 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
14276 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
14277 : AArch64ISD::SRL_PRED;
14278 return LowerToPredicatedOp(Op, DAG, Opc);
14279 }
14280
14281 // Right shift immediate
14282 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
14283 unsigned Opc =
14284 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
14285 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
14286 DAG.getConstant(Cnt, DL, MVT::i32));
14287 }
14288
14289 // Right shift register. Note, there is not a shift right register
14290 // instruction, but the shift left register instruction takes a signed
14291 // value, where negative numbers specify a right shift.
14292 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
14293 : Intrinsic::aarch64_neon_ushl;
14294 // negate the shift amount
14295 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
14296 Op.getOperand(1));
14297 SDValue NegShiftLeft =
14298 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14299 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
14300 NegShift);
14301 return NegShiftLeft;
14302 }
14303
14304 llvm_unreachable("unexpected shift opcode");
14305}
14306
14307static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
14308 AArch64CC::CondCode CC, bool NoNans, EVT VT,
14309 const SDLoc &dl, SelectionDAG &DAG) {
14310 EVT SrcVT = LHS.getValueType();
14311 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
14312 "function only supposed to emit natural comparisons");
14313
14314 APInt SplatValue;
14315 APInt SplatUndef;
14316 unsigned SplatBitSize = 0;
14317 bool HasAnyUndefs;
14318
14319 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
14320 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
14321 SplatBitSize, HasAnyUndefs);
14322
14323 bool IsZero = IsCnst && SplatValue == 0;
14324 bool IsOne =
14325 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
14326 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
14327
14328 if (SrcVT.getVectorElementType().isFloatingPoint()) {
14329 switch (CC) {
14330 default:
14331 return SDValue();
14332 case AArch64CC::NE: {
14333 SDValue Fcmeq;
14334 if (IsZero)
14335 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14336 else
14337 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14338 return DAG.getNOT(dl, Fcmeq, VT);
14339 }
14340 case AArch64CC::EQ:
14341 if (IsZero)
14342 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14343 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14344 case AArch64CC::GE:
14345 if (IsZero)
14346 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
14347 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
14348 case AArch64CC::GT:
14349 if (IsZero)
14350 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
14351 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
14352 case AArch64CC::LE:
14353 if (!NoNans)
14354 return SDValue();
14355 // If we ignore NaNs then we can use the LS implementation.
14356 [[fallthrough]];
14357 case AArch64CC::LS:
14358 if (IsZero)
14359 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
14360 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
14361 case AArch64CC::LT:
14362 if (!NoNans)
14363 return SDValue();
14364 // If we ignore NaNs then we can use the MI implementation.
14365 [[fallthrough]];
14366 case AArch64CC::MI:
14367 if (IsZero)
14368 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
14369 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
14370 }
14371 }
14372
14373 switch (CC) {
14374 default:
14375 return SDValue();
14376 case AArch64CC::NE: {
14377 SDValue Cmeq;
14378 if (IsZero)
14379 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14380 else
14381 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14382 return DAG.getNOT(dl, Cmeq, VT);
14383 }
14384 case AArch64CC::EQ:
14385 if (IsZero)
14386 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14387 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14388 case AArch64CC::GE:
14389 if (IsZero)
14390 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
14391 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
14392 case AArch64CC::GT:
14393 if (IsZero)
14394 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
14395 if (IsMinusOne)
14396 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS, RHS);
14397 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
14398 case AArch64CC::LE:
14399 if (IsZero)
14400 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14401 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
14402 case AArch64CC::LS:
14403 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
14404 case AArch64CC::LO:
14405 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
14406 case AArch64CC::LT:
14407 if (IsZero)
14408 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
14409 if (IsOne)
14410 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14411 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
14412 case AArch64CC::HI:
14413 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
14414 case AArch64CC::HS:
14415 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
14416 }
14417}
14418
14419SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
14420 SelectionDAG &DAG) const {
14421 if (Op.getValueType().isScalableVector())
14422 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
14423
14424 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14425 !Subtarget->isNeonAvailable()))
14426 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
14427
14428 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
14429 SDValue LHS = Op.getOperand(0);
14430 SDValue RHS = Op.getOperand(1);
14431 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
14432 SDLoc dl(Op);
14433
14434 if (LHS.getValueType().getVectorElementType().isInteger()) {
14435 assert(LHS.getValueType() == RHS.getValueType());
14436 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
14437 SDValue Cmp =
14438 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
14439 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14440 }
14441
14442 // Lower isnan(x) | isnan(never-nan) to x != x.
14443 // Lower !isnan(x) & !isnan(never-nan) to x == x.
14444 if (CC == ISD::SETUO || CC == ISD::SETO) {
14445 bool OneNaN = false;
14446 if (LHS == RHS) {
14447 OneNaN = true;
14448 } else if (DAG.isKnownNeverNaN(RHS)) {
14449 OneNaN = true;
14450 RHS = LHS;
14451 } else if (DAG.isKnownNeverNaN(LHS)) {
14452 OneNaN = true;
14453 LHS = RHS;
14454 }
14455 if (OneNaN) {
14456 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
14457 }
14458 }
14459
14460 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
14461
14462 // Make v4f16 (only) fcmp operations utilise vector instructions
14463 // v8f16 support will be a little more complicated
14464 if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
14465 LHS.getValueType().getVectorElementType() == MVT::bf16) {
14466 if (LHS.getValueType().getVectorNumElements() == 4) {
14467 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
14468 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
14469 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
14470 DAG.ReplaceAllUsesWith(Op, NewSetcc);
14471 CmpVT = MVT::v4i32;
14472 } else
14473 return SDValue();
14474 }
14475
14476 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
14477 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
14478 LHS.getValueType().getVectorElementType() != MVT::f128);
14479
14480 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
14481 // clean. Some of them require two branches to implement.
14482 AArch64CC::CondCode CC1, CC2;
14483 bool ShouldInvert;
14484 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
14485
14486 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
14487 SDValue Cmp =
14488 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
14489 if (!Cmp.getNode())
14490 return SDValue();
14491
14492 if (CC2 != AArch64CC::AL) {
14493 SDValue Cmp2 =
14494 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
14495 if (!Cmp2.getNode())
14496 return SDValue();
14497
14498 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
14499 }
14500
14501 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14502
14503 if (ShouldInvert)
14504 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
14505
14506 return Cmp;
14507}
14508
14509static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
14510 SelectionDAG &DAG) {
14511 SDValue VecOp = ScalarOp.getOperand(0);
14512 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
14513 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
14514 DAG.getConstant(0, DL, MVT::i64));
14515}
14516
14517static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
14518 SDLoc DL, SelectionDAG &DAG) {
14519 unsigned ScalarOpcode;
14520 switch (Opcode) {
14521 case ISD::VECREDUCE_AND:
14522 ScalarOpcode = ISD::AND;
14523 break;
14524 case ISD::VECREDUCE_OR:
14525 ScalarOpcode = ISD::OR;
14526 break;
14527 case ISD::VECREDUCE_XOR:
14528 ScalarOpcode = ISD::XOR;
14529 break;
14530 default:
14531 llvm_unreachable("Expected bitwise vector reduction");
14532 return SDValue();
14533 }
14534
14535 EVT VecVT = Vec.getValueType();
14536 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
14537 "Expected power-of-2 length vector");
14538
14539 EVT ElemVT = VecVT.getVectorElementType();
14540
14541 SDValue Result;
14542 unsigned NumElems = VecVT.getVectorNumElements();
14543
14544 // Special case for boolean reductions
14545 if (ElemVT == MVT::i1) {
14546 // Split large vectors into smaller ones
14547 if (NumElems > 16) {
14548 SDValue Lo, Hi;
14549 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14550 EVT HalfVT = Lo.getValueType();
14551 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
14552 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
14553 }
14554
14555 // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
14556 // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
14557 // this element size leads to the best codegen, since e.g. setcc results
14558 // might need to be truncated otherwise.
14559 EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
14560
14561 // any_ext doesn't work with umin/umax, so only use it for uadd.
14562 unsigned ExtendOp =
14563 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
14564 SDValue Extended = DAG.getNode(
14565 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
14566 switch (ScalarOpcode) {
14567 case ISD::AND:
14568 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
14569 break;
14570 case ISD::OR:
14571 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
14572 break;
14573 case ISD::XOR:
14574 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
14575 break;
14576 default:
14577 llvm_unreachable("Unexpected Opcode");
14578 }
14579
14580 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
14581 } else {
14582 // Iteratively split the vector in half and combine using the bitwise
14583 // operation until it fits in a 64 bit register.
14584 while (VecVT.getSizeInBits() > 64) {
14585 SDValue Lo, Hi;
14586 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14587 VecVT = Lo.getValueType();
14588 NumElems = VecVT.getVectorNumElements();
14589 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
14590 }
14591
14592 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
14593
14594 // Do the remaining work on a scalar since it allows the code generator to
14595 // combine the shift and bitwise operation into one instruction and since
14596 // integer instructions can have higher throughput than vector instructions.
14597 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
14598
14599 // Iteratively combine the lower and upper halves of the scalar using the
14600 // bitwise operation, halving the relevant region of the scalar in each
14601 // iteration, until the relevant region is just one element of the original
14602 // vector.
14603 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
14604 SDValue ShiftAmount =
14605 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
14606 SDValue Shifted =
14607 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
14608 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
14609 }
14610
14611 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
14612 }
14613
14614 return DAG.getAnyExtOrTrunc(Result, DL, VT);
14615}
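// [Editorial example, not part of the upstream source] For a v8i8 OR reduction
// the non-boolean path above bitcasts the vector to an i64 and folds halves
// together:
//   x = bitcast v to i64; x |= x >> 32; x |= x >> 16; x |= x >> 8;
// after which the low 8 bits of x hold the OR of all eight lanes.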
14616
14617SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
14618 SelectionDAG &DAG) const {
14619 SDValue Src = Op.getOperand(0);
14620
14621 // Try to lower fixed length reductions to SVE.
14622 EVT SrcVT = Src.getValueType();
14623 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14624 Op.getOpcode() == ISD::VECREDUCE_AND ||
14625 Op.getOpcode() == ISD::VECREDUCE_OR ||
14626 Op.getOpcode() == ISD::VECREDUCE_XOR ||
14627 Op.getOpcode() == ISD::VECREDUCE_FADD ||
14628 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
14629 SrcVT.getVectorElementType() == MVT::i64);
14630 if (SrcVT.isScalableVector() ||
14631 useSVEForFixedLengthVectorVT(
14632 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
14633
14634 if (SrcVT.getVectorElementType() == MVT::i1)
14635 return LowerPredReductionToSVE(Op, DAG);
14636
14637 switch (Op.getOpcode()) {
14638 case ISD::VECREDUCE_ADD:
14639 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
14640 case ISD::VECREDUCE_AND:
14641 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
14642 case ISD::VECREDUCE_OR:
14643 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
14644 case ISD::VECREDUCE_SMAX:
14645 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
14646 case ISD::VECREDUCE_SMIN:
14647 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
14648 case ISD::VECREDUCE_UMAX:
14649 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
14650 case ISD::VECREDUCE_UMIN:
14651 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
14652 case ISD::VECREDUCE_XOR:
14653 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
14654 case ISD::VECREDUCE_FADD:
14655 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
14656 case ISD::VECREDUCE_FMAX:
14657 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
14658 case ISD::VECREDUCE_FMIN:
14659 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
14660 case ISD::VECREDUCE_FMAXIMUM:
14661 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
14662 case ISD::VECREDUCE_FMINIMUM:
14663 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
14664 default:
14665 llvm_unreachable("Unhandled fixed length reduction");
14666 }
14667 }
14668
14669 // Lower NEON reductions.
14670 SDLoc dl(Op);
14671 switch (Op.getOpcode()) {
14672 case ISD::VECREDUCE_AND:
14673 case ISD::VECREDUCE_OR:
14674 case ISD::VECREDUCE_XOR:
14675 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
14676 Op.getValueType(), dl, DAG);
14677 case ISD::VECREDUCE_ADD:
14678 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
14679 case ISD::VECREDUCE_SMAX:
14680 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
14681 case ISD::VECREDUCE_SMIN:
14682 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
14683 case ISD::VECREDUCE_UMAX:
14684 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
14685 case ISD::VECREDUCE_UMIN:
14686 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
14687 default:
14688 llvm_unreachable("Unhandled reduction");
14689 }
14690}
14691
14692SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
14693 SelectionDAG &DAG) const {
14694 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14695 // No point replacing if we don't have the relevant instruction/libcall anyway
14696 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
14697 return SDValue();
14698
14699 // LSE has an atomic load-clear instruction, but not a load-and.
14700 SDLoc dl(Op);
14701 MVT VT = Op.getSimpleValueType();
14702 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
14703 SDValue RHS = Op.getOperand(2);
14704 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
14705 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
14706 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
14707 Op.getOperand(0), Op.getOperand(1), RHS,
14708 AN->getMemOperand());
14709}
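// [Editorial note, not part of the upstream source] For example, an
// "atomicrmw and ptr %p, i32 %m" reaching this hook becomes
// ATOMIC_LOAD_CLR(%p, %m ^ -1), which maps onto the LSE LDCLR (atomic
// bit-clear) instruction; LSE provides LDCLR but no load-and instruction.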
14710
14711SDValue
14712AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
14713 SelectionDAG &DAG) const {
14714
14715 SDLoc dl(Op);
14716 // Get the inputs.
14717 SDNode *Node = Op.getNode();
14718 SDValue Chain = Op.getOperand(0);
14719 SDValue Size = Op.getOperand(1);
14720 MaybeAlign Align =
14721 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14722 EVT VT = Node->getValueType(0);
14723
14724 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
14725 "no-stack-arg-probe")) {
14726 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14727 Chain = SP.getValue(1);
14728 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14729 if (Align)
14730 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14731 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14732 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14733 SDValue Ops[2] = {SP, Chain};
14734 return DAG.getMergeValues(Ops, dl);
14735 }
14736
14737 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
14738
14739 EVT PtrVT = getPointerTy(DAG.getDataLayout());
14740 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
14741 PtrVT, 0);
14742
14743 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
14744 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
14745 if (Subtarget->hasCustomCallingConv())
14746 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
14747
14748 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
14749 DAG.getConstant(4, dl, MVT::i64));
14750 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
14751 Chain =
14752 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
14753 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
14754 DAG.getRegisterMask(Mask), Chain.getValue(1));
14755 // To match the actual intent better, we should read the output from X15 here
14756 // again (instead of potentially spilling it to the stack), but rereading Size
14757 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
14758 // here.
14759
14760 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
14761 DAG.getConstant(4, dl, MVT::i64));
14762
14763 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14764 Chain = SP.getValue(1);
14765 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14766 if (Align)
14767 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14768 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14769 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14770
14771 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
14772
14773 SDValue Ops[2] = {SP, Chain};
14774 return DAG.getMergeValues(Ops, dl);
14775}
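// [Editorial note, not part of the upstream source] The SRL/SHL by 4 around
// the stack-probe call reflect the Windows AArch64 convention that the
// requested size is passed in X15 in units of 16 bytes; e.g. a 4096-byte
// dynamic alloca passes 256 in X15, and SP is then dropped by 256 * 16 bytes.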
14776
14777SDValue
14778AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
14779 SelectionDAG &DAG) const {
14780 // Get the inputs.
14781 SDNode *Node = Op.getNode();
14782 SDValue Chain = Op.getOperand(0);
14783 SDValue Size = Op.getOperand(1);
14784
14785 MaybeAlign Align =
14786 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14787 SDLoc dl(Op);
14788 EVT VT = Node->getValueType(0);
14789
14790 // Construct the new SP value in a GPR.
14791 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14792 Chain = SP.getValue(1);
14793 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14794 if (Align)
14795 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14796 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14797
14798 // Set the real SP to the new value with a probing loop.
14799 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
14800 SDValue Ops[2] = {SP, Chain};
14801 return DAG.getMergeValues(Ops, dl);
14802}
14803
14804SDValue
14805AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
14806 SelectionDAG &DAG) const {
14807 MachineFunction &MF = DAG.getMachineFunction();
14808
14809 if (Subtarget->isTargetWindows())
14810 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
14811 else if (hasInlineStackProbe(MF))
14812 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
14813 else
14814 return SDValue();
14815}
14816
14817// When x and y are extended, lower:
14818// avgfloor(x, y) -> (x + y) >> 1
14819// avgceil(x, y) -> (x + y + 1) >> 1
14820
14821// Otherwise, lower to:
14822// avgfloor(x, y) -> (x >> 1) + (y >> 1) + (x & y & 1)
14823// avgceil(x, y) -> (x >> 1) + (y >> 1) + ((x || y) & 1)
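// [Editorial example, not part of the upstream source] With x = 7 and y = 4:
// the extended forms give avgfloor = (7 + 4) >> 1 = 5 and
// avgceil = (7 + 4 + 1) >> 1 = 6, while the overflow-safe forms give
// avgfloor = (7 >> 1) + (4 >> 1) + (7 & 4 & 1) = 3 + 2 + 0 = 5 and
// avgceil = (7 >> 1) + (4 >> 1) + ((7 | 4) & 1) = 3 + 2 + 1 = 6.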
14824SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
14825 unsigned NewOp) const {
14826 if (Subtarget->hasSVE2())
14827 return LowerToPredicatedOp(Op, DAG, NewOp);
14828
14829 SDLoc dl(Op);
14830 SDValue OpA = Op->getOperand(0);
14831 SDValue OpB = Op->getOperand(1);
14832 EVT VT = Op.getValueType();
14833 bool IsCeil =
14834 (Op->getOpcode() == ISD::AVGCEILS || Op->getOpcode() == ISD::AVGCEILU);
14835 bool IsSigned =
14836 (Op->getOpcode() == ISD::AVGFLOORS || Op->getOpcode() == ISD::AVGCEILS);
14837 unsigned ShiftOpc = IsSigned ? ISD::SRA : ISD::SRL;
14838
14839 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
14840
14841 auto IsZeroExtended = [&DAG](SDValue &Node) {
14842 KnownBits Known = DAG.computeKnownBits(Node, 0);
14843 return Known.Zero.isSignBitSet();
14844 };
14845
14846 auto IsSignExtended = [&DAG](SDValue &Node) {
14847 return (DAG.ComputeNumSignBits(Node, 0) > 1);
14848 };
14849
14850 SDValue ConstantOne = DAG.getConstant(1, dl, VT);
14851 if ((!IsSigned && IsZeroExtended(OpA) && IsZeroExtended(OpB)) ||
14852 (IsSigned && IsSignExtended(OpA) && IsSignExtended(OpB))) {
14853 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, OpA, OpB);
14854 if (IsCeil)
14855 Add = DAG.getNode(ISD::ADD, dl, VT, Add, ConstantOne);
14856 return DAG.getNode(ShiftOpc, dl, VT, Add, ConstantOne);
14857 }
14858
14859 SDValue ShiftOpA = DAG.getNode(ShiftOpc, dl, VT, OpA, ConstantOne);
14860 SDValue ShiftOpB = DAG.getNode(ShiftOpc, dl, VT, OpB, ConstantOne);
14861
14862 SDValue tmp = DAG.getNode(IsCeil ? ISD::OR : ISD::AND, dl, VT, OpA, OpB);
14863 tmp = DAG.getNode(ISD::AND, dl, VT, tmp, ConstantOne);
14864 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, ShiftOpA, ShiftOpB);
14865 return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
14866}
14867
14868SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
14869 SelectionDAG &DAG) const {
14870 EVT VT = Op.getValueType();
14871 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
14872
14873 SDLoc DL(Op);
14874 APInt MulImm = Op.getConstantOperandAPInt(0);
14875 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
14876 VT);
14877}
14878
14879/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
14880template <unsigned NumVecs>
14881static bool
14882setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
14883 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
14884 Info.opc = ISD::INTRINSIC_VOID;
14885 // Retrieve EC from first vector argument.
14886 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
14887 ElementCount EC = VT.getVectorElementCount();
14888#ifndef NDEBUG
14889 // Check the assumption that all input vectors are the same type.
14890 for (unsigned I = 0; I < NumVecs; ++I)
14891 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
14892 "Invalid type.");
14893#endif
14894 // memVT is `NumVecs * VT`.
14895 Info.memVT = EVT::getVectorVT(*CI.getContext(), VT.getVectorElementType(),
14896 EC * NumVecs);
14897 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
14898 Info.offset = 0;
14899 Info.align.reset();
14900 Info.flags = MachineMemOperand::MOStore;
14901 return true;
14902}
14903
14904/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
14905/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
14906/// specified in the intrinsic calls.
14907bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
14908 const CallInst &I,
14909 MachineFunction &MF,
14910 unsigned Intrinsic) const {
14911 auto &DL = I.getModule()->getDataLayout();
14912 switch (Intrinsic) {
14913 case Intrinsic::aarch64_sve_st2:
14914 return setInfoSVEStN<2>(*this, DL, Info, I);
14915 case Intrinsic::aarch64_sve_st3:
14916 return setInfoSVEStN<3>(*this, DL, Info, I);
14917 case Intrinsic::aarch64_sve_st4:
14918 return setInfoSVEStN<4>(*this, DL, Info, I);
14919 case Intrinsic::aarch64_neon_ld2:
14920 case Intrinsic::aarch64_neon_ld3:
14921 case Intrinsic::aarch64_neon_ld4:
14922 case Intrinsic::aarch64_neon_ld1x2:
14923 case Intrinsic::aarch64_neon_ld1x3:
14924 case Intrinsic::aarch64_neon_ld1x4: {
14925 Info.opc = ISD::INTRINSIC_W_CHAIN;
14926 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
14927 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14928 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14929 Info.offset = 0;
14930 Info.align.reset();
14931 // volatile loads with NEON intrinsics not supported
14932 Info.flags = MachineMemOperand::MOLoad;
14933 return true;
14934 }
14935 case Intrinsic::aarch64_neon_ld2lane:
14936 case Intrinsic::aarch64_neon_ld3lane:
14937 case Intrinsic::aarch64_neon_ld4lane:
14938 case Intrinsic::aarch64_neon_ld2r:
14939 case Intrinsic::aarch64_neon_ld3r:
14940 case Intrinsic::aarch64_neon_ld4r: {
14941 Info.opc = ISD::INTRINSIC_W_CHAIN;
14942 // These intrinsics return a struct of vectors that all have the same type.
14943 Type *RetTy = I.getType();
14944 auto *StructTy = cast<StructType>(RetTy);
14945 unsigned NumElts = StructTy->getNumElements();
14946 Type *VecTy = StructTy->getElementType(0);
14947 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
14948 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
14949 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14950 Info.offset = 0;
14951 Info.align.reset();
14952 // volatile loads with NEON intrinsics not supported
14953 Info.flags = MachineMemOperand::MOLoad;
14954 return true;
14955 }
14956 case Intrinsic::aarch64_neon_st2:
14957 case Intrinsic::aarch64_neon_st3:
14958 case Intrinsic::aarch64_neon_st4:
14959 case Intrinsic::aarch64_neon_st1x2:
14960 case Intrinsic::aarch64_neon_st1x3:
14961 case Intrinsic::aarch64_neon_st1x4: {
14962 Info.opc = ISD::INTRINSIC_VOID;
14963 unsigned NumElts = 0;
14964 for (const Value *Arg : I.args()) {
14965 Type *ArgTy = Arg->getType();
14966 if (!ArgTy->isVectorTy())
14967 break;
14968 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
14969 }
14970 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14971 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14972 Info.offset = 0;
14973 Info.align.reset();
14974 // volatile stores with NEON intrinsics not supported
14975 Info.flags = MachineMemOperand::MOStore;
14976 return true;
14977 }
14978 case Intrinsic::aarch64_neon_st2lane:
14979 case Intrinsic::aarch64_neon_st3lane:
14980 case Intrinsic::aarch64_neon_st4lane: {
14981 Info.opc = ISD::INTRINSIC_VOID;
14982 unsigned NumElts = 0;
14983 // all the vector type is same
14984 Type *VecTy = I.getArgOperand(0)->getType();
14985 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
14986
14987 for (const Value *Arg : I.args()) {
14988 Type *ArgTy = Arg->getType();
14989 if (!ArgTy->isVectorTy())
14990 break;
14991 NumElts += 1;
14992 }
14993
14994 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
14995 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14996 Info.offset = 0;
14997 Info.align.reset();
14998 // volatile stores with NEON intrinsics not supported
14999 Info.flags = MachineMemOperand::MOStore;
15000 return true;
15001 }
15002 case Intrinsic::aarch64_ldaxr:
15003 case Intrinsic::aarch64_ldxr: {
15004 Type *ValTy = I.getParamElementType(0);
15005 Info.opc = ISD::INTRINSIC_W_CHAIN;
15006 Info.memVT = MVT::getVT(ValTy);
15007 Info.ptrVal = I.getArgOperand(0);
15008 Info.offset = 0;
15009 Info.align = DL.getABITypeAlign(ValTy);
15010 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
15011 return true;
15012 }
15013 case Intrinsic::aarch64_stlxr:
15014 case Intrinsic::aarch64_stxr: {
15015 Type *ValTy = I.getParamElementType(1);
15016 Info.opc = ISD::INTRINSIC_W_CHAIN;
15017 Info.memVT = MVT::getVT(ValTy);
15018 Info.ptrVal = I.getArgOperand(1);
15019 Info.offset = 0;
15020 Info.align = DL.getABITypeAlign(ValTy);
15021 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
15022 return true;
15023 }
15024 case Intrinsic::aarch64_ldaxp:
15025 case Intrinsic::aarch64_ldxp:
15026 Info.opc = ISD::INTRINSIC_W_CHAIN;
15027 Info.memVT = MVT::i128;
15028 Info.ptrVal = I.getArgOperand(0);
15029 Info.offset = 0;
15030 Info.align = Align(16);
15031 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
15032 return true;
15033 case Intrinsic::aarch64_stlxp:
15034 case Intrinsic::aarch64_stxp:
15035 Info.opc = ISD::INTRINSIC_W_CHAIN;
15036 Info.memVT = MVT::i128;
15037 Info.ptrVal = I.getArgOperand(2);
15038 Info.offset = 0;
15039 Info.align = Align(16);
15040 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
15041 return true;
15042 case Intrinsic::aarch64_sve_ldnt1: {
15043 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
15044 Info.opc = ISD::INTRINSIC_W_CHAIN;
15045 Info.memVT = MVT::getVT(I.getType());
15046 Info.ptrVal = I.getArgOperand(1);
15047 Info.offset = 0;
15048 Info.align = DL.getABITypeAlign(ElTy);
15049 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
15050 return true;
15051 }
15052 case Intrinsic::aarch64_sve_stnt1: {
15053 Type *ElTy =
15054 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
15055 Info.opc = ISD::INTRINSIC_VOID;
15056 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
15057 Info.ptrVal = I.getArgOperand(2);
15058 Info.offset = 0;
15059 Info.align = DL.getABITypeAlign(ElTy);
15060 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
15061 return true;
15062 }
15063 case Intrinsic::aarch64_mops_memset_tag: {
15064 Value *Dst = I.getArgOperand(0);
15065 Value *Val = I.getArgOperand(1);
15066 Info.opc = ISD::INTRINSIC_W_CHAIN;
15067 Info.memVT = MVT::getVT(Val->getType());
15068 Info.ptrVal = Dst;
15069 Info.offset = 0;
15070 Info.align = I.getParamAlign(0).valueOrOne();
15071 Info.flags = MachineMemOperand::MOStore;
15072 // The size of the memory being operated on is unknown at this point
15073 Info.size = MemoryLocation::UnknownSize;
15074 return true;
15075 }
15076 default:
15077 break;
15078 }
15079
15080 return false;
15081}
15082
15083bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
15084 ISD::LoadExtType ExtTy,
15085 EVT NewVT) const {
15086 // TODO: This may be worth removing. Check regression tests for diffs.
15087 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
15088 return false;
15089
15090 // If we're reducing the load width in order to avoid having to use an extra
15091 // instruction to do extension then it's probably a good idea.
15092 if (ExtTy != ISD::NON_EXTLOAD)
15093 return true;
15094 // Don't reduce load width if it would prevent us from combining a shift into
15095 // the offset.
15096 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
15097 assert(Mem);
15098 const SDValue &Base = Mem->getBasePtr();
15099 if (Base.getOpcode() == ISD::ADD &&
15100 Base.getOperand(1).getOpcode() == ISD::SHL &&
15101 Base.getOperand(1).hasOneUse() &&
15102 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
15103 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
15104 if (Mem->getMemoryVT().isScalableVector())
15105 return false;
15106 // The shift can be combined if it matches the size of the value being
15107 // loaded (and so reducing the width would make it not match).
15108 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
15109 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
15110 if (ShiftAmount == Log2_32(LoadBytes))
15111 return false;
15112 }
15113 // We have no reason to disallow reducing the load width, so allow it.
15114 return true;
15115}
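// [Editorial example, not part of the upstream source] For an i64 load of
// (add base, (shl idx, 3)) the shift amount 3 equals log2 of the 8-byte access
// size, so it folds into the addressing mode as ldr xN, [base, idx, lsl #3];
// narrowing the load would break that fold, so the width reduction is refused.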
15116
15117// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
15118bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
15119 EVT VT = Extend.getValueType();
15120 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
15121 SDValue Extract = Extend.getOperand(0);
15122 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
15123 Extract = Extract.getOperand(0);
15124 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
15125 EVT VecVT = Extract.getOperand(0).getValueType();
15126 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
15127 return false;
15128 }
15129 }
15130 return true;
15131}
15132
15133// Truncations from a 64-bit GPR to a 32-bit GPR are free.
15134bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
15135 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15136 return false;
15137 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
15138 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
15139 return NumBits1 > NumBits2;
15140}
15141bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
15142 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15143 return false;
15144 uint64_t NumBits1 = VT1.getFixedSizeInBits();
15145 uint64_t NumBits2 = VT2.getFixedSizeInBits();
15146 return NumBits1 > NumBits2;
15147}
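// For illustration: a truncation such as %t = trunc i64 %x to i32 needs no
// instruction, since using the W sub-register of an X register already yields
// the low 32 bits.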
15148
15149/// Check if it is profitable to hoist an instruction from a then/else block
15150/// into the if block. It is not profitable if I and its user can form an FMA
15151/// instruction, because we prefer FMSUB/FMADD.
15152bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
15153 if (I->getOpcode() != Instruction::FMul)
15154 return true;
15155
15156 if (!I->hasOneUse())
15157 return true;
15158
15159 Instruction *User = I->user_back();
15160
15161 if (!(User->getOpcode() == Instruction::FSub ||
15162 User->getOpcode() == Instruction::FAdd))
15163 return true;
15164
 15165 const TargetOptions &Options = getTargetMachine().Options;
15166 const Function *F = I->getFunction();
15167 const DataLayout &DL = F->getParent()->getDataLayout();
15168 Type *Ty = User->getOperand(0)->getType();
15169
15170 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
 15171 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
15172 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
15173 Options.UnsafeFPMath));
15174}
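// For illustration: with fast FP fusion enabled, a pair like
//   %m = fmul double %a, %b
//   %s = fsub double %c, %m
// is expected to become a single fmsub, so hoisting the fmul away from its
// fsub user is reported as not profitable.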
15175
15176// All 32-bit GPR operations implicitly zero the high-half of the corresponding
15177// 64-bit GPR.
15178bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
15179 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15180 return false;
15181 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
15182 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
15183 return NumBits1 == 32 && NumBits2 == 64;
15184}
15185bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
15186 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15187 return false;
15188 unsigned NumBits1 = VT1.getSizeInBits();
15189 unsigned NumBits2 = VT2.getSizeInBits();
15190 return NumBits1 == 32 && NumBits2 == 64;
15191}
15192
15193bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
15194 EVT VT1 = Val.getValueType();
15195 if (isZExtFree(VT1, VT2)) {
15196 return true;
15197 }
15198
15199 if (Val.getOpcode() != ISD::LOAD)
15200 return false;
15201
15202 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
15203 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
15204 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
15205 VT1.getSizeInBits() <= 32);
15206}
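// For illustration: in
//   %v = load i32, ptr %p
//   %e = zext i32 %v to i64
// the load lowers to a single LDR Wn, which implicitly clears the upper 32
// bits of Xn, so the zext is free.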
15207
15208bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
15209 if (isa<FPExtInst>(Ext))
15210 return false;
15211
15212 // Vector types are not free.
15213 if (Ext->getType()->isVectorTy())
15214 return false;
15215
15216 for (const Use &U : Ext->uses()) {
15217 // The extension is free if we can fold it with a left shift in an
15218 // addressing mode or an arithmetic operation: add, sub, and cmp.
15219
15220 // Is there a shift?
15221 const Instruction *Instr = cast<Instruction>(U.getUser());
15222
15223 // Is this a constant shift?
15224 switch (Instr->getOpcode()) {
15225 case Instruction::Shl:
15226 if (!isa<ConstantInt>(Instr->getOperand(1)))
15227 return false;
15228 break;
15229 case Instruction::GetElementPtr: {
15230 gep_type_iterator GTI = gep_type_begin(Instr);
15231 auto &DL = Ext->getModule()->getDataLayout();
15232 std::advance(GTI, U.getOperandNo()-1);
15233 Type *IdxTy = GTI.getIndexedType();
15234 // This extension will end up with a shift because of the scaling factor.
15235 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
15236 // Get the shift amount based on the scaling factor:
15237 // log2(sizeof(IdxTy)) - log2(8).
15238 if (IdxTy->isScalableTy())
15239 return false;
15240 uint64_t ShiftAmt =
15241 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
15242 3;
15243 // Is the constant foldable in the shift of the addressing mode?
15244 // I.e., shift amount is between 1 and 4 inclusive.
15245 if (ShiftAmt == 0 || ShiftAmt > 4)
15246 return false;
15247 break;
15248 }
15249 case Instruction::Trunc:
15250 // Check if this is a noop.
15251 // trunc(sext ty1 to ty2) to ty1.
15252 if (Instr->getType() == Ext->getOperand(0)->getType())
15253 continue;
15254 [[fallthrough]];
15255 default:
15256 return false;
15257 }
15258
15259 // At this point we can use the bfm family, so this extension is free
15260 // for that use.
15261 }
15262 return true;
15263}
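// For illustration: in
//   %idx = sext i32 %i to i64
//   %gep = getelementptr i32, ptr %base, i64 %idx
// the sext can be folded into a scaled, extending addressing mode such as
// ldr w0, [x1, w2, sxtw #2], so the extension is considered free.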
15264
15265static bool isSplatShuffle(Value *V) {
15266 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
15267 return all_equal(Shuf->getShuffleMask());
15268 return false;
15269}
15270
15271/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
15272/// or upper half of the vector elements.
15273static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
15274 bool AllowSplat = false) {
15275 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
15276 auto *FullTy = FullV->getType();
15277 auto *HalfTy = HalfV->getType();
15278 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
15279 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
15280 };
15281
15282 auto extractHalf = [](Value *FullV, Value *HalfV) {
15283 auto *FullVT = cast<FixedVectorType>(FullV->getType());
15284 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
15285 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
15286 };
15287
15288 ArrayRef<int> M1, M2;
15289 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
15290 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
15291 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
15292 return false;
15293
 15294 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
 15295 // it is not checked as an extract below.
15296 if (AllowSplat && isSplatShuffle(Op1))
15297 S1Op1 = nullptr;
15298 if (AllowSplat && isSplatShuffle(Op2))
15299 S2Op1 = nullptr;
15300
15301 // Check that the operands are half as wide as the result and we extract
15302 // half of the elements of the input vectors.
15303 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
15304 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
15305 return false;
15306
15307 // Check the mask extracts either the lower or upper half of vector
15308 // elements.
15309 int M1Start = 0;
15310 int M2Start = 0;
15311 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
15312 if ((S1Op1 &&
15313 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
15314 (S2Op1 &&
15315 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
15316 return false;
15317
15318 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
15319 (M2Start != 0 && M2Start != (NumElements / 2)))
15320 return false;
15321 if (S1Op1 && S2Op1 && M1Start != M2Start)
15322 return false;
15323
15324 return true;
15325}
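// For illustration: a pair like
//   %lo = shuffle %v, undef, <0, 1, 2, 3>  ; lower half of <8 x i16> %v
//   %hi = shuffle %v, undef, <4, 5, 6, 7>  ; upper half
// is accepted here; sinking such shuffles next to a widening multiply lets
// instruction selection form smull/smull2-style instructions.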
15326
15327/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
15328/// of the vector elements.
15329static bool areExtractExts(Value *Ext1, Value *Ext2) {
15330 auto areExtDoubled = [](Instruction *Ext) {
15331 return Ext->getType()->getScalarSizeInBits() ==
15332 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
15333 };
15334
15335 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
15336 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
15337 !areExtDoubled(cast<Instruction>(Ext1)) ||
15338 !areExtDoubled(cast<Instruction>(Ext2)))
15339 return false;
15340
15341 return true;
15342}
15343
15344/// Check if Op could be used with vmull_high_p64 intrinsic.
15345static bool isOperandOfVmullHighP64(Value *Op) {
15346 Value *VectorOperand = nullptr;
15347 ConstantInt *ElementIndex = nullptr;
15348 return match(Op, m_ExtractElt(m_Value(VectorOperand),
15349 m_ConstantInt(ElementIndex))) &&
15350 ElementIndex->getValue() == 1 &&
15351 isa<FixedVectorType>(VectorOperand->getType()) &&
15352 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
15353}
15354
15355/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
15356static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
 15357 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
15358}
15359
15360static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
15361 // Restrict ourselves to the form CodeGenPrepare typically constructs.
15362 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
15363 if (!GEP || GEP->getNumOperands() != 2)
15364 return false;
15365
15366 Value *Base = GEP->getOperand(0);
15367 Value *Offsets = GEP->getOperand(1);
15368
15369 // We only care about scalar_base+vector_offsets.
15370 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
15371 return false;
15372
15373 // Sink extends that would allow us to use 32-bit offset vectors.
15374 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
15375 auto *OffsetsInst = cast<Instruction>(Offsets);
15376 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
15377 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
15378 Ops.push_back(&GEP->getOperandUse(1));
15379 }
15380
15381 // Sink the GEP.
15382 return true;
15383}
15384
15385/// We want to sink the following cases:
15386/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
15387static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
15388 if (match(Op, m_VScale()))
15389 return true;
15390 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
 15391 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
15392 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
15393 return true;
15394 }
15395 return false;
15396}
15397
15398/// Check if sinking \p I's operands to I's basic block is profitable, because
15399/// the operands can be folded into a target instruction, e.g.
15400/// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2).
15401bool AArch64TargetLowering::shouldSinkOperands(
15402 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
15403 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
15404 switch (II->getIntrinsicID()) {
15405 case Intrinsic::aarch64_neon_smull:
15406 case Intrinsic::aarch64_neon_umull:
15407 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
15408 /*AllowSplat=*/true)) {
15409 Ops.push_back(&II->getOperandUse(0));
15410 Ops.push_back(&II->getOperandUse(1));
15411 return true;
15412 }
15413 [[fallthrough]];
15414
15415 case Intrinsic::fma:
15416 if (isa<VectorType>(I->getType()) &&
15417 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
15418 !Subtarget->hasFullFP16())
15419 return false;
15420 [[fallthrough]];
15421 case Intrinsic::aarch64_neon_sqdmull:
15422 case Intrinsic::aarch64_neon_sqdmulh:
15423 case Intrinsic::aarch64_neon_sqrdmulh:
15424 // Sink splats for index lane variants
15425 if (isSplatShuffle(II->getOperand(0)))
15426 Ops.push_back(&II->getOperandUse(0));
15427 if (isSplatShuffle(II->getOperand(1)))
15428 Ops.push_back(&II->getOperandUse(1));
15429 return !Ops.empty();
15430 case Intrinsic::aarch64_neon_fmlal:
15431 case Intrinsic::aarch64_neon_fmlal2:
15432 case Intrinsic::aarch64_neon_fmlsl:
15433 case Intrinsic::aarch64_neon_fmlsl2:
15434 // Sink splats for index lane variants
15435 if (isSplatShuffle(II->getOperand(1)))
15436 Ops.push_back(&II->getOperandUse(1));
15437 if (isSplatShuffle(II->getOperand(2)))
15438 Ops.push_back(&II->getOperandUse(2));
15439 return !Ops.empty();
15440 case Intrinsic::aarch64_sve_ptest_first:
15441 case Intrinsic::aarch64_sve_ptest_last:
15442 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
15443 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
15444 Ops.push_back(&II->getOperandUse(0));
15445 return !Ops.empty();
15446 case Intrinsic::aarch64_sme_write_horiz:
15447 case Intrinsic::aarch64_sme_write_vert:
15448 case Intrinsic::aarch64_sme_writeq_horiz:
15449 case Intrinsic::aarch64_sme_writeq_vert: {
15450 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
15451 if (!Idx || Idx->getOpcode() != Instruction::Add)
15452 return false;
15453 Ops.push_back(&II->getOperandUse(1));
15454 return true;
15455 }
15456 case Intrinsic::aarch64_sme_read_horiz:
15457 case Intrinsic::aarch64_sme_read_vert:
15458 case Intrinsic::aarch64_sme_readq_horiz:
15459 case Intrinsic::aarch64_sme_readq_vert:
15460 case Intrinsic::aarch64_sme_ld1b_vert:
15461 case Intrinsic::aarch64_sme_ld1h_vert:
15462 case Intrinsic::aarch64_sme_ld1w_vert:
15463 case Intrinsic::aarch64_sme_ld1d_vert:
15464 case Intrinsic::aarch64_sme_ld1q_vert:
15465 case Intrinsic::aarch64_sme_st1b_vert:
15466 case Intrinsic::aarch64_sme_st1h_vert:
15467 case Intrinsic::aarch64_sme_st1w_vert:
15468 case Intrinsic::aarch64_sme_st1d_vert:
15469 case Intrinsic::aarch64_sme_st1q_vert:
15470 case Intrinsic::aarch64_sme_ld1b_horiz:
15471 case Intrinsic::aarch64_sme_ld1h_horiz:
15472 case Intrinsic::aarch64_sme_ld1w_horiz:
15473 case Intrinsic::aarch64_sme_ld1d_horiz:
15474 case Intrinsic::aarch64_sme_ld1q_horiz:
15475 case Intrinsic::aarch64_sme_st1b_horiz:
15476 case Intrinsic::aarch64_sme_st1h_horiz:
15477 case Intrinsic::aarch64_sme_st1w_horiz:
15478 case Intrinsic::aarch64_sme_st1d_horiz:
15479 case Intrinsic::aarch64_sme_st1q_horiz: {
15480 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
15481 if (!Idx || Idx->getOpcode() != Instruction::Add)
15482 return false;
15483 Ops.push_back(&II->getOperandUse(3));
15484 return true;
15485 }
15486 case Intrinsic::aarch64_neon_pmull:
15487 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
15488 return false;
15489 Ops.push_back(&II->getOperandUse(0));
15490 Ops.push_back(&II->getOperandUse(1));
15491 return true;
15492 case Intrinsic::aarch64_neon_pmull64:
15493 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
15494 II->getArgOperand(1)))
15495 return false;
15496 Ops.push_back(&II->getArgOperandUse(0));
15497 Ops.push_back(&II->getArgOperandUse(1));
15498 return true;
15499 case Intrinsic::masked_gather:
15500 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
15501 return false;
15502 Ops.push_back(&II->getArgOperandUse(0));
15503 return true;
15504 case Intrinsic::masked_scatter:
15505 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
15506 return false;
15507 Ops.push_back(&II->getArgOperandUse(1));
15508 return true;
15509 default:
15510 return false;
15511 }
15512 }
15513
15514 // Sink vscales closer to uses for better isel
15515 switch (I->getOpcode()) {
15516 case Instruction::GetElementPtr:
15517 case Instruction::Add:
15518 case Instruction::Sub:
15519 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
15520 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
15521 Ops.push_back(&I->getOperandUse(Op));
15522 return true;
15523 }
15524 }
15525 break;
15526 default:
15527 break;
15528 }
15529
15530 if (!I->getType()->isVectorTy())
15531 return false;
15532
15533 switch (I->getOpcode()) {
15534 case Instruction::Sub:
15535 case Instruction::Add: {
15536 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
15537 return false;
15538
15539 // If the exts' operands extract either the lower or upper elements, we
15540 // can sink them too.
15541 auto Ext1 = cast<Instruction>(I->getOperand(0));
15542 auto Ext2 = cast<Instruction>(I->getOperand(1));
15543 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
15544 Ops.push_back(&Ext1->getOperandUse(0));
15545 Ops.push_back(&Ext2->getOperandUse(0));
15546 }
15547
15548 Ops.push_back(&I->getOperandUse(0));
15549 Ops.push_back(&I->getOperandUse(1));
15550
15551 return true;
15552 }
15553 case Instruction::Or: {
15554 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
15555 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
15556 if (Subtarget->hasNEON()) {
15557 Instruction *OtherAnd, *IA, *IB;
15558 Value *MaskValue;
 15559 // MainAnd refers to the And instruction that has 'Not' as one of its operands.
15560 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
15561 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
15562 m_Instruction(IA)))))) {
15563 if (match(OtherAnd,
15564 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
15565 Instruction *MainAnd = I->getOperand(0) == OtherAnd
15566 ? cast<Instruction>(I->getOperand(1))
15567 : cast<Instruction>(I->getOperand(0));
15568
15569 // Both Ands should be in same basic block as Or
15570 if (I->getParent() != MainAnd->getParent() ||
15571 I->getParent() != OtherAnd->getParent())
15572 return false;
15573
15574 // Non-mask operands of both Ands should also be in same basic block
15575 if (I->getParent() != IA->getParent() ||
15576 I->getParent() != IB->getParent())
15577 return false;
15578
15579 Ops.push_back(&MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
15580 Ops.push_back(&I->getOperandUse(0));
15581 Ops.push_back(&I->getOperandUse(1));
15582
15583 return true;
15584 }
15585 }
15586 }
15587
15588 return false;
15589 }
15590 case Instruction::Mul: {
15591 int NumZExts = 0, NumSExts = 0;
15592 for (auto &Op : I->operands()) {
15593 // Make sure we are not already sinking this operand
15594 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
15595 continue;
15596
15597 if (match(&Op, m_SExt(m_Value()))) {
15598 NumSExts++;
15599 continue;
15600 } else if (match(&Op, m_ZExt(m_Value()))) {
15601 NumZExts++;
15602 continue;
15603 }
15604
15605 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
15606
15607 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
15608 // operand and the s/zext can help create indexed s/umull. This is
15609 // especially useful to prevent i64 mul being scalarized.
15610 if (Shuffle && isSplatShuffle(Shuffle) &&
15611 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
15612 Ops.push_back(&Shuffle->getOperandUse(0));
15613 Ops.push_back(&Op);
15614 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
15615 NumSExts++;
15616 else
15617 NumZExts++;
15618 continue;
15619 }
15620
15621 if (!Shuffle)
15622 continue;
15623
15624 Value *ShuffleOperand = Shuffle->getOperand(0);
15625 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
15626 if (!Insert)
15627 continue;
15628
15629 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
15630 if (!OperandInstr)
15631 continue;
15632
15633 ConstantInt *ElementConstant =
15634 dyn_cast<ConstantInt>(Insert->getOperand(2));
15635 // Check that the insertelement is inserting into element 0
15636 if (!ElementConstant || !ElementConstant->isZero())
15637 continue;
15638
15639 unsigned Opcode = OperandInstr->getOpcode();
15640 if (Opcode == Instruction::SExt)
15641 NumSExts++;
15642 else if (Opcode == Instruction::ZExt)
15643 NumZExts++;
15644 else {
15645 // If we find that the top bits are known 0, then we can sink and allow
15646 // the backend to generate a umull.
15647 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
15648 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
15649 const DataLayout &DL = I->getFunction()->getParent()->getDataLayout();
15650 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
15651 continue;
15652 NumZExts++;
15653 }
15654
15655 Ops.push_back(&Shuffle->getOperandUse(0));
15656 Ops.push_back(&Op);
15657 }
15658
 15659 // It is profitable to sink only if we found two extends of the same type.
15660 return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
15661 }
15662 default:
15663 return false;
15664 }
15665 return false;
15666}
15667
15668static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy,
15669 bool IsLittleEndian) {
15670 Value *Op = ZExt->getOperand(0);
15671 auto *SrcTy = cast<FixedVectorType>(Op->getType());
15672 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15673 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15674 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
15675 return false;
15676
15677 assert(DstWidth % SrcWidth == 0 &&
15678 "TBL lowering is not supported for a ZExt instruction with this "
15679 "source & destination element type.");
15680 unsigned ZExtFactor = DstWidth / SrcWidth;
15681 unsigned NumElts = SrcTy->getNumElements();
15682 IRBuilder<> Builder(ZExt);
15683 SmallVector<int> Mask;
15684 // Create a mask that selects <0,...,Op[i]> for each lane of the destination
15685 // vector to replace the original ZExt. This can later be lowered to a set of
15686 // tbl instructions.
15687 for (unsigned i = 0; i < NumElts * ZExtFactor; i++) {
15688 if (IsLittleEndian) {
15689 if (i % ZExtFactor == 0)
15690 Mask.push_back(i / ZExtFactor);
15691 else
15692 Mask.push_back(NumElts);
15693 } else {
15694 if ((i + 1) % ZExtFactor == 0)
15695 Mask.push_back((i - ZExtFactor + 1) / ZExtFactor);
15696 else
15697 Mask.push_back(NumElts);
15698 }
15699 }
15700
15701 auto *FirstEltZero = Builder.CreateInsertElement(
15702 PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
15703 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
15704 Result = Builder.CreateBitCast(Result, DstTy);
15705 if (DstTy != ZExt->getType())
15706 Result = Builder.CreateZExt(Result, ZExt->getType());
15707 ZExt->replaceAllUsesWith(Result);
15708 ZExt->eraseFromParent();
15709 return true;
15710}
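// For illustration: for %z = zext <8 x i8> %x to <8 x i32>, ZExtFactor is 4
// and the little-endian mask built above is
//   <0, 8, 8, 8, 1, 8, 8, 8, ..., 7, 8, 8, 8>
// where index 8 selects the zeroed first lane of FirstEltZero, so each source
// byte is followed by three zero bytes before the final bitcast to <8 x i32>.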
15711
15712static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
15713 IRBuilder<> Builder(TI);
 15714 SmallVector<Value *> Parts;
15715 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
15716 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
15717 auto *DstTy = cast<FixedVectorType>(TI->getType());
15718 assert(SrcTy->getElementType()->isIntegerTy() &&
15719 "Non-integer type source vector element is not supported");
15720 assert(DstTy->getElementType()->isIntegerTy(8) &&
15721 "Unsupported destination vector element type");
15722 unsigned SrcElemTySz =
15723 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15724 unsigned DstElemTySz =
15725 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15726 assert((SrcElemTySz % DstElemTySz == 0) &&
15727 "Cannot lower truncate to tbl instructions for a source element size "
15728 "that is not divisible by the destination element size");
15729 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
15730 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
15731 "Unsupported source vector element type size");
15732 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
15733
15734 // Create a mask to choose every nth byte from the source vector table of
15735 // bytes to create the truncated destination vector, where 'n' is the truncate
15736 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
15737 // 0,8,16,..Y*8th bytes for the little-endian format
 15738 SmallVector<Constant *, 16> MaskConst;
15739 for (int Itr = 0; Itr < 16; Itr++) {
15740 if (Itr < NumElements)
15741 MaskConst.push_back(Builder.getInt8(
15742 IsLittleEndian ? Itr * TruncFactor
15743 : Itr * TruncFactor + (TruncFactor - 1)));
15744 else
15745 MaskConst.push_back(Builder.getInt8(255));
15746 }
15747
15748 int MaxTblSz = 128 * 4;
15749 int MaxSrcSz = SrcElemTySz * NumElements;
15750 int ElemsPerTbl =
15751 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
15752 assert(ElemsPerTbl <= 16 &&
15753 "Maximum elements selected using TBL instruction cannot exceed 16!");
15754
15755 int ShuffleCount = 128 / SrcElemTySz;
15756 SmallVector<int> ShuffleLanes;
15757 for (int i = 0; i < ShuffleCount; ++i)
15758 ShuffleLanes.push_back(i);
15759
15760 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
15761 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
15762 // call TBL & save the result in a vector of TBL results for combining later.
 15763 SmallVector<Value *> Results;
15764 while (ShuffleLanes.back() < NumElements) {
15765 Parts.push_back(Builder.CreateBitCast(
15766 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
15767
15768 if (Parts.size() == 4) {
 15769 auto *F = Intrinsic::getDeclaration(TI->getModule(),
15770 Intrinsic::aarch64_neon_tbl4, VecTy);
15771 Parts.push_back(ConstantVector::get(MaskConst));
15772 Results.push_back(Builder.CreateCall(F, Parts));
15773 Parts.clear();
15774 }
15775
15776 for (int i = 0; i < ShuffleCount; ++i)
15777 ShuffleLanes[i] += ShuffleCount;
15778 }
15779
15780 assert((Parts.empty() || Results.empty()) &&
15781 "Lowering trunc for vectors requiring different TBL instructions is "
15782 "not supported!");
15783 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
15784 // registers
15785 if (!Parts.empty()) {
15786 Intrinsic::ID TblID;
15787 switch (Parts.size()) {
15788 case 1:
15789 TblID = Intrinsic::aarch64_neon_tbl1;
15790 break;
15791 case 2:
15792 TblID = Intrinsic::aarch64_neon_tbl2;
15793 break;
15794 case 3:
15795 TblID = Intrinsic::aarch64_neon_tbl3;
15796 break;
15797 }
15798
15799 auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy);
15800 Parts.push_back(ConstantVector::get(MaskConst));
15801 Results.push_back(Builder.CreateCall(F, Parts));
15802 }
15803
15804 // Extract the destination vector from TBL result(s) after combining them
15805 // where applicable. Currently, at most two TBLs are supported.
15806 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
15807 "more than 2 tbl instructions!");
15808 Value *FinalResult = Results[0];
15809 if (Results.size() == 1) {
15810 if (ElemsPerTbl < 16) {
15811 SmallVector<int> FinalMask(ElemsPerTbl);
15812 std::iota(FinalMask.begin(), FinalMask.end(), 0);
15813 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
15814 }
15815 } else {
15816 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
15817 if (ElemsPerTbl < 16) {
15818 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
15819 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
15820 } else {
15821 std::iota(FinalMask.begin(), FinalMask.end(), 0);
15822 }
15823 FinalResult =
15824 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
15825 }
15826
15827 TI->replaceAllUsesWith(FinalResult);
15828 TI->eraseFromParent();
15829}
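// For illustration: for %t = trunc <16 x i32> %x to <16 x i8>, the source
// spans four 128-bit registers, so the loop above emits a single tbl4 whose
// little-endian mask selects byte 0 of every i32 lane, i.e. bytes
// 0, 4, 8, ..., 60 of the concatenated table.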
15830
15831bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
15832 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
15833 // shuffle_vector instructions are serialized when targeting SVE,
15834 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
15835 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
15836 return false;
15837
15838 // Try to optimize conversions using tbl. This requires materializing constant
15839 // index vectors, which can increase code size and add loads. Skip the
15840 // transform unless the conversion is in a loop block guaranteed to execute
15841 // and we are not optimizing for size.
15842 Function *F = I->getParent()->getParent();
15843 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
15844 F->hasOptSize())
15845 return false;
15846
15847 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
15848 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
15849 if (!SrcTy || !DstTy)
15850 return false;
15851
15852 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
15853 // lowered to tbl instructions to insert the original i8 elements
15854 // into i8x lanes. This is enabled for cases where it is beneficial.
15855 auto *ZExt = dyn_cast<ZExtInst>(I);
15856 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
15857 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
15858 if (DstWidth % 8 != 0)
15859 return false;
15860
15861 auto *TruncDstType =
15862 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
15863 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
15864 // the remaining ZExt folded into the user, don't use tbl lowering.
15865 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
15866 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
 15867 TargetTransformInfo::CastContextHint::None,
 15868 TTI::TCK_SizeAndLatency) == TTI::TCC_Basic) {
15869 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
15870 return false;
15871
15872 DstTy = TruncDstType;
15873 }
15874
15875 return createTblShuffleForZExt(ZExt, DstTy, Subtarget->isLittleEndian());
15876 }
15877
15878 auto *UIToFP = dyn_cast<UIToFPInst>(I);
15879 if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
15880 DstTy->getElementType()->isFloatTy()) {
15881 IRBuilder<> Builder(I);
15882 auto *ZExt = cast<ZExtInst>(
15883 Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));
15884 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
15885 I->replaceAllUsesWith(UI);
15886 I->eraseFromParent();
15887 return createTblShuffleForZExt(ZExt, cast<FixedVectorType>(ZExt->getType()),
15888 Subtarget->isLittleEndian());
15889 }
15890
15891 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
15892 // followed by a truncate lowered to using tbl.4.
15893 auto *FPToUI = dyn_cast<FPToUIInst>(I);
15894 if (FPToUI &&
15895 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
15896 SrcTy->getElementType()->isFloatTy() &&
15897 DstTy->getElementType()->isIntegerTy(8)) {
15898 IRBuilder<> Builder(I);
15899 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
15900 VectorType::getInteger(SrcTy));
15901 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
15902 I->replaceAllUsesWith(TruncI);
15903 I->eraseFromParent();
15904 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
15905 return true;
15906 }
15907
15908 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
15909 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
15910 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
15911 // registers
15912 auto *TI = dyn_cast<TruncInst>(I);
15913 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
15914 ((SrcTy->getElementType()->isIntegerTy(32) ||
15915 SrcTy->getElementType()->isIntegerTy(64)) &&
15916 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
15917 createTblForTrunc(TI, Subtarget->isLittleEndian());
15918 return true;
15919 }
15920
15921 return false;
15922}
15923
15924bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
15925 Align &RequiredAligment) const {
15926 if (!LoadedType.isSimple() ||
15927 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
15928 return false;
15929 // Cyclone supports unaligned accesses.
15930 RequiredAligment = Align(1);
15931 unsigned NumBits = LoadedType.getSizeInBits();
15932 return NumBits == 32 || NumBits == 64;
15933}
15934
15935/// A helper function for determining the number of interleaved accesses we
15936/// will generate when lowering accesses of the given type.
15937unsigned AArch64TargetLowering::getNumInterleavedAccesses(
15938 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
15939 unsigned VecSize = 128;
15940 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15941 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
15942 if (UseScalable && isa<FixedVectorType>(VecTy))
15943 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
15944 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
15945}
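// For illustration: a <16 x i32> vector is 512 bits, so with 128-bit NEON
// registers this returns (16 * 32 + 127) / 128 = 4 interleaved accesses.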
15946
15947MachineMemOperand::Flags
15948AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
15949 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
15950 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
15951 return MOStridedAccess;
 15952 return MachineMemOperand::MONone;
15953}
15954
15955bool AArch64TargetLowering::isLegalInterleavedAccessType(
15956 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
15957 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15958 auto EC = VecTy->getElementCount();
15959 unsigned MinElts = EC.getKnownMinValue();
15960
15961 UseScalable = false;
15962
15963 if (!VecTy->isScalableTy() && !Subtarget->hasNEON())
15964 return false;
15965
15966 if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
15967 return false;
15968
15969 // Ensure that the predicate for this number of elements is available.
15970 if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
15971 return false;
15972
15973 // Ensure the number of vector elements is greater than 1.
15974 if (MinElts < 2)
15975 return false;
15976
15977 // Ensure the element type is legal.
15978 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
15979 return false;
15980
15981 if (EC.isScalable()) {
15982 UseScalable = true;
15983 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
15984 }
15985
15986 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
15987 if (!Subtarget->isNeonAvailable() ||
15988 (Subtarget->useSVEForFixedLengthVectors() &&
15989 (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
15990 (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
15991 isPowerOf2_32(MinElts) && VecSize > 128)))) {
15992 UseScalable = true;
15993 return true;
15994 }
15995
15996 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
15997 // 128 will be split into multiple interleaved accesses.
15998 return VecSize == 64 || VecSize % 128 == 0;
15999}
16000
16001static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
16002 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
16003 return ScalableVectorType::get(VTy->getElementType(), 2);
16004
16005 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
16006 return ScalableVectorType::get(VTy->getElementType(), 4);
16007
16008 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
16009 return ScalableVectorType::get(VTy->getElementType(), 8);
16010
16011 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
16012 return ScalableVectorType::get(VTy->getElementType(), 8);
16013
16014 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
16015 return ScalableVectorType::get(VTy->getElementType(), 2);
16016
16017 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
16018 return ScalableVectorType::get(VTy->getElementType(), 4);
16019
16020 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
16021 return ScalableVectorType::get(VTy->getElementType(), 8);
16022
16023 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
16024 return ScalableVectorType::get(VTy->getElementType(), 16);
16025
16026 llvm_unreachable("Cannot handle input vector type");
16027}
16028
16029static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
16030 bool Scalable, Type *LDVTy,
16031 Type *PtrTy) {
16032 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16033 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
16034 Intrinsic::aarch64_sve_ld3_sret,
16035 Intrinsic::aarch64_sve_ld4_sret};
16036 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
16037 Intrinsic::aarch64_neon_ld3,
16038 Intrinsic::aarch64_neon_ld4};
16039 if (Scalable)
16040 return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy});
16041
16042 return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy});
16043}
16044
16045static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
16046 bool Scalable, Type *STVTy,
16047 Type *PtrTy) {
16048 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16049 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
16050 Intrinsic::aarch64_sve_st3,
16051 Intrinsic::aarch64_sve_st4};
16052 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
16053 Intrinsic::aarch64_neon_st3,
16054 Intrinsic::aarch64_neon_st4};
16055 if (Scalable)
16056 return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy});
16057
16058 return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy});
16059}
16060
16061/// Lower an interleaved load into a ldN intrinsic.
16062///
16063/// E.g. Lower an interleaved load (Factor = 2):
16064/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
16065/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
16066/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
16067///
16068/// Into:
16069/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
16070/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
16071/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
16072bool AArch64TargetLowering::lowerInterleavedLoad(
 16073 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
16074 ArrayRef<unsigned> Indices, unsigned Factor) const {
16075 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16076 "Invalid interleave factor");
16077 assert(!Shuffles.empty() && "Empty shufflevector input");
16078 assert(Shuffles.size() == Indices.size() &&
16079 "Unmatched number of shufflevectors and indices");
16080
16081 const DataLayout &DL = LI->getModule()->getDataLayout();
16082
16083 VectorType *VTy = Shuffles[0]->getType();
16084
16085 // Skip if we do not have NEON and skip illegal vector types. We can
16086 // "legalize" wide vector types into multiple interleaved accesses as long as
16087 // the vector types are divisible by 128.
16088 bool UseScalable;
16089 if (!Subtarget->hasNEON() ||
16090 !isLegalInterleavedAccessType(VTy, DL, UseScalable))
16091 return false;
16092
16093 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16094
16095 auto *FVTy = cast<FixedVectorType>(VTy);
16096
16097 // A pointer vector can not be the return type of the ldN intrinsics. Need to
16098 // load integer vectors first and then convert to pointer vectors.
16099 Type *EltTy = FVTy->getElementType();
16100 if (EltTy->isPointerTy())
16101 FVTy =
16102 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
16103
16104 // If we're going to generate more than one load, reset the sub-vector type
16105 // to something legal.
16106 FVTy = FixedVectorType::get(FVTy->getElementType(),
16107 FVTy->getNumElements() / NumLoads);
16108
16109 auto *LDVTy =
16110 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
16111
16112 IRBuilder<> Builder(LI);
16113
16114 // The base address of the load.
16115 Value *BaseAddr = LI->getPointerOperand();
16116
16117 Type *PtrTy = LI->getPointerOperandType();
16118 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
16119 LDVTy->getElementCount());
16120
16121 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
16122 UseScalable, LDVTy, PtrTy);
16123
16124 // Holds sub-vectors extracted from the load intrinsic return values. The
16125 // sub-vectors are associated with the shufflevector instructions they will
16126 // replace.
 16127 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
16128
16129 Value *PTrue = nullptr;
16130 if (UseScalable) {
16131 std::optional<unsigned> PgPattern =
16132 getSVEPredPatternFromNumElements(FVTy->getNumElements());
16133 if (Subtarget->getMinSVEVectorSizeInBits() ==
16134 Subtarget->getMaxSVEVectorSizeInBits() &&
16135 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
16136 PgPattern = AArch64SVEPredPattern::all;
16137
16138 auto *PTruePat =
16139 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
16140 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16141 {PTruePat});
16142 }
16143
16144 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
16145
16146 // If we're generating more than one load, compute the base address of
16147 // subsequent loads as an offset from the previous.
16148 if (LoadCount > 0)
16149 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
16150 FVTy->getNumElements() * Factor);
16151
16152 CallInst *LdN;
16153 if (UseScalable)
16154 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
16155 else
16156 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16157
16158 // Extract and store the sub-vectors returned by the load intrinsic.
16159 for (unsigned i = 0; i < Shuffles.size(); i++) {
16160 ShuffleVectorInst *SVI = Shuffles[i];
16161 unsigned Index = Indices[i];
16162
16163 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
16164
16165 if (UseScalable)
16166 SubVec = Builder.CreateExtractVector(
16167 FVTy, SubVec,
16168 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
16169
16170 // Convert the integer vector to pointer vector if the element is pointer.
16171 if (EltTy->isPointerTy())
16172 SubVec = Builder.CreateIntToPtr(
 16173 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
16174 FVTy->getNumElements()));
16175
16176 SubVecs[SVI].push_back(SubVec);
16177 }
16178 }
16179
16180 // Replace uses of the shufflevector instructions with the sub-vectors
16181 // returned by the load intrinsic. If a shufflevector instruction is
16182 // associated with more than one sub-vector, those sub-vectors will be
16183 // concatenated into a single wide vector.
16184 for (ShuffleVectorInst *SVI : Shuffles) {
16185 auto &SubVec = SubVecs[SVI];
16186 auto *WideVec =
16187 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
16188 SVI->replaceAllUsesWith(WideVec);
16189 }
16190
16191 return true;
16192}
16193
16194template <typename Iter>
16195bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
16196 int MaxLookupDist = 20;
16197 unsigned IdxWidth = DL.getIndexSizeInBits(0);
16198 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
16199 const Value *PtrA1 =
16200 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
16201
16202 while (++It != End) {
16203 if (It->isDebugOrPseudoInst())
16204 continue;
16205 if (MaxLookupDist-- == 0)
16206 break;
16207 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
16208 const Value *PtrB1 =
16209 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
16210 DL, OffsetB);
16211 if (PtrA1 == PtrB1 &&
16212 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
16213 .abs() == 16)
16214 return true;
16215 }
16216 }
16217
16218 return false;
16219}
16220
16221/// Lower an interleaved store into a stN intrinsic.
16222///
16223/// E.g. Lower an interleaved store (Factor = 3):
16224/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
16225/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
16226/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16227///
16228/// Into:
16229/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
16230/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
16231/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
16232/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16233///
16234/// Note that the new shufflevectors will be removed and we'll only generate one
16235/// st3 instruction in CodeGen.
16236///
16237/// Example for a more general valid mask (Factor 3). Lower:
16238/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
16239/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
16240/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16241///
16242/// Into:
16243/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
16244/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
16245/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
16246/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16247bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
16248 ShuffleVectorInst *SVI,
16249 unsigned Factor) const {
16250
16251 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16252 "Invalid interleave factor");
16253
16254 auto *VecTy = cast<FixedVectorType>(SVI->getType());
16255 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
16256
16257 unsigned LaneLen = VecTy->getNumElements() / Factor;
16258 Type *EltTy = VecTy->getElementType();
16259 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
16260
16261 const DataLayout &DL = SI->getModule()->getDataLayout();
16262 bool UseScalable;
16263
16264 // Skip if we do not have NEON and skip illegal vector types. We can
16265 // "legalize" wide vector types into multiple interleaved accesses as long as
16266 // the vector types are divisible by 128.
16267 if (!Subtarget->hasNEON() ||
16268 !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
16269 return false;
16270
16271 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
16272
16273 Value *Op0 = SVI->getOperand(0);
16274 Value *Op1 = SVI->getOperand(1);
16275 IRBuilder<> Builder(SI);
16276
16277 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
16278 // vectors to integer vectors.
16279 if (EltTy->isPointerTy()) {
16280 Type *IntTy = DL.getIntPtrType(EltTy);
16281 unsigned NumOpElts =
16282 cast<FixedVectorType>(Op0->getType())->getNumElements();
16283
16284 // Convert to the corresponding integer vector.
16285 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
16286 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
16287 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
16288
16289 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
16290 }
16291
16292 // If we're going to generate more than one store, reset the lane length
16293 // and sub-vector type to something legal.
16294 LaneLen /= NumStores;
16295 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
16296
16297 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
16298 : SubVecTy;
16299
16300 // The base address of the store.
16301 Value *BaseAddr = SI->getPointerOperand();
16302
16303 auto Mask = SVI->getShuffleMask();
16304
 16305 // Sanity check: bail out if all the indices are out of range.
 16306 // If the mask is `poison`, `Mask` may be a vector of -1s;
 16307 // if all of them are `poison`, an out-of-bounds read would happen later.
16308 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16309 return false;
16310 }
 16311 // A 64-bit st2 which does not start at element 0 will involve adding extra
 16312 // ext elements, making the st2 unprofitable, and if there is a nearby store
 16313 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
 16314 // zip;ldp pair which has higher throughput.
16315 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
16316 (Mask[0] != 0 ||
16317 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
16318 DL) ||
16319 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
16320 BaseAddr, DL)))
16321 return false;
16322
16323 Type *PtrTy = SI->getPointerOperandType();
16324 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
16325 STVTy->getElementCount());
16326
16327 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16328 UseScalable, STVTy, PtrTy);
16329
16330 Value *PTrue = nullptr;
16331 if (UseScalable) {
16332 std::optional<unsigned> PgPattern =
16333 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
16334 if (Subtarget->getMinSVEVectorSizeInBits() ==
16335 Subtarget->getMaxSVEVectorSizeInBits() &&
16336 Subtarget->getMinSVEVectorSizeInBits() ==
16337 DL.getTypeSizeInBits(SubVecTy))
16338 PgPattern = AArch64SVEPredPattern::all;
16339
16340 auto *PTruePat =
16341 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
16342 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16343 {PTruePat});
16344 }
16345
16346 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
16347
 16348 SmallVector<Value *, 5> Ops;
16349
16350 // Split the shufflevector operands into sub vectors for the new stN call.
16351 for (unsigned i = 0; i < Factor; i++) {
16352 Value *Shuffle;
16353 unsigned IdxI = StoreCount * LaneLen * Factor + i;
16354 if (Mask[IdxI] >= 0) {
16355 Shuffle = Builder.CreateShuffleVector(
16356 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
16357 } else {
16358 unsigned StartMask = 0;
16359 for (unsigned j = 1; j < LaneLen; j++) {
16360 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
16361 if (Mask[IdxJ] >= 0) {
16362 StartMask = Mask[IdxJ] - j;
16363 break;
16364 }
16365 }
16366 // Note: Filling undef gaps with random elements is ok, since
16367 // those elements were being written anyway (with undefs).
16368 // In the case of all undefs we're defaulting to using elems from 0
16369 // Note: StartMask cannot be negative, it's checked in
16370 // isReInterleaveMask
16371 Shuffle = Builder.CreateShuffleVector(
16372 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
16373 }
16374
16375 if (UseScalable)
16376 Shuffle = Builder.CreateInsertVector(
16377 STVTy, UndefValue::get(STVTy), Shuffle,
16378 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
16379
16380 Ops.push_back(Shuffle);
16381 }
16382
16383 if (UseScalable)
16384 Ops.push_back(PTrue);
16385
 16386 // If we are generating more than one store, we compute the base address of
 16387 // subsequent stores as an offset from the previous one.
16388 if (StoreCount > 0)
16389 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
16390 BaseAddr, LaneLen * Factor);
16391
16392 Ops.push_back(BaseAddr);
16393 Builder.CreateCall(StNFunc, Ops);
16394 }
16395 return true;
16396}
16397
16398bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
16399 IntrinsicInst *DI, LoadInst *LI) const {
16400 // Only deinterleave2 supported at present.
16401 if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
16402 return false;
16403
16404 // Only a factor of 2 supported at present.
16405 const unsigned Factor = 2;
16406
16407 VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
16408 const DataLayout &DL = DI->getModule()->getDataLayout();
16409 bool UseScalable;
16410 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16411 return false;
16412
16413 // TODO: Add support for using SVE instructions with fixed types later, using
16414 // the code from lowerInterleavedLoad to obtain the correct container type.
16415 if (UseScalable && !VTy->isScalableTy())
16416 return false;
16417
16418 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16419
16420 VectorType *LdTy =
 16421 VectorType::get(VTy->getElementType(),
16422 VTy->getElementCount().divideCoefficientBy(NumLoads));
16423
16424 Type *PtrTy = LI->getPointerOperandType();
16425 Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor,
16426 UseScalable, LdTy, PtrTy);
16427
16428 IRBuilder<> Builder(LI);
16429
16430 Value *Pred = nullptr;
16431 if (UseScalable)
16432 Pred =
16433 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
16434
16435 Value *BaseAddr = LI->getPointerOperand();
16436 Value *Result;
16437 if (NumLoads > 1) {
16438 Value *Left = PoisonValue::get(VTy);
 16439 Value *Right = PoisonValue::get(VTy);
16440
16441 for (unsigned I = 0; I < NumLoads; ++I) {
16442 Value *Offset = Builder.getInt64(I * Factor);
16443
16444 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
16445 Value *LdN = nullptr;
16446 if (UseScalable)
16447 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
16448 else
16449 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
16450
16451 Value *Idx =
16452 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
16453 Left = Builder.CreateInsertVector(
16454 VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
16455 Right = Builder.CreateInsertVector(
16456 VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
16457 }
16458
16459 Result = PoisonValue::get(DI->getType());
16460 Result = Builder.CreateInsertValue(Result, Left, 0);
16461 Result = Builder.CreateInsertValue(Result, Right, 1);
16462 } else {
16463 if (UseScalable)
16464 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
16465 else
16466 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16467 }
16468
16469 DI->replaceAllUsesWith(Result);
16470 return true;
16471}
16472
16473bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
16474 IntrinsicInst *II, StoreInst *SI) const {
16475 // Only interleave2 supported at present.
16476 if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
16477 return false;
16478
16479 // Only a factor of 2 supported at present.
16480 const unsigned Factor = 2;
16481
16482 VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
16483 const DataLayout &DL = II->getModule()->getDataLayout();
16484 bool UseScalable;
16485 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16486 return false;
16487
16488 // TODO: Add support for using SVE instructions with fixed types later, using
16489 // the code from lowerInterleavedStore to obtain the correct container type.
16490 if (UseScalable && !VTy->isScalableTy())
16491 return false;
16492
16493 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
16494
16495 VectorType *StTy =
 16496 VectorType::get(VTy->getElementType(),
16497 VTy->getElementCount().divideCoefficientBy(NumStores));
16498
16499 Type *PtrTy = SI->getPointerOperandType();
16500 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16501 UseScalable, StTy, PtrTy);
16502
16503 IRBuilder<> Builder(SI);
16504
16505 Value *BaseAddr = SI->getPointerOperand();
16506 Value *Pred = nullptr;
16507
16508 if (UseScalable)
16509 Pred =
16510 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
16511
16512 Value *L = II->getOperand(0);
16513 Value *R = II->getOperand(1);
16514
16515 for (unsigned I = 0; I < NumStores; ++I) {
16516 Value *Address = BaseAddr;
16517 if (NumStores > 1) {
16518 Value *Offset = Builder.getInt64(I * Factor);
16519 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
16520
16521 Value *Idx =
16522 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
16523 L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
16524 R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
16525 }
16526
16527 if (UseScalable)
16528 Builder.CreateCall(StNFunc, {L, R, Pred, Address});
16529 else
16530 Builder.CreateCall(StNFunc, {L, R, Address});
16531 }
16532
16533 return true;
16534}
16535
16536EVT AArch64TargetLowering::getOptimalMemOpType(
16537 const MemOp &Op, const AttributeList &FuncAttributes) const {
16538 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16539 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16540 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
 16541 // Only use AdvSIMD to implement memsets of 32 bytes and above. It would have
 16542 // taken one instruction to materialize the v2i64 zero and one store (with
 16543 // restrictive addressing mode). Just do i64 stores.
16544 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16545 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16546 if (Op.isAligned(AlignCheck))
16547 return true;
16548 unsigned Fast;
16549 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
 16550 MachineMemOperand::MONone, &Fast) &&
16551 Fast;
16552 };
16553
16554 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16555 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
16556 return MVT::v16i8;
16557 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16558 return MVT::f128;
16559 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16560 return MVT::i64;
16561 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16562 return MVT::i32;
16563 return MVT::Other;
16564}
16565
16566LLT AArch64TargetLowering::getOptimalMemOpLLT(
16567 const MemOp &Op, const AttributeList &FuncAttributes) const {
16568 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16569 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16570 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
 16571 // Only use AdvSIMD to implement memsets of 32 bytes and above. It would have
 16572 // taken one instruction to materialize the v2i64 zero and one store (with
 16573 // restrictive addressing mode). Just do i64 stores.
16574 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16575 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16576 if (Op.isAligned(AlignCheck))
16577 return true;
16578 unsigned Fast;
16579 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
 16580 MachineMemOperand::MONone, &Fast) &&
16581 Fast;
16582 };
16583
16584 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16585 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
16586 return LLT::fixed_vector(2, 64);
16587 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16588 return LLT::scalar(128);
16589 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16590 return LLT::scalar(64);
16591 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16592 return LLT::scalar(32);
16593 return LLT();
16594}
16595
16596// 12-bit optionally shifted immediates are legal for adds.
16597bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
16598 if (Immed == std::numeric_limits<int64_t>::min()) {
16599 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
16600 << ": avoid UB for INT64_MIN\n");
16601 return false;
16602 }
16603 // Same encoding for add/sub, just flip the sign.
16604 Immed = std::abs(Immed);
16605 bool IsLegal = ((Immed >> 12) == 0 ||
16606 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
16607 LLVM_DEBUG(dbgs() << "Is " << Immed
16608 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
16609 return IsLegal;
16610}
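// For illustration: 0xfff fits in the unshifted 12-bit field and 0x123000
// fits in the 12-bit field shifted left by 12, so both are legal; 0x123456
// would need more than one instruction and is rejected.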
16611
16612bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
16613 // We will only emit addvl/inc* instructions for SVE2
16614 if (!Subtarget->hasSVE2())
16615 return false;
16616
16617 // addvl's immediates are in terms of the number of bytes in a register.
16618 // Since there are 16 in the base supported size (128bits), we need to
16619 // divide the immediate by that much to give us a useful immediate to
16620 // multiply by vscale. We can't have a remainder as a result of this.
16621 if (Imm % 16 == 0)
16622 return isInt<6>(Imm / 16);
16623
16624 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
16625 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
16626 // of addvl as a result, so only take h|w|d into account.
16627 // Dec[h|w|d] will cover subtractions.
16628 // Immediates are in the range [1,16], so we can't do a 2's complement check.
16629 // FIXME: Can we make use of other patterns to cover other immediates?
16630
16631 // inch|dech
16632 if (Imm % 8 == 0)
16633 return std::labs(Imm / 8) <= 16;
16634 // incw|decw
16635 if (Imm % 4 == 0)
16636 return std::labs(Imm / 4) <= 16;
16637 // incd|decd
16638 if (Imm % 2 == 0)
16639 return std::labs(Imm / 2) <= 16;
16640
16641 return false;
16642}
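// For illustration: vscale * 32 is accepted because 32 / 16 = 2 fits in the
// signed 6-bit addvl immediate; vscale * 8 maps to inch with an all-lanes
// pattern and multiplier 1; vscale * 5 matches no form and is rejected.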
16643
16644// Return false to prevent folding
16645// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
16646// if the folding leads to worse code.
16647bool AArch64TargetLowering::isMulAddWithConstProfitable(
16648 SDValue AddNode, SDValue ConstNode) const {
16649 // Let the DAGCombiner decide for vector types and large types.
16650 const EVT VT = AddNode.getValueType();
16651 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
16652 return true;
16653
 16654 // It is worse if c1 is a legal add immediate while c1*c2 is not,
 16655 // since c1*c2 then has to be composed of at least two instructions.
16656 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
16657 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
16658 const int64_t C1 = C1Node->getSExtValue();
16659 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
16660 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
16661 return true;
16662 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
16663 // Adapt to the width of a register.
16664 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
16665 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
16666 if (Insn.size() > 1)
16667 return false;
16668
16669 // Default to true and let the DAGCombiner decide.
16670 return true;
16671}
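
// Illustrative example (not from the upstream source): for
// (mul (add x, 0xfff), 0x10001), the inner constant 0xfff is a legal add
// immediate but 0xfff * 0x10001 == 0xfff0fff takes a MOVZ plus a MOVK to
// materialize, so we return false and keep the add+mul form rather than
// folding it into mul+add.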
16672
16673// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
16674// immediates is the same as for an add or a sub.
16675 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
16676 return isLegalAddImmediate(Immed);
16677}
16678
16679/// isLegalAddressingMode - Return true if the addressing mode represented
16680/// by AM is legal for this target, for a load/store of the specified type.
16681 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
16682 const AddrMode &AMode, Type *Ty,
16683 unsigned AS, Instruction *I) const {
16684 // AArch64 has five basic addressing modes:
16685 // reg
16686 // reg + 9-bit signed offset
16687 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
16688 // reg1 + reg2
16689 // reg + SIZE_IN_BYTES * reg
16690
16691 // No global is ever allowed as a base.
16692 if (AMode.BaseGV)
16693 return false;
16694
16695 // No reg+reg+imm addressing.
16696 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
16697 return false;
16698
16699 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
16700 // `2*ScaledReg` into `BaseReg + ScaledReg`
16701 AddrMode AM = AMode;
16702 if (AM.Scale && !AM.HasBaseReg) {
16703 if (AM.Scale == 1) {
16704 AM.HasBaseReg = true;
16705 AM.Scale = 0;
16706 } else if (AM.Scale == 2) {
16707 AM.HasBaseReg = true;
16708 AM.Scale = 1;
16709 } else {
16710 return false;
16711 }
16712 }
16713
16714 // A base register is required in all addressing modes.
16715 if (!AM.HasBaseReg)
16716 return false;
16717
16718 if (Ty->isScalableTy()) {
16719 if (isa<ScalableVectorType>(Ty)) {
16720 // See if we have a foldable vscale-based offset, for vector types which
16721 // are either legal or smaller than the minimum; more work will be
16722 // required if we need to consider addressing for types which need
16723 // legalization by splitting.
16724 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
16725 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
16726 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
16727 isPowerOf2_64(VecNumBytes))
16728 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
16729
16730 uint64_t VecElemNumBytes =
16731 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
16732 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
16733 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
16734 }
16735
16736 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
16737 }
16738
16739 // No scalable offsets allowed for non-scalable types.
16740 if (AM.ScalableOffset)
16741 return false;
16742
16743 // check reg + imm case:
16744 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
16745 uint64_t NumBytes = 0;
16746 if (Ty->isSized()) {
16747 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
16748 NumBytes = NumBits / 8;
16749 if (!isPowerOf2_64(NumBits))
16750 NumBytes = 0;
16751 }
16752
16753 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
16754 AM.Scale);
16755}
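
// Illustrative examples (not from the upstream source): for an i64 access,
// [x0], [x0, #-17] (9-bit signed offset), [x0, #4088] (8 * uimm12) and
// [x0, x1, lsl #3] (reg + 8 * reg) are all representable, whereas a mode
// that needs base + scaled register + immediate is rejected by the
// reg+reg+imm check above.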
16756
16757 // Check whether the two offsets belong to the same imm24 range and share the same
16758 // high 12 bits; if so, the high part can be materialized once with an add and the
16758 // remaining low parts folded into the load/store offsets.
16759int64_t
16761 int64_t MaxOffset) const {
16762 int64_t HighPart = MinOffset & ~0xfffULL;
16763 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
16764 // Rebase the value to an integer multiple of imm12.
16765 return HighPart;
16766 }
16767
16768 return 0;
16769}
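
// Illustrative example (not from the upstream source): MinOffset = 0x1234 and
// MaxOffset = 0x1678 share the high part 0x1000 (both >> 12 == 1), so 0x1000
// is returned; it can be added to the base once, leaving only uimm12 offsets.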
16770
16771 bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
16772 // Consider splitting large offset of struct or array.
16773 return true;
16774}
16775
16776 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
16777 const MachineFunction &MF, EVT VT) const {
16778 VT = VT.getScalarType();
16779
16780 if (!VT.isSimple())
16781 return false;
16782
16783 switch (VT.getSimpleVT().SimpleTy) {
16784 case MVT::f16:
16785 return Subtarget->hasFullFP16();
16786 case MVT::f32:
16787 case MVT::f64:
16788 return true;
16789 default:
16790 break;
16791 }
16792
16793 return false;
16794}
16795
16796 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
16797 Type *Ty) const {
16798 switch (Ty->getScalarType()->getTypeID()) {
16799 case Type::FloatTyID:
16800 case Type::DoubleTyID:
16801 return true;
16802 default:
16803 return false;
16804 }
16805}
16806
16808 EVT VT, CodeGenOptLevel OptLevel) const {
16809 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
16811}
16812
16813const MCPhysReg *
16814 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
16815 // LR is a callee-save register, but we must treat it as clobbered by any call
16816 // site. Hence we include LR in the scratch registers, which are in turn added
16817 // as implicit-defs for stackmaps and patchpoints.
16818 static const MCPhysReg ScratchRegs[] = {
16819 AArch64::X16, AArch64::X17, AArch64::LR, 0
16820 };
16821 return ScratchRegs;
16822}
16823
16825 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
16826 return RCRegs;
16827}
16828
16829bool
16830 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
16831 CombineLevel Level) const {
16832 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
16833 N->getOpcode() == ISD::SRL) &&
16834 "Expected shift op");
16835
16836 SDValue ShiftLHS = N->getOperand(0);
16837 EVT VT = N->getValueType(0);
16838
16839 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
16840 // combine it with shift 'N' to let it be lowered to UBFX except:
16841 // ((x >> C) & mask) << C.
16842 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
16843 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
16844 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
16845 if (isMask_64(TruncMask)) {
16846 SDValue AndLHS = ShiftLHS.getOperand(0);
16847 if (AndLHS.getOpcode() == ISD::SRL) {
16848 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
16849 if (N->getOpcode() == ISD::SHL)
16850 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
16851 return SRLC->getZExtValue() == SHLC->getZExtValue();
16852 return false;
16853 }
16854 }
16855 }
16856 }
16857 return true;
16858}
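
// Illustrative example (not from the upstream source): for
// (shl (and (srl x, 4), 0xff), 4) the shift amounts match, so commuting is
// allowed; with an outer shift of 2 instead we return false and keep the
// (and (srl ...)) shape so it can still be selected as a UBFX.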
16859
16860 bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
16861 const SDNode *N) const {
16862 assert(N->getOpcode() == ISD::XOR &&
16863 (N->getOperand(0).getOpcode() == ISD::SHL ||
16864 N->getOperand(0).getOpcode() == ISD::SRL) &&
16865 "Expected XOR(SHIFT) pattern");
16866
16867 // Only commute if the entire NOT mask is a hidden shifted mask.
16868 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
16869 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16870 if (XorC && ShiftC) {
16871 unsigned MaskIdx, MaskLen;
16872 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
16873 unsigned ShiftAmt = ShiftC->getZExtValue();
16874 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
16875 if (N->getOperand(0).getOpcode() == ISD::SHL)
16876 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
16877 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
16878 }
16879 }
16880
16881 return false;
16882}
16883
16884 bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
16885 const SDNode *N, CombineLevel Level) const {
16886 assert(((N->getOpcode() == ISD::SHL &&
16887 N->getOperand(0).getOpcode() == ISD::SRL) ||
16888 (N->getOpcode() == ISD::SRL &&
16889 N->getOperand(0).getOpcode() == ISD::SHL)) &&
16890 "Expected shift-shift mask");
16891 // Don't allow multiuse shift folding with the same shift amount.
16892 if (!N->getOperand(0)->hasOneUse())
16893 return false;
16894
16895 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
16896 EVT VT = N->getValueType(0);
16897 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
16898 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16899 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
16900 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
16901 }
16902
16903 return true;
16904}
16905
16906 bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
16907 unsigned BinOpcode, EVT VT) const {
16908 return VT.isScalableVector() && isTypeLegal(VT);
16909}
16910
16911 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
16912 Type *Ty) const {
16913 assert(Ty->isIntegerTy());
16914
16915 unsigned BitSize = Ty->getPrimitiveSizeInBits();
16916 if (BitSize == 0)
16917 return false;
16918
16919 int64_t Val = Imm.getSExtValue();
16920 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
16921 return true;
16922
16923 if ((int64_t)Val < 0)
16924 Val = ~Val;
16925 if (BitSize == 32)
16926 Val &= (1LL << 32) - 1;
16927
16928 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
16929 // MOVZ is free so return true for one or fewer MOVK.
16930 return Shift < 3;
16931}
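
// Illustrative examples for a 64-bit integer type (not from the upstream
// source): 0x12345678 is accepted (MOVZ plus one MOVK), 0xff00ff00ff00ff00 is
// accepted as a logical immediate, while 0x1234567812345678 is rejected since
// it would take a MOVZ plus three MOVKs to materialize.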
16932
16934 unsigned Index) const {
16936 return false;
16937
16938 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
16939}
16940
16941/// Turn vector tests of the signbit in the form of:
16942/// xor (sra X, elt_size(X)-1), -1
16943/// into:
16944/// cmge X, X, #0
16945 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
16946 const AArch64Subtarget *Subtarget) {
16947 EVT VT = N->getValueType(0);
16948 if (!Subtarget->hasNEON() || !VT.isVector())
16949 return SDValue();
16950
16951 // There must be a shift right algebraic before the xor, and the xor must be a
16952 // 'not' operation.
16953 SDValue Shift = N->getOperand(0);
16954 SDValue Ones = N->getOperand(1);
16955 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
16957 return SDValue();
16958
16959 // The shift should be smearing the sign bit across each vector element.
16960 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
16961 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
16962 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
16963 return SDValue();
16964
16965 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
16966}
16967
16968// Given a vecreduce_add node, detect the below pattern and convert it to the
16969 // node sequence with UABDL, [S|U]ABD and UADDLP.
16970//
16971// i32 vecreduce_add(
16972// v16i32 abs(
16973// v16i32 sub(
16974// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
16975// =================>
16976// i32 vecreduce_add(
16977// v4i32 UADDLP(
16978// v8i16 add(
16979// v8i16 zext(
16980// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
16981// v8i16 zext(
16982// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
16983 static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
16984 SelectionDAG &DAG) {
16985 // Assumed i32 vecreduce_add
16986 if (N->getValueType(0) != MVT::i32)
16987 return SDValue();
16988
16989 SDValue VecReduceOp0 = N->getOperand(0);
16990 unsigned Opcode = VecReduceOp0.getOpcode();
16991 // Assumed v16i32 abs
16992 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
16993 return SDValue();
16994
16995 SDValue ABS = VecReduceOp0;
16996 // Assumed v16i32 sub
16997 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
16998 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
16999 return SDValue();
17000
17001 SDValue SUB = ABS->getOperand(0);
17002 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
17003 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
17004 // Assumed v16i32 type
17005 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
17006 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
17007 return SDValue();
17008
17009 // Assumed zext or sext
17010 bool IsZExt = false;
17011 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
17012 IsZExt = true;
17013 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
17014 IsZExt = false;
17015 } else
17016 return SDValue();
17017
17018 SDValue EXT0 = SUB->getOperand(0);
17019 SDValue EXT1 = SUB->getOperand(1);
17020 // Assumed zext's operand has v16i8 type
17021 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
17022 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
17023 return SDValue();
17024
17025 // Pattern is detected. Let's convert it to a sequence of nodes.
17026 SDLoc DL(N);
17027
17028 // First, create the node pattern of UABD/SABD.
17029 SDValue UABDHigh8Op0 =
17030 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
17031 DAG.getConstant(8, DL, MVT::i64));
17032 SDValue UABDHigh8Op1 =
17033 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
17034 DAG.getConstant(8, DL, MVT::i64));
17035 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
17036 UABDHigh8Op0, UABDHigh8Op1);
17037 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
17038
17039 // Second, create the node pattern of UABAL.
17040 SDValue UABDLo8Op0 =
17041 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
17042 DAG.getConstant(0, DL, MVT::i64));
17043 SDValue UABDLo8Op1 =
17044 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
17045 DAG.getConstant(0, DL, MVT::i64));
17046 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
17047 UABDLo8Op0, UABDLo8Op1);
17048 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
17049 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
17050
17051 // Third, create the node of UADDLP.
17052 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
17053
17054 // Fourth, create the node of VECREDUCE_ADD.
17055 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
17056}
17057
17058// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
17059// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
17060// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
17061// If we have vectors larger than v16i8 we extract v16i8 vectors,
17062 // follow the same steps above to get DOT instructions, concatenate them,
17063// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
17064 static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
17065 const AArch64Subtarget *ST) {
17066 if (!ST->hasDotProd())
17067 return performVecReduceAddCombineWithUADDLP(N, DAG);
17068
17069 SDValue Op0 = N->getOperand(0);
17070 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
17071 Op0.getValueType().getVectorElementType() != MVT::i32)
17072 return SDValue();
17073
17074 unsigned ExtOpcode = Op0.getOpcode();
17075 SDValue A = Op0;
17076 SDValue B;
17077 if (ExtOpcode == ISD::MUL) {
17078 A = Op0.getOperand(0);
17079 B = Op0.getOperand(1);
17080 if (A.getOpcode() != B.getOpcode() ||
17081 A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
17082 return SDValue();
17083 ExtOpcode = A.getOpcode();
17084 }
17085 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
17086 return SDValue();
17087
17088 EVT Op0VT = A.getOperand(0).getValueType();
17089 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
17090 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
17091 if (!IsValidElementCount || !IsValidSize)
17092 return SDValue();
17093
17094 SDLoc DL(Op0);
17095 // For non-mla reductions B can be set to 1. For MLA we take the operand of
17096 // the extend B.
17097 if (!B)
17098 B = DAG.getConstant(1, DL, Op0VT);
17099 else
17100 B = B.getOperand(0);
17101
17102 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
17103 unsigned NumOfVecReduce;
17104 EVT TargetType;
17105 if (IsMultipleOf16) {
17106 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
17107 TargetType = MVT::v4i32;
17108 } else {
17109 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
17110 TargetType = MVT::v2i32;
17111 }
17112 auto DotOpcode =
17114 // Handle the case where we need to generate only one Dot operation.
17115 if (NumOfVecReduce == 1) {
17116 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
17117 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
17118 A.getOperand(0), B);
17119 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17120 }
17121 // Generate Dot instructions that are multiple of 16.
17122 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
17123 SmallVector<SDValue, 4> SDotVec16;
17124 unsigned I = 0;
17125 for (; I < VecReduce16Num; I += 1) {
17126 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
17127 SDValue Op0 =
17128 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
17129 DAG.getConstant(I * 16, DL, MVT::i64));
17130 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
17131 DAG.getConstant(I * 16, DL, MVT::i64));
17132 SDValue Dot =
17133 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
17134 SDotVec16.push_back(Dot);
17135 }
17136 // Concatenate dot operations.
17137 EVT SDot16EVT =
17138 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
17139 SDValue ConcatSDot16 =
17140 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
17141 SDValue VecReduceAdd16 =
17142 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
17143 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
17144 if (VecReduce8Num == 0)
17145 return VecReduceAdd16;
17146
17147 // Generate the remainder Dot operation that is multiple of 8.
17148 SmallVector<SDValue, 4> SDotVec8;
17149 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
17150 SDValue Vec8Op0 =
17151 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
17152 DAG.getConstant(I * 16, DL, MVT::i64));
17153 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
17154 DAG.getConstant(I * 16, DL, MVT::i64));
17155 SDValue Dot =
17156 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
17157 SDValue VecReduceAdd8 =
17158 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17159 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
17160 VecReduceAdd8);
17161}
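
// Worked example (illustrative, not from the upstream source): for
// i32 vecreduce_add(zext(v32i8 a)) we emit two UDOTs, one per v16i8 half,
// each accumulating into a zero v4i32, then concatenate and reduce; for a
// v24i8 input the leftover v8i8 lanes get their own v2i32 UDOT whose
// reduction is added to the result of the v16i8 part.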
17162
17163// Given an (integer) vecreduce, we know the order of the inputs does not
17164// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
17165// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
17166// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
17168 auto DetectAddExtract = [&](SDValue A) {
17169 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
17170 // UADDLP(x) if found.
17171 assert(A.getOpcode() == ISD::ADD);
17172 EVT VT = A.getValueType();
17173 SDValue Op0 = A.getOperand(0);
17174 SDValue Op1 = A.getOperand(1);
17175 if (Op0.getOpcode() != Op1.getOpcode() ||
17176 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
17177 Op0.getOpcode() != ISD::SIGN_EXTEND))
17178 return SDValue();
17179 SDValue Ext0 = Op0.getOperand(0);
17180 SDValue Ext1 = Op1.getOperand(0);
17181 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17183 Ext0.getOperand(0) != Ext1.getOperand(0))
17184 return SDValue();
17185 // Check that the type is twice the width of the add type, and that the
17186 // extracts are from the upper/lower halves of the same source.
17188 VT.getVectorNumElements() * 2)
17189 return SDValue();
17190 if ((Ext0.getConstantOperandVal(1) != 0 ||
17192 (Ext1.getConstantOperandVal(1) != 0 ||
17194 return SDValue();
17195 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
17197 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
17198 };
17199
17200 if (SDValue R = DetectAddExtract(A))
17201 return R;
17202
17203 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
17204 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
17205 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17206 A.getOperand(1));
17207 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
17208 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
17209 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17210 A.getOperand(0));
17211 return SDValue();
17212}
17213
17214// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
17215// UADDLV(concat), where the concat represents the 64-bit zext sources.
17216 static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
17217 // Look for add(zext(64-bit source), zext(64-bit source)), returning
17218 // UADDLV(concat(zext, zext)) if found.
17219 assert(A.getOpcode() == ISD::ADD);
17220 EVT VT = A.getValueType();
17221 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17222 return SDValue();
17223 SDValue Op0 = A.getOperand(0);
17224 SDValue Op1 = A.getOperand(1);
17225 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
17226 return SDValue();
17227 SDValue Ext0 = Op0.getOperand(0);
17228 SDValue Ext1 = Op1.getOperand(0);
17229 EVT ExtVT0 = Ext0.getValueType();
17230 EVT ExtVT1 = Ext1.getValueType();
17231 // Check zext VTs are the same and 64-bit length.
17232 if (ExtVT0 != ExtVT1 ||
17233 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
17234 return SDValue();
17235 // Get VT for concat of zext sources.
17236 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
17237 SDValue Concat =
17238 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
17239
17240 switch (VT.getSimpleVT().SimpleTy) {
17241 case MVT::v2i64:
17242 case MVT::v4i32:
17243 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
17244 case MVT::v8i16: {
17245 SDValue Uaddlv =
17246 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
17247 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
17248 }
17249 default:
17250 llvm_unreachable("Unhandled vector type");
17251 }
17252}
17253
17254 static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
17255 SDValue A = N->getOperand(0);
17256 if (A.getOpcode() == ISD::ADD) {
17257 if (SDValue R = performUADDVAddCombine(A, DAG))
17258 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
17259 else if (SDValue R = performUADDVZextCombine(A, DAG))
17260 return R;
17261 }
17262 return SDValue();
17263}
17264
17265 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
17266 TargetLowering::DAGCombinerInfo &DCI,
17267 const AArch64Subtarget *Subtarget) {
17268 if (DCI.isBeforeLegalizeOps())
17269 return SDValue();
17270
17271 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
17272}
17273
17274SDValue
17275AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
17276 SelectionDAG &DAG,
17277 SmallVectorImpl<SDNode *> &Created) const {
17278 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17279 if (isIntDivCheap(N->getValueType(0), Attr))
17280 return SDValue(N,0); // Lower SDIV as SDIV
17281
17282 EVT VT = N->getValueType(0);
17283
17284 // For scalable and fixed types, mark them as cheap so we can handle it much
17285 // later. This allows us to handle larger than legal types.
17286 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17287 return SDValue(N, 0);
17288
17289 // fold (sdiv X, pow2)
17290 if ((VT != MVT::i32 && VT != MVT::i64) ||
17291 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17292 return SDValue();
17293
17294 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
17295}
17296
17297SDValue
17298AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
17299 SelectionDAG &DAG,
17300 SmallVectorImpl<SDNode *> &Created) const {
17301 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17302 if (isIntDivCheap(N->getValueType(0), Attr))
17303 return SDValue(N, 0); // Lower SREM as SREM
17304
17305 EVT VT = N->getValueType(0);
17306
17307 // For scalable and fixed types, mark them as cheap so we can handle it much
17308 // later. This allows us to handle larger than legal types.
17309 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17310 return SDValue(N, 0);
17311
17312 // fold (srem X, pow2)
17313 if ((VT != MVT::i32 && VT != MVT::i64) ||
17314 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17315 return SDValue();
17316
17317 unsigned Lg2 = Divisor.countr_zero();
17318 if (Lg2 == 0)
17319 return SDValue();
17320
17321 SDLoc DL(N);
17322 SDValue N0 = N->getOperand(0);
17323 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
17324 SDValue Zero = DAG.getConstant(0, DL, VT);
17325 SDValue CCVal, CSNeg;
17326 if (Lg2 == 1) {
17327 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
17328 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17329 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
17330
17331 Created.push_back(Cmp.getNode());
17332 Created.push_back(And.getNode());
17333 } else {
17334 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
17335 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17336
17337 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
17338 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17339 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
17340 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
17341 Negs.getValue(1));
17342
17343 Created.push_back(Negs.getNode());
17344 Created.push_back(AndPos.getNode());
17345 Created.push_back(AndNeg.getNode());
17346 }
17347
17348 return CSNeg;
17349}
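
// Illustrative lowering (not from the upstream source): for i64 (srem x, 8)
// the nodes built above correspond roughly to
//   negs x1, x0
//   and  x0, x0, #7
//   and  x1, x1, #7
//   csneg x0, x0, x1, mi
// i.e. take x & 7 for positive inputs and -((-x) & 7) for negative ones.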
17350
17351static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
17352 switch(getIntrinsicID(S.getNode())) {
17353 default:
17354 break;
17355 case Intrinsic::aarch64_sve_cntb:
17356 return 8;
17357 case Intrinsic::aarch64_sve_cnth:
17358 return 16;
17359 case Intrinsic::aarch64_sve_cntw:
17360 return 32;
17361 case Intrinsic::aarch64_sve_cntd:
17362 return 64;
17363 }
17364 return {};
17365}
17366
17367/// Calculates what the pre-extend type is, based on the extension
17368/// operation node provided by \p Extend.
17369///
17370/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
17371/// pre-extend type is pulled directly from the operand, while other extend
17372/// operations need a bit more inspection to get this information.
17373///
17374/// \param Extend The SDNode from the DAG that represents the extend operation
17375///
17376/// \returns The type representing the \p Extend source type, or \p MVT::Other
17377/// if no valid type can be determined
17378 static EVT calculatePreExtendType(SDValue Extend) {
17379 switch (Extend.getOpcode()) {
17380 case ISD::SIGN_EXTEND:
17381 case ISD::ZERO_EXTEND:
17382 return Extend.getOperand(0).getValueType();
17383 case ISD::AssertSext:
17384 case ISD::AssertZext:
17386 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
17387 if (!TypeNode)
17388 return MVT::Other;
17389 return TypeNode->getVT();
17390 }
17391 case ISD::AND: {
17393 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
17394 if (!Constant)
17395 return MVT::Other;
17396
17397 uint32_t Mask = Constant->getZExtValue();
17398
17399 if (Mask == UCHAR_MAX)
17400 return MVT::i8;
17401 else if (Mask == USHRT_MAX)
17402 return MVT::i16;
17403 else if (Mask == UINT_MAX)
17404 return MVT::i32;
17405
17406 return MVT::Other;
17407 }
17408 default:
17409 return MVT::Other;
17410 }
17411}
17412
17413/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
17414/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
17415/// SExt/ZExt rather than the scalar SExt/ZExt
17416 static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
17417 EVT VT = BV.getValueType();
17418 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
17419 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
17420 return SDValue();
17421
17422 // Use the first item in the buildvector/shuffle to get the size of the
17423 // extend, and make sure it looks valid.
17424 SDValue Extend = BV->getOperand(0);
17425 unsigned ExtendOpcode = Extend.getOpcode();
17426 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
17427 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
17428 ExtendOpcode == ISD::AssertSext;
17429 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
17430 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
17431 return SDValue();
17432 // Shuffle inputs are vector, limit to SIGN_EXTEND and ZERO_EXTEND to ensure
17433 // calculatePreExtendType will work without issue.
17434 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
17435 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
17436 return SDValue();
17437
17438 // Restrict valid pre-extend data type
17439 EVT PreExtendType = calculatePreExtendType(Extend);
17440 if (PreExtendType == MVT::Other ||
17441 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
17442 return SDValue();
17443
17444 // Make sure all other operands are equally extended
17445 for (SDValue Op : drop_begin(BV->ops())) {
17446 if (Op.isUndef())
17447 continue;
17448 unsigned Opc = Op.getOpcode();
17449 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
17450 Opc == ISD::AssertSext;
17451 if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
17452 return SDValue();
17453 }
17454
17455 SDValue NBV;
17456 SDLoc DL(BV);
17457 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
17458 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
17459 EVT PreExtendLegalType =
17460 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
17462 for (SDValue Op : BV->ops())
17463 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
17464 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
17465 PreExtendLegalType));
17466 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
17467 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
17468 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
17469 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
17470 BV.getOperand(1).isUndef()
17471 ? DAG.getUNDEF(PreExtendVT)
17472 : BV.getOperand(1).getOperand(0),
17473 cast<ShuffleVectorSDNode>(BV)->getMask());
17474 }
17475 return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
17476}
17477
17478/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
17479/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
17480 static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
17481 // If the value type isn't a vector, none of the operands are going to be dups
17482 EVT VT = Mul->getValueType(0);
17483 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17484 return SDValue();
17485
17486 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
17487 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
17488
17489 // Neither operands have been changed, don't make any further changes
17490 if (!Op0 && !Op1)
17491 return SDValue();
17492
17493 SDLoc DL(Mul);
17494 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
17495 Op1 ? Op1 : Mul->getOperand(1));
17496}
17497
17498// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
17499// Same for other types with equivalent constants.
17500 static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
17501 EVT VT = N->getValueType(0);
17502 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
17503 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
17504 return SDValue();
17505 if (N->getOperand(0).getOpcode() != ISD::AND ||
17506 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
17507 return SDValue();
17508
17509 SDValue And = N->getOperand(0);
17510 SDValue Srl = And.getOperand(0);
17511
17512 APInt V1, V2, V3;
17513 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
17514 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
17516 return SDValue();
17517
17518 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
17519 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
17520 V3 != (HalfSize - 1))
17521 return SDValue();
17522
17523 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
17524 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
17525 VT.getVectorElementCount() * 2);
17526
17527 SDLoc DL(N);
17528 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
17529 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
17530 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
17531}
17532
17533 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
17534 TargetLowering::DAGCombinerInfo &DCI,
17535 const AArch64Subtarget *Subtarget) {
17536
17537 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
17538 return Ext;
17539 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
17540 return Ext;
17541
17542 if (DCI.isBeforeLegalizeOps())
17543 return SDValue();
17544
17545 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
17546 // and in MachineCombiner pass, add+mul will be combined into madd.
17547 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
17548 SDLoc DL(N);
17549 EVT VT = N->getValueType(0);
17550 SDValue N0 = N->getOperand(0);
17551 SDValue N1 = N->getOperand(1);
17552 SDValue MulOper;
17553 unsigned AddSubOpc;
17554
17555 auto IsAddSubWith1 = [&](SDValue V) -> bool {
17556 AddSubOpc = V->getOpcode();
17557 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
17558 SDValue Opnd = V->getOperand(1);
17559 MulOper = V->getOperand(0);
17560 if (AddSubOpc == ISD::SUB)
17561 std::swap(Opnd, MulOper);
17562 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
17563 return C->isOne();
17564 }
17565 return false;
17566 };
17567
17568 if (IsAddSubWith1(N0)) {
17569 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
17570 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
17571 }
17572
17573 if (IsAddSubWith1(N1)) {
17574 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
17575 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
17576 }
17577
17578 // The below optimizations require a constant RHS.
17579 if (!isa<ConstantSDNode>(N1))
17580 return SDValue();
17581
17582 ConstantSDNode *C = cast<ConstantSDNode>(N1);
17583 const APInt &ConstValue = C->getAPIntValue();
17584
17585 // Allow the scaling to be folded into the `cnt` instruction by preventing
17586 // the scaling from being obscured here. This makes it easier to pattern match.
17587 if (IsSVECntIntrinsic(N0) ||
17588 (N0->getOpcode() == ISD::TRUNCATE &&
17589 (IsSVECntIntrinsic(N0->getOperand(0)))))
17590 if (ConstValue.sge(1) && ConstValue.sle(16))
17591 return SDValue();
17592
17593 // Multiplication of a power of two plus/minus one can be done more
17594 // cheaply as shift+add/sub. For now, this is true unilaterally. If
17595 // future CPUs have a cheaper MADD instruction, this may need to be
17596 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
17597 // 64-bit is 5 cycles, so this is always a win.
17598 // More aggressively, some multiplications N0 * C can be lowered to
17599 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
17600 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
17601 // TODO: lower more cases.
17602
17603 // TrailingZeroes is used to test if the mul can be lowered to
17604 // shift+add+shift.
17605 unsigned TrailingZeroes = ConstValue.countr_zero();
17606 if (TrailingZeroes) {
17607 // Conservatively do not lower to shift+add+shift if the mul might be
17608 // folded into smul or umul.
17609 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
17610 isZeroExtended(N0, DAG)))
17611 return SDValue();
17612 // Conservatively do not lower to shift+add+shift if the mul might be
17613 // folded into madd or msub.
17614 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
17615 N->use_begin()->getOpcode() == ISD::SUB))
17616 return SDValue();
17617 }
17618 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
17619 // and shift+add+shift.
17620 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
17621 unsigned ShiftAmt;
17622
17623 auto Shl = [&](SDValue N0, unsigned N1) {
17624 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
17625 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
17626 };
17627 auto Add = [&](SDValue N0, SDValue N1) {
17628 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
17629 };
17630 auto Sub = [&](SDValue N0, SDValue N1) {
17631 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
17632 };
17633 auto Negate = [&](SDValue N) {
17634 SDValue Zero = DAG.getConstant(0, DL, VT);
17635 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
17636 };
17637
17638 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
17639 // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
17640 // the (2^N - 1) can't be executed via a single instruction.
17641 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
17642 unsigned BitWidth = C.getBitWidth();
17643 for (unsigned i = 1; i < BitWidth / 2; i++) {
17644 APInt Rem;
17645 APInt X(BitWidth, (1 << i) + 1);
17646 APInt::sdivrem(C, X, N, Rem);
17647 APInt NVMinus1 = N - 1;
17648 if (Rem == 0 && NVMinus1.isPowerOf2()) {
17649 M = X;
17650 return true;
17651 }
17652 }
17653 return false;
17654 };
17655
17656 // Can the const C be decomposed into ((2^M + 1) * 2^N + 1), eg:
17657 // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1 as
17658 // the (2^N - 1) can't be executed via a single instruction.
17659 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
17660 APInt CVMinus1 = C - 1;
17661 if (CVMinus1.isNegative())
17662 return false;
17663 unsigned TrailingZeroes = CVMinus1.countr_zero();
17664 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
17665 if (SCVMinus1.isPowerOf2()) {
17666 unsigned BitWidth = SCVMinus1.getBitWidth();
17667 M = APInt(BitWidth, SCVMinus1.logBase2());
17668 N = APInt(BitWidth, TrailingZeroes);
17669 return true;
17670 }
17671 return false;
17672 };
17673
17674 if (ConstValue.isNonNegative()) {
17675 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
17676 // (mul x, 2^N - 1) => (sub (shl x, N), x)
17677 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
17678 // (mul x, (2^M + 1) * (2^N + 1))
17679 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
17680 // (mul x, (2^M + 1) * 2^N + 1))
17681 // => MV = add (shl x, M), x); add (shl MV, N), x)
17682 APInt SCVMinus1 = ShiftedConstValue - 1;
17683 APInt SCVPlus1 = ShiftedConstValue + 1;
17684 APInt CVPlus1 = ConstValue + 1;
17685 APInt CVM, CVN;
17686 if (SCVMinus1.isPowerOf2()) {
17687 ShiftAmt = SCVMinus1.logBase2();
17688 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
17689 } else if (CVPlus1.isPowerOf2()) {
17690 ShiftAmt = CVPlus1.logBase2();
17691 return Sub(Shl(N0, ShiftAmt), N0);
17692 } else if (SCVPlus1.isPowerOf2()) {
17693 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17694 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
17695 }
17696 if (Subtarget->hasALULSLFast() &&
17697 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
17698 APInt CVMMinus1 = CVM - 1;
17699 APInt CVNMinus1 = CVN - 1;
17700 unsigned ShiftM1 = CVMMinus1.logBase2();
17701 unsigned ShiftN1 = CVNMinus1.logBase2();
17702 // ALULSLFast implies that shifts by <= 4 places are fast
17703 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
17704 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
17705 return Add(Shl(MVal, ShiftN1), MVal);
17706 }
17707 }
17708 if (Subtarget->hasALULSLFast() &&
17709 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
17710 unsigned ShiftM = CVM.getZExtValue();
17711 unsigned ShiftN = CVN.getZExtValue();
17712 // ALULSLFast implies that shifts by <= 4 places are fast
17713 if (ShiftM <= 4 && ShiftN <= 4) {
17714 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
17715 return Add(Shl(MVal, CVN.getZExtValue()), N0);
17716 }
17717 }
17718 } else {
17719 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
17720 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
17721 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
17722 APInt SCVPlus1 = -ShiftedConstValue + 1;
17723 APInt CVNegPlus1 = -ConstValue + 1;
17724 APInt CVNegMinus1 = -ConstValue - 1;
17725 if (CVNegPlus1.isPowerOf2()) {
17726 ShiftAmt = CVNegPlus1.logBase2();
17727 return Sub(N0, Shl(N0, ShiftAmt));
17728 } else if (CVNegMinus1.isPowerOf2()) {
17729 ShiftAmt = CVNegMinus1.logBase2();
17730 return Negate(Add(Shl(N0, ShiftAmt), N0));
17731 } else if (SCVPlus1.isPowerOf2()) {
17732 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17733 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
17734 }
17735 }
17736
17737 return SDValue();
17738}
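
// Worked examples (illustrative, not from the upstream source):
//   x * 6   => (shl (add (shl x, 1), x), 1)                 since 6 = (2+1)*2
//   x * -7  => (sub x, (shl x, 3))                           since -7 = -(8-1)
//   x * 45  => MV = (add (shl x, 2), x); (add (shl MV, 3), MV)
//              since 45 = 5*9; this last form needs the ALULSLFast feature.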
17739
17740 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
17741 SelectionDAG &DAG) {
17742 // Take advantage of vector comparisons producing 0 or -1 in each lane to
17743 // optimize away operation when it's from a constant.
17744 //
17745 // The general transformation is:
17746 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
17747 // AND(VECTOR_CMP(x,y), constant2)
17748 // constant2 = UNARYOP(constant)
17749
17750 // Early exit if this isn't a vector operation, the operand of the
17751 // unary operation isn't a bitwise AND, or if the sizes of the operations
17752 // aren't the same.
17753 EVT VT = N->getValueType(0);
17754 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
17755 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
17756 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
17757 return SDValue();
17758
17759 // Now check that the other operand of the AND is a constant. We could
17760 // make the transformation for non-constant splats as well, but it's unclear
17761 // that would be a benefit as it would not eliminate any operations, just
17762 // perform one more step in scalar code before moving to the vector unit.
17763 if (BuildVectorSDNode *BV =
17764 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
17765 // Bail out if the vector isn't a constant.
17766 if (!BV->isConstant())
17767 return SDValue();
17768
17769 // Everything checks out. Build up the new and improved node.
17770 SDLoc DL(N);
17771 EVT IntVT = BV->getValueType(0);
17772 // Create a new constant of the appropriate type for the transformed
17773 // DAG.
17774 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
17775 // The AND node needs bitcasts to/from an integer vector type around it.
17776 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
17777 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
17778 N->getOperand(0)->getOperand(0), MaskConst);
17779 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
17780 return Res;
17781 }
17782
17783 return SDValue();
17784}
17785
17786 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
17787 const AArch64Subtarget *Subtarget) {
17788 // First try to optimize away the conversion when it's conditionally from
17789 // a constant. Vectors only.
17790 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
17791 return Res;
17792
17793 EVT VT = N->getValueType(0);
17794 if (VT != MVT::f32 && VT != MVT::f64)
17795 return SDValue();
17796
17797 // Only optimize when the source and destination types have the same width.
17798 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
17799 return SDValue();
17800
17801 // If the result of an integer load is only used by an integer-to-float
17802 // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
17803 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
17804 SDValue N0 = N->getOperand(0);
17805 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
17806 N0.hasOneUse() &&
17807 // Do not change the width of a volatile load.
17808 !cast<LoadSDNode>(N0)->isVolatile()) {
17809 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
17810 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
17811 LN0->getPointerInfo(), LN0->getAlign(),
17812 LN0->getMemOperand()->getFlags());
17813
17814 // Make sure successors of the original load stay after it by updating them
17815 // to use the new Chain.
17816 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
17817
17818 unsigned Opcode =
17820 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
17821 }
17822
17823 return SDValue();
17824}
17825
17826/// Fold a floating-point multiply by power of two into floating-point to
17827/// fixed-point conversion.
17828 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
17829 TargetLowering::DAGCombinerInfo &DCI,
17830 const AArch64Subtarget *Subtarget) {
17831 if (!Subtarget->isNeonAvailable())
17832 return SDValue();
17833
17834 if (!N->getValueType(0).isSimple())
17835 return SDValue();
17836
17837 SDValue Op = N->getOperand(0);
17838 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
17839 return SDValue();
17840
17841 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
17842 return SDValue();
17843
17844 SDValue ConstVec = Op->getOperand(1);
17845 if (!isa<BuildVectorSDNode>(ConstVec))
17846 return SDValue();
17847
17848 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
17849 uint32_t FloatBits = FloatTy.getSizeInBits();
17850 if (FloatBits != 32 && FloatBits != 64 &&
17851 (FloatBits != 16 || !Subtarget->hasFullFP16()))
17852 return SDValue();
17853
17854 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
17855 uint32_t IntBits = IntTy.getSizeInBits();
17856 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17857 return SDValue();
17858
17859 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
17860 if (IntBits > FloatBits)
17861 return SDValue();
17862
17863 BitVector UndefElements;
17864 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17865 int32_t Bits = IntBits == 64 ? 64 : 32;
17866 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
17867 if (C == -1 || C == 0 || C > Bits)
17868 return SDValue();
17869
17870 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
17871 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
17872 return SDValue();
17873
17874 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
17875 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
17876 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
17877 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
17878 return SDValue();
17879 }
17880
17881 SDLoc DL(N);
17882 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
17883 N->getOpcode() == ISD::FP_TO_SINT_SAT);
17884 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
17885 : Intrinsic::aarch64_neon_vcvtfp2fxu;
17886 SDValue FixConv =
17888 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
17889 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
17890 // We can handle smaller integers by generating an extra trunc.
17891 if (IntBits < FloatBits)
17892 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
17893
17894 return FixConv;
17895}
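
// Illustrative example (not from the upstream source): fptosi(fmul v4f32 x,
// splat 8.0) is matched here and emitted as the fixed-point conversion
// intrinsic with 3 fractional bits, i.e. a single fcvtzs v0.4s, v0.4s, #3.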
17896
17897/// Fold a floating-point divide by power of two into fixed-point to
17898/// floating-point conversion.
17899 static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
17900 TargetLowering::DAGCombinerInfo &DCI,
17901 const AArch64Subtarget *Subtarget) {
17902 if (!Subtarget->hasNEON())
17903 return SDValue();
17904
17905 SDValue Op = N->getOperand(0);
17906 unsigned Opc = Op->getOpcode();
17907 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
17908 !Op.getOperand(0).getValueType().isSimple() ||
17909 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
17910 return SDValue();
17911
17912 SDValue ConstVec = N->getOperand(1);
17913 if (!isa<BuildVectorSDNode>(ConstVec))
17914 return SDValue();
17915
17916 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17917 int32_t IntBits = IntTy.getSizeInBits();
17918 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17919 return SDValue();
17920
17921 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17922 int32_t FloatBits = FloatTy.getSizeInBits();
17923 if (FloatBits != 32 && FloatBits != 64)
17924 return SDValue();
17925
17926 // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
17927 if (IntBits > FloatBits)
17928 return SDValue();
17929
17930 BitVector UndefElements;
17931 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17932 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
17933 if (C == -1 || C == 0 || C > FloatBits)
17934 return SDValue();
17935
17936 MVT ResTy;
17937 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17938 switch (NumLanes) {
17939 default:
17940 return SDValue();
17941 case 2:
17942 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
17943 break;
17944 case 4:
17945 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
17946 break;
17947 }
17948
17949 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
17950 return SDValue();
17951
17952 SDLoc DL(N);
17953 SDValue ConvInput = Op.getOperand(0);
17954 bool IsSigned = Opc == ISD::SINT_TO_FP;
17955 if (IntBits < FloatBits)
17956 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17957 ResTy, ConvInput);
17958
17959 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
17960 : Intrinsic::aarch64_neon_vcvtfxu2fp;
17961 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17962 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17963 DAG.getConstant(C, DL, MVT::i32));
17964}
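
// Illustrative example (not from the upstream source): (sitofp v2i32 x)
// divided by splat 16.0 becomes the fixed-point conversion intrinsic with 4
// fractional bits, i.e. a single scvtf v0.2s, v0.2s, #4.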
17965
17966 static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
17967 const AArch64TargetLowering &TLI) {
17968 EVT VT = N->getValueType(0);
17969 SelectionDAG &DAG = DCI.DAG;
17970 SDLoc DL(N);
17971 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
17972
17973 if (!VT.isVector())
17974 return SDValue();
17975
17976 // The combining code works for NEON, SVE2 and SME.
17977 if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) ||
17978 (VT.isScalableVector() && !Subtarget.hasSVE2()))
17979 return SDValue();
17980
17981 SDValue N0 = N->getOperand(0);
17982 if (N0.getOpcode() != ISD::AND)
17983 return SDValue();
17984
17985 SDValue N1 = N->getOperand(1);
17986 if (N1.getOpcode() != ISD::AND)
17987 return SDValue();
17988
17989 // InstCombine does (not (neg a)) => (add a -1).
17990 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
17991 // Loop over all combinations of AND operands.
17992 for (int i = 1; i >= 0; --i) {
17993 for (int j = 1; j >= 0; --j) {
17994 SDValue O0 = N0->getOperand(i);
17995 SDValue O1 = N1->getOperand(j);
17996 SDValue Sub, Add, SubSibling, AddSibling;
17997
17998 // Find a SUB and an ADD operand, one from each AND.
17999 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
18000 Sub = O0;
18001 Add = O1;
18002 SubSibling = N0->getOperand(1 - i);
18003 AddSibling = N1->getOperand(1 - j);
18004 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
18005 Add = O0;
18006 Sub = O1;
18007 AddSibling = N0->getOperand(1 - i);
18008 SubSibling = N1->getOperand(1 - j);
18009 } else
18010 continue;
18011
18013 continue;
18014
18015 // The all-ones constant is always the right-hand operand of the Add.
18016 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
18017 continue;
18018
18019 if (Sub.getOperand(1) != Add.getOperand(0))
18020 continue;
18021
18022 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
18023 }
18024 }
18025
18026 // (or (and a b) (and (not a) c)) => (bsl a b c)
18027 // We only have to look for constant vectors here since the general, variable
18028 // case can be handled in TableGen.
18029 unsigned Bits = VT.getScalarSizeInBits();
18030 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
18031 for (int i = 1; i >= 0; --i)
18032 for (int j = 1; j >= 0; --j) {
18033 APInt Val1, Val2;
18034
18035 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
18037 (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
18038 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
18039 N0->getOperand(1 - i), N1->getOperand(1 - j));
18040 }
18041 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
18042 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
18043 if (!BVN0 || !BVN1)
18044 continue;
18045
18046 bool FoundMatch = true;
18047 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
18048 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
18049 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
18050 if (!CN0 || !CN1 ||
18051 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
18052 FoundMatch = false;
18053 break;
18054 }
18055 }
18056 if (FoundMatch)
18057 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
18058 N0->getOperand(1 - i), N1->getOperand(1 - j));
18059 }
18060
18061 return SDValue();
18062}
18063
18064// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
18065// convert to csel(ccmp(.., cc0)), depending on cc1:
18066
18067// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18068// =>
18069// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
18070//
18071// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18072// =>
18073// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
18074 static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
18075 EVT VT = N->getValueType(0);
18076 SDValue CSel0 = N->getOperand(0);
18077 SDValue CSel1 = N->getOperand(1);
18078
18079 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
18080 CSel1.getOpcode() != AArch64ISD::CSEL)
18081 return SDValue();
18082
18083 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
18084 return SDValue();
18085
18086 if (!isNullConstant(CSel0.getOperand(0)) ||
18087 !isOneConstant(CSel0.getOperand(1)) ||
18088 !isNullConstant(CSel1.getOperand(0)) ||
18089 !isOneConstant(CSel1.getOperand(1)))
18090 return SDValue();
18091
18092 SDValue Cmp0 = CSel0.getOperand(3);
18093 SDValue Cmp1 = CSel1.getOperand(3);
18096 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
18097 return SDValue();
18098 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
18099 Cmp0.getOpcode() == AArch64ISD::SUBS) {
18100 std::swap(Cmp0, Cmp1);
18101 std::swap(CC0, CC1);
18102 }
18103
18104 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
18105 return SDValue();
18106
18107 SDLoc DL(N);
18108 SDValue CCmp, Condition;
18109 unsigned NZCV;
18110
18111 if (N->getOpcode() == ISD::AND) {
18113 Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
18115 } else {
18117 Condition = DAG.getConstant(CC0, DL, MVT_CC);
18119 }
18120
18121 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
18122
18123 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
18124 if (Op1 && Op1->getAPIntValue().isNegative() &&
18125 Op1->getAPIntValue().sgt(-32)) {
18126 // CCMP accepts constants in the range [0, 31],
18127 // so if Op1 is a constant in the range [-31, -1], we
18128 // can select CCMN instead to avoid the extra mov.
18129 SDValue AbsOp1 =
18130 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
18131 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
18132 NZCVOp, Condition, Cmp0);
18133 } else {
18134 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
18135 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
18136 }
18137 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
18138 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
18139 CCmp);
18140}
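
// Illustrative example (not from the upstream source): an AND of two i1
// compare results, e.g. (x0 == x1) & (x2 < x3), becomes
//   cmp  x0, x1
//   ccmp x2, x3, #<nzcv>, <cond derived from the first compare>
//   cset w0, lt
// so a single branch-free flag chain replaces two csets and an and.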
18141
18142 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18143 const AArch64Subtarget *Subtarget,
18144 const AArch64TargetLowering &TLI) {
18145 SelectionDAG &DAG = DCI.DAG;
18146 EVT VT = N->getValueType(0);
18147
18148 if (SDValue R = performANDORCSELCombine(N, DAG))
18149 return R;
18150
18151 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18152 return SDValue();
18153
18154 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
18155 return Res;
18156
18157 return SDValue();
18158}
18159
18160 static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
18161 if (!MemVT.getVectorElementType().isSimple())
18162 return false;
18163
18164 uint64_t MaskForTy = 0ull;
18165 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
18166 case MVT::i8:
18167 MaskForTy = 0xffull;
18168 break;
18169 case MVT::i16:
18170 MaskForTy = 0xffffull;
18171 break;
18172 case MVT::i32:
18173 MaskForTy = 0xffffffffull;
18174 break;
18175 default:
18176 return false;
18177 break;
18178 }
18179
18180 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
18181 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
18182 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
18183
18184 return false;
18185}
18186
18188 SDValue LeafOp = SDValue(N, 0);
18189 SDValue Op = N->getOperand(0);
18190 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
18191 LeafOp.getValueType() != Op.getValueType())
18192 Op = Op->getOperand(0);
18193 if (LeafOp.getValueType() == Op.getValueType())
18194 return Op;
18195 return SDValue();
18196}
18197
18198 static SDValue performSVEAndCombine(SDNode *N,
18199 TargetLowering::DAGCombinerInfo &DCI) {
18200 SelectionDAG &DAG = DCI.DAG;
18201 SDValue Src = N->getOperand(0);
18202 unsigned Opc = Src->getOpcode();
18203
18204 // Zero/any extend of an unsigned unpack
18205 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
18206 SDValue UnpkOp = Src->getOperand(0);
18207 SDValue Dup = N->getOperand(1);
18208
18209 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
18210 return SDValue();
18211
18212 SDLoc DL(N);
18213 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
18214 if (!C)
18215 return SDValue();
18216
18217 uint64_t ExtVal = C->getZExtValue();
18218
18219 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
18220 return ((ExtVal == 0xFF && VT == MVT::i8) ||
18221 (ExtVal == 0xFFFF && VT == MVT::i16) ||
18222 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
18223 };
18224
18225 // If the mask is fully covered by the unpack, we don't need to push
18226 // a new AND onto the operand
18227 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
18228 if (MaskAndTypeMatch(EltTy))
18229 return Src;
18230
18231 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
18232 // to see if the mask is all-ones of size MemTy.
18233 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
18234 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
18235 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
18236 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
18237 if (MaskAndTypeMatch(EltTy))
18238 return Src;
18239 }
18240
18241 // Truncate to prevent a DUP with an over-wide constant.
18242 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
18243
18244 // Otherwise, make sure we propagate the AND to the operand
18245 // of the unpack
18246 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
18247 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
18248
18249 SDValue And = DAG.getNode(ISD::AND, DL,
18250 UnpkOp->getValueType(0), UnpkOp, Dup);
18251
18252 return DAG.getNode(Opc, DL, N->getValueType(0), And);
18253 }
18254
18255 if (DCI.isBeforeLegalizeOps())
18256 return SDValue();
18257
18258 // If both sides of AND operations are i1 splat_vectors then
18259 // we can produce just i1 splat_vector as the result.
18260 if (isAllActivePredicate(DAG, N->getOperand(0)))
18261 return N->getOperand(1);
18262 if (isAllActivePredicate(DAG, N->getOperand(1)))
18263 return N->getOperand(0);
18264
18265 if (!EnableCombineMGatherIntrinsics)
18266 return SDValue();
18267
18268 SDValue Mask = N->getOperand(1);
18269
18270 if (!Src.hasOneUse())
18271 return SDValue();
18272
18273 EVT MemVT;
18274
18275 // SVE load instructions perform an implicit zero-extend, which makes them
18276 // perfect candidates for combining.
18277 switch (Opc) {
18278 case AArch64ISD::LD1_MERGE_ZERO:
18279 case AArch64ISD::LDNF1_MERGE_ZERO:
18280 case AArch64ISD::LDFF1_MERGE_ZERO:
18281 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
18282 break;
18283 case AArch64ISD::GLD1_MERGE_ZERO:
18284 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
18285 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
18286 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
18287 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
18288 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
18289 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
18290 case AArch64ISD::GLDFF1_MERGE_ZERO:
18291 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
18292 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
18293 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
18294 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
18295 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
18296 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
18297 case AArch64ISD::GLDNT1_MERGE_ZERO:
18298 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
18299 break;
18300 default:
18301 return SDValue();
18302 }
18303
18304 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
18305 return Src;
18306
18307 return SDValue();
18308}
18309
18310// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
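// For example (schematic): and (setcc f32 a, b, setolt), (setcc f32 c, d, setogt)
// can be emitted as one conjunction -- an fcmp followed by an fccmp -- whose
// result is materialised with a single CSINC, rather than materialising and
// AND-ing two separate i1 values.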
18311static SDValue performANDSETCCCombine(SDNode *N,
18312 TargetLowering::DAGCombinerInfo &DCI) {
18313
18314 // This function performs an optimization on a specific pattern involving
18315 // an AND operation and SETCC (Set Condition Code) node.
18316
18317 SDValue SetCC = N->getOperand(0);
18318 EVT VT = N->getValueType(0);
18319 SelectionDAG &DAG = DCI.DAG;
18320
18321 // If the current node (N) is used by any SELECT instruction, return an
18322 // empty SDValue to avoid applying the optimization, since it could
18323 // produce incorrect results.
18324 for (auto U : N->uses())
18325 if (U->getOpcode() == ISD::SELECT)
18326 return SDValue();
18327
18328 // Check if the operand is a SETCC node with floating-point comparison
18329 if (SetCC.getOpcode() == ISD::SETCC &&
18330 SetCC.getOperand(0).getValueType() == MVT::f32) {
18331
18332 SDValue Cmp;
18333 AArch64CC::CondCode CC;
18334
18335 // Check if the DAG is after legalization and if we can emit the conjunction
18336 if (!DCI.isBeforeLegalize() &&
18337 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
18338
18339 AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(CC);
18340
18341 SDLoc DL(N);
18342 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
18343 DAG.getConstant(0, DL, VT),
18344 DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
18345 }
18346 }
18347 return SDValue();
18348}
18349
18350static SDValue performANDCombine(SDNode *N,
18351 TargetLowering::DAGCombinerInfo &DCI) {
18352 SelectionDAG &DAG = DCI.DAG;
18353 SDValue LHS = N->getOperand(0);
18354 SDValue RHS = N->getOperand(1);
18355 EVT VT = N->getValueType(0);
18356
18357 if (SDValue R = performANDORCSELCombine(N, DAG))
18358 return R;
18359
18360 if (SDValue R = performANDSETCCCombine(N,DCI))
18361 return R;
18362
18363 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18364 return SDValue();
18365
18366 if (VT.isScalableVector())
18367 return performSVEAndCombine(N, DCI);
18368
18369 // The combining code below works only for NEON vectors. In particular, it
18370 // does not work for SVE when dealing with vectors wider than 128 bits.
18371 if (!VT.is64BitVector() && !VT.is128BitVector())
18372 return SDValue();
18373
18374 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
18375 if (!BVN)
18376 return SDValue();
18377
18378 // AND does not accept an immediate, so check if we can use a BIC immediate
18379 // instruction instead. We do this here instead of using a (and x, (mvni imm))
18380 // pattern in isel, because some immediates may be lowered to the preferred
18381 // (and x, (movi imm)) form, even though an mvni representation also exists.
18382 APInt DefBits(VT.getSizeInBits(), 0);
18383 APInt UndefBits(VT.getSizeInBits(), 0);
18384 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
18385 SDValue NewOp;
18386
18387 // Any bits known to already be 0 need not be cleared again, which can help
18388 // reduce the size of the immediate to one supported by the instruction.
18389 KnownBits Known = DAG.computeKnownBits(LHS);
18390 APInt ZeroSplat(VT.getSizeInBits(), 0);
18391 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
18392 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
18393 << (Known.Zero.getBitWidth() * I);
18394
18395 DefBits = ~(DefBits | ZeroSplat);
18396 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18397 DefBits, &LHS)) ||
18398 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18399 DefBits, &LHS)))
18400 return NewOp;
18401
18402 UndefBits = ~(UndefBits | ZeroSplat);
18403 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18404 UndefBits, &LHS)) ||
18405 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18406 UndefBits, &LHS)))
18407 return NewOp;
18408 }
18409
18410 return SDValue();
18411}
18412
18413static SDValue performFADDCombine(SDNode *N,
18414 TargetLowering::DAGCombinerInfo &DCI) {
18415 SelectionDAG &DAG = DCI.DAG;
18416 SDValue LHS = N->getOperand(0);
18417 SDValue RHS = N->getOperand(1);
18418 EVT VT = N->getValueType(0);
18419 SDLoc DL(N);
18420
18421 if (!N->getFlags().hasAllowReassociation())
18422 return SDValue();
18423
18424 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
18425 auto ReassocComplex = [&](SDValue A, SDValue B) {
18426 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
18427 return SDValue();
18428 unsigned Opc = A.getConstantOperandVal(0);
18429 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
18430 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
18431 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
18432 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
18433 return SDValue();
18434 SDValue VCMLA = DAG.getNode(
18435 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
18436 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
18437 A.getOperand(2), A.getOperand(3));
18438 VCMLA->setFlags(A->getFlags());
18439 return VCMLA;
18440 };
18441 if (SDValue R = ReassocComplex(LHS, RHS))
18442 return R;
18443 if (SDValue R = ReassocComplex(RHS, LHS))
18444 return R;
18445
18446 return SDValue();
18447}
18448
18449static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
18450 switch (Opcode) {
18451 case ISD::STRICT_FADD:
18452 case ISD::FADD:
18453 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
18454 case ISD::ADD:
18455 return VT == MVT::i64;
18456 default:
18457 return false;
18458 }
18459}
18460
18461static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
18462 AArch64CC::CondCode Cond);
18463
18464static bool isPredicateCCSettingOp(SDValue N) {
18465 if ((N.getOpcode() == ISD::SETCC) ||
18466 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
18467 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
18468 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
18469 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
18470 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
18471 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
18472 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
18473 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
18474 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
18475 // get_active_lane_mask is lowered to a whilelo instruction.
18476 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
18477 return true;
18478
18479 return false;
18480}
18481
18482// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
18483// ... into: "ptrue p, all" + PTEST
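// That is, instead of moving predicate lane 0 into a general-purpose register,
// the i1 result is recreated by testing the predicate against an all-true
// governing predicate and reading back the "first active" condition.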
18484static SDValue
18485performFirstTrueTestVectorCombine(SDNode *N,
18486 TargetLowering::DAGCombinerInfo &DCI,
18487 const AArch64Subtarget *Subtarget) {
18488 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18489 // Make sure PTEST can be legalised with illegal types.
18490 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18491 return SDValue();
18492
18493 SDValue N0 = N->getOperand(0);
18494 EVT VT = N0.getValueType();
18495
18496 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
18497 !isNullConstant(N->getOperand(1)))
18498 return SDValue();
18499
18500 // Restrict the DAG combine to only cases where we're extracting from a
18501 // flag-setting operation.
18502 if (!isPredicateCCSettingOp(N0))
18503 return SDValue();
18504
18505 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
18506 SelectionDAG &DAG = DCI.DAG;
18507 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
18508 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
18509}
18510
18511// Materialize : Idx = (add (mul vscale, NumEls), -1)
18512// i1 = extract_vector_elt t37, Constant:i64<Idx>
18513// ... into: "ptrue p, all" + PTEST
18514static SDValue
18515performLastTrueTestVectorCombine(SDNode *N,
18516 TargetLowering::DAGCombinerInfo &DCI,
18517 const AArch64Subtarget *Subtarget) {
18518 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18519 // Make sure PTEST can be legalised with illegal types.
18520 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18521 return SDValue();
18522
18523 SDValue N0 = N->getOperand(0);
18524 EVT OpVT = N0.getValueType();
18525
18526 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
18527 return SDValue();
18528
18529 // Idx == (add (mul vscale, NumEls), -1)
18530 SDValue Idx = N->getOperand(1);
18531 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
18532 return SDValue();
18533
18534 SDValue VS = Idx.getOperand(0);
18535 if (VS.getOpcode() != ISD::VSCALE)
18536 return SDValue();
18537
18538 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
18539 if (VS.getConstantOperandVal(0) != NumEls)
18540 return SDValue();
18541
18542 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
18543 SelectionDAG &DAG = DCI.DAG;
18544 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
18545 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
18546}
18547
18548static SDValue
18549performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18550 const AArch64Subtarget *Subtarget) {
18551 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18552 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
18553 return Res;
18554 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
18555 return Res;
18556
18557 SelectionDAG &DAG = DCI.DAG;
18558 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18559
18560 EVT VT = N->getValueType(0);
18561 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
18562 bool IsStrict = N0->isStrictFPOpcode();
18563
18564 // extract(dup x) -> x
18565 if (N0.getOpcode() == AArch64ISD::DUP)
18566 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
18567 : N0.getOperand(0);
18568
18569 // Rewrite for pairwise fadd pattern
18570 // (f32 (extract_vector_elt
18571 // (fadd (vXf32 Other)
18572 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
18573 // ->
18574 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
18575 // (extract_vector_elt (vXf32 Other) 1))
18576 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
18577 // we can only do this when it's used only by the extract_vector_elt.
18578 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
18579 (!IsStrict || N0.hasOneUse())) {
18580 SDLoc DL(N0);
18581 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
18582 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
18583
18584 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
18585 SDValue Other = N00;
18586
18587 // And handle the commutative case.
18588 if (!Shuffle) {
18589 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
18590 Other = N01;
18591 }
18592
18593 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
18594 Other == Shuffle->getOperand(0)) {
18595 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18596 DAG.getConstant(0, DL, MVT::i64));
18597 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18598 DAG.getConstant(1, DL, MVT::i64));
18599 if (!IsStrict)
18600 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
18601
18602 // For strict_fadd we need uses of the final extract_vector to be replaced
18603 // with the strict_fadd, but we also need uses of the chain output of the
18604 // original strict_fadd to use the chain output of the new strict_fadd as
18605 // otherwise it may not be deleted.
18606 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
18607 {VT, MVT::Other},
18608 {N0->getOperand(0), Extract1, Extract2});
18609 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
18610 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
18611 return SDValue(N, 0);
18612 }
18613 }
18614
18615 return SDValue();
18616}
18617
18618static SDValue performConcatVectorsCombine(SDNode *N,
18619 TargetLowering::DAGCombinerInfo &DCI,
18620 SelectionDAG &DAG) {
18621 SDLoc dl(N);
18622 EVT VT = N->getValueType(0);
18623 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18624 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
18625
18626 if (VT.isScalableVector())
18627 return SDValue();
18628
18629 // Optimize concat_vectors of truncated vectors, where the intermediate
18630 // type is illegal, to avoid said illegality, e.g.,
18631 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
18632 // (v2i16 (truncate (v2i64)))))
18633 // ->
18634 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
18635 // (v4i32 (bitcast (v2i64))),
18636 // <0, 2, 4, 6>)))
18637 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
18638 // on both input and result type, so we might generate worse code.
18639 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
18640 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18641 N1Opc == ISD::TRUNCATE) {
18642 SDValue N00 = N0->getOperand(0);
18643 SDValue N10 = N1->getOperand(0);
18644 EVT N00VT = N00.getValueType();
18645
18646 if (N00VT == N10.getValueType() &&
18647 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
18648 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
18649 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
18650 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
18651 for (size_t i = 0; i < Mask.size(); ++i)
18652 Mask[i] = i * 2;
18653 return DAG.getNode(ISD::TRUNCATE, dl, VT,
18654 DAG.getVectorShuffle(
18655 MidVT, dl,
18656 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
18657 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
18658 }
18659 }
18660
18661 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
18662 N->getOperand(0).getValueType() == MVT::v2i16 ||
18663 N->getOperand(0).getValueType() == MVT::v2i8) {
18664 EVT SrcVT = N->getOperand(0).getValueType();
18665 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
18666 // loads to prevent having to go through the v4i8 load legalization that
18667 // needs to extend each element into a larger type.
18668 if (N->getNumOperands() % 2 == 0 &&
18669 all_of(N->op_values(), [SrcVT](SDValue V) {
18670 if (V.getValueType() != SrcVT)
18671 return false;
18672 if (V.isUndef())
18673 return true;
18674 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
18675 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
18676 LD->getExtensionType() == ISD::NON_EXTLOAD;
18677 })) {
18678 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
18679 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
18680 SmallVector<SDValue> Ops;
18681
18682 for (unsigned i = 0; i < N->getNumOperands(); i++) {
18683 SDValue V = N->getOperand(i);
18684 if (V.isUndef())
18685 Ops.push_back(DAG.getUNDEF(FVT));
18686 else {
18687 LoadSDNode *LD = cast<LoadSDNode>(V);
18688 SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
18689 LD->getBasePtr(), LD->getMemOperand());
18690 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
18691 Ops.push_back(NewLoad);
18692 }
18693 }
18694 return DAG.getBitcast(N->getValueType(0),
18695 DAG.getBuildVector(NVT, dl, Ops));
18696 }
18697 }
18698
18699 // Canonicalise concat_vectors to replace concatenations of truncated nots
18700 // with nots of concatenated truncates. This in some cases allows for multiple
18701 // redundant negations to be eliminated.
18702 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
18703 // (v4i16 (truncate (not (v4i32)))))
18704 // ->
18705 // (not (concat_vectors (v4i16 (truncate (v4i32))),
18706 // (v4i16 (truncate (v4i32)))))
18707 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18708 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
18709 N->isOnlyUserOf(N1.getNode())) {
18710 auto isBitwiseVectorNegate = [](SDValue V) {
18711 return V->getOpcode() == ISD::XOR &&
18712 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
18713 };
18714 SDValue N00 = N0->getOperand(0);
18715 SDValue N10 = N1->getOperand(0);
18716 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
18717 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
18718 return DAG.getNOT(
18719 dl,
18720 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18721 DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
18722 N00->getOperand(0)),
18723 DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
18724 N10->getOperand(0))),
18725 VT);
18726 }
18727 }
18728
18729 // Wait till after everything is legalized to try this. That way we have
18730 // legal vector types and such.
18731 if (DCI.isBeforeLegalizeOps())
18732 return SDValue();
18733
18734 // Optimise concat_vectors of two identical binops with a 128-bit destination
18735 // size, combining into a binop of two concats of the source vectors. E.g.:
18736 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
18737 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
18738 DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() &&
18739 N1->hasOneUse()) {
18740 SDValue N00 = N0->getOperand(0);
18741 SDValue N01 = N0->getOperand(1);
18742 SDValue N10 = N1->getOperand(0);
18743 SDValue N11 = N1->getOperand(1);
18744
18745 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
18746 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N00, N10);
18747 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N01, N11);
18748 return DAG.getNode(N0Opc, dl, VT, Concat0, Concat1);
18749 }
18750 }
18751
18752 auto IsRSHRN = [](SDValue Shr) {
18753 if (Shr.getOpcode() != AArch64ISD::VLSHR)
18754 return false;
18755 SDValue Op = Shr.getOperand(0);
18756 EVT VT = Op.getValueType();
18757 unsigned ShtAmt = Shr.getConstantOperandVal(1);
18758 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
18759 return false;
18760
18761 APInt Imm;
18762 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
18763 Imm = APInt(VT.getScalarSizeInBits(),
18764 Op.getOperand(1).getConstantOperandVal(0)
18765 << Op.getOperand(1).getConstantOperandVal(1));
18766 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
18767 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
18768 Imm = APInt(VT.getScalarSizeInBits(),
18769 Op.getOperand(1).getConstantOperandVal(0));
18770 else
18771 return false;
18772
18773 if (Imm != 1ULL << (ShtAmt - 1))
18774 return false;
18775 return true;
18776 };
18777
18778 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
18779 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
18780 ((IsRSHRN(N1) &&
18781 N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
18782 N1.isUndef())) {
18783 SDValue X = N0.getOperand(0).getOperand(0);
18784 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
18785 : N1.getOperand(0).getOperand(0);
18786 EVT BVT =
18787 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
18788 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y);
18789 SDValue Add = DAG.getNode(
18790 ISD::ADD, dl, BVT, CC,
18791 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT));
18792 SDValue Shr =
18793 DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1));
18794 return Shr;
18795 }
18796
18797 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
18798 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
18799 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
18800 N0.getOperand(1) == N1.getOperand(1)) {
18801 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0),
18802 DAG.getUNDEF(N0.getValueType()));
18803 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1),
18804 DAG.getUNDEF(N0.getValueType()));
18805 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1);
18806 }
18807
18808 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
18809 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
18810 // canonicalise to that.
18811 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
18812 assert(VT.getScalarSizeInBits() == 64);
18813 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
18814 DAG.getConstant(0, dl, MVT::i64));
18815 }
18816
18817 // Canonicalise concat_vectors so that the right-hand vector has as few
18818 // bit-casts as possible before its real operation. The primary matching
18819 // destination for these operations will be the narrowing "2" instructions,
18820 // which depend on the operation being performed on this right-hand vector.
18821 // For example,
18822 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
18823 // becomes
18824 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
18825
18826 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
18827 return SDValue();
18828 SDValue RHS = N1->getOperand(0);
18829 MVT RHSTy = RHS.getValueType().getSimpleVT();
18830 // If the RHS is not a vector, this is not the pattern we're looking for.
18831 if (!RHSTy.isVector())
18832 return SDValue();
18833
18834 LLVM_DEBUG(
18835 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
18836
18837 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
18838 RHSTy.getVectorNumElements() * 2);
18839 return DAG.getNode(ISD::BITCAST, dl, VT,
18840 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
18841 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
18842 RHS));
18843}
18844
18845static SDValue
18846performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18847 SelectionDAG &DAG) {
18848 if (DCI.isBeforeLegalizeOps())
18849 return SDValue();
18850
18851 EVT VT = N->getValueType(0);
18852 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
18853 return SDValue();
18854
18855 SDValue V = N->getOperand(0);
18856
18857 // NOTE: This combine exists in DAGCombiner, but that version's legality check
18858 // blocks this combine because the non-const case requires custom lowering.
18859 //
18860 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
18861 if (V.getOpcode() == ISD::SPLAT_VECTOR)
18862 if (isa<ConstantSDNode>(V.getOperand(0)))
18863 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
18864
18865 return SDValue();
18866}
18867
18868static SDValue
18869performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18870 SelectionDAG &DAG) {
18871 SDLoc DL(N);
18872 SDValue Vec = N->getOperand(0);
18873 SDValue SubVec = N->getOperand(1);
18874 uint64_t IdxVal = N->getConstantOperandVal(2);
18875 EVT VecVT = Vec.getValueType();
18876 EVT SubVT = SubVec.getValueType();
18877
18878 // Only do this for legal fixed vector types.
18879 if (!VecVT.isFixedLengthVector() ||
18880 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
18881 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
18882 return SDValue();
18883
18884 // Ignore widening patterns.
18885 if (IdxVal == 0 && Vec.isUndef())
18886 return SDValue();
18887
18888 // Subvector must be half the width and an "aligned" insertion.
18889 unsigned NumSubElts = SubVT.getVectorNumElements();
18890 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
18891 (IdxVal != 0 && IdxVal != NumSubElts))
18892 return SDValue();
18893
18894 // Fold insert_subvector -> concat_vectors
18895 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
18896 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
18897 SDValue Lo, Hi;
18898 if (IdxVal == 0) {
18899 Lo = SubVec;
18900 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18901 DAG.getVectorIdxConstant(NumSubElts, DL));
18902 } else {
18903 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18904 DAG.getVectorIdxConstant(0, DL));
18905 Hi = SubVec;
18906 }
18907 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
18908}
18909
18910static SDValue tryCombineFixedPointConvert(SDNode *N,
18911 TargetLowering::DAGCombinerInfo &DCI,
18912 SelectionDAG &DAG) {
18913 // Wait until after everything is legalized to try this. That way we have
18914 // legal vector types and such.
18915 if (DCI.isBeforeLegalizeOps())
18916 return SDValue();
18917 // Transform a scalar conversion of a value from a lane extract into a
18918 // lane extract of a vector conversion. E.g., from foo1 to foo2:
18919 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
18920 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
18921 //
18922 // The second form interacts better with instruction selection and the
18923 // register allocator to avoid cross-class register copies that aren't
18924 // coalescable due to a lane reference.
18925
18926 // Check the operand and see if it originates from a lane extract.
18927 SDValue Op1 = N->getOperand(1);
18928 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
18929 return SDValue();
18930
18931 // Yep, no additional predication needed. Perform the transform.
18932 SDValue IID = N->getOperand(0);
18933 SDValue Shift = N->getOperand(2);
18934 SDValue Vec = Op1.getOperand(0);
18935 SDValue Lane = Op1.getOperand(1);
18936 EVT ResTy = N->getValueType(0);
18937 EVT VecResTy;
18938 SDLoc DL(N);
18939
18940 // The vector width should be 128 bits by the time we get here, even
18941 // if it started as 64 bits (the extract_vector handling will have
18942 // done so). Bail if it is not.
18943 if (Vec.getValueSizeInBits() != 128)
18944 return SDValue();
18945
18946 if (Vec.getValueType() == MVT::v4i32)
18947 VecResTy = MVT::v4f32;
18948 else if (Vec.getValueType() == MVT::v2i64)
18949 VecResTy = MVT::v2f64;
18950 else
18951 return SDValue();
18952
18953 SDValue Convert =
18954 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
18955 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
18956}
18957
18958// AArch64 high-vector "long" operations are formed by performing the non-high
18959// version on an extract_subvector of each operand which gets the high half:
18960//
18961// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
18962//
18963// However, there are cases which don't have an extract_high explicitly, but
18964// have another operation that can be made compatible with one for free. For
18965// example:
18966//
18967// (dupv64 scalar) --> (extract_high (dup128 scalar))
18968//
18969// This routine does the actual conversion of such DUPs, once outer routines
18970// have determined that everything else is in order.
18971// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
18972// similarly here.
18973static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
18974 MVT VT = N.getSimpleValueType();
18975 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18976 N.getConstantOperandVal(1) == 0)
18977 N = N.getOperand(0);
18978
18979 switch (N.getOpcode()) {
18980 case AArch64ISD::DUP:
18981 case AArch64ISD::DUPLANE8:
18982 case AArch64ISD::DUPLANE16:
18983 case AArch64ISD::DUPLANE32:
18984 case AArch64ISD::DUPLANE64:
18985 case AArch64ISD::MOVI:
18986 case AArch64ISD::MOVIshift:
18987 case AArch64ISD::MOVIedit:
18988 case AArch64ISD::MOVImsl:
18989 case AArch64ISD::MVNIshift:
18990 case AArch64ISD::MVNImsl:
18991 break;
18992 default:
18993 // FMOV could be supported, but isn't very useful, as it would only occur
18994 // if you passed a bitcast'd floating point immediate to an eligible long
18995 // integer op (addl, smull, ...).
18996 return SDValue();
18997 }
18998
18999 if (!VT.is64BitVector())
19000 return SDValue();
19001
19002 SDLoc DL(N);
19003 unsigned NumElems = VT.getVectorNumElements();
19004 if (N.getValueType().is64BitVector()) {
19005 MVT ElementTy = VT.getVectorElementType();
19006 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
19007 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
19008 }
19009
19010 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
19011 DAG.getConstant(NumElems, DL, MVT::i64));
19012}
19013
19014static bool isEssentiallyExtractHighSubvector(SDValue N) {
19015 if (N.getOpcode() == ISD::BITCAST)
19016 N = N.getOperand(0);
19017 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19018 return false;
19019 if (N.getOperand(0).getValueType().isScalableVector())
19020 return false;
19021 return N.getConstantOperandAPInt(1) ==
19022 N.getOperand(0).getValueType().getVectorNumElements() / 2;
19023}
19024
19025/// Helper structure to keep track of ISD::SET_CC operands.
19026struct GenericSetCCInfo {
19027 const SDValue *Opnd0;
19028 const SDValue *Opnd1;
19029 ISD::CondCode CC;
19030};
19031
19032/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
19033struct AArch64SetCCInfo {
19034 const SDValue *Cmp;
19035 AArch64CC::CondCode CC;
19036};
19037
19038/// Helper structure to keep track of SetCC information.
19039union SetCCInfo {
19040 GenericSetCCInfo Generic;
19041 AArch64SetCCInfo AArch64;
19042};
19043
19044/// Helper structure to be able to read SetCC information. If the IsAArch64
19045/// field is set to true, Info is an AArch64SetCCInfo, otherwise Info is a
19046/// GenericSetCCInfo.
19047struct SetCCInfoAndKind {
19048 SetCCInfo Info;
19049 bool IsAArch64;
19050};
19051
19052/// Check whether or not \p Op is a SET_CC operation, either a generic or
19053/// an
19054/// AArch64 lowered one.
19055/// \p SetCCInfo is filled accordingly.
19056/// \post SetCCInfo is meaningful only when this function returns true.
19057/// \return True when Op is a kind of SET_CC operation.
19058static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
19059 // If this is a setcc, this is straightforward.
19060 if (Op.getOpcode() == ISD::SETCC) {
19061 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
19062 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
19063 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
19064 SetCCInfo.IsAArch64 = false;
19065 return true;
19066 }
19067 // Otherwise, check if this is a matching csel instruction.
19068 // In other words:
19069 // - csel 1, 0, cc
19070 // - csel 0, 1, !cc
19071 if (Op.getOpcode() != AArch64ISD::CSEL)
19072 return false;
19073 // Set the information about the operands.
19074 // TODO: we want the operands of the Cmp not the csel
19075 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
19076 SetCCInfo.IsAArch64 = true;
19077 SetCCInfo.Info.AArch64.CC =
19078 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19079
19080 // Check that the operands match the constraints:
19081 // (1) Both operands must be constants.
19082 // (2) One must be 1 and the other must be 0.
19083 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
19084 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
19085
19086 // Check (1).
19087 if (!TValue || !FValue)
19088 return false;
19089
19090 // Check (2).
19091 if (!TValue->isOne()) {
19092 // Update the comparison when we are interested in !cc.
19093 std::swap(TValue, FValue);
19094 SetCCInfo.Info.AArch64.CC =
19095 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
19096 }
19097 return TValue->isOne() && FValue->isZero();
19098}
19099
19100// Returns true if Op is setcc or zext of setcc.
19101static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
19102 if (isSetCC(Op, Info))
19103 return true;
19104 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
19105 isSetCC(Op->getOperand(0), Info));
19106}
19107
19108// The folding we want to perform is:
19109// (add x, [zext] (setcc cc ...) )
19110// -->
19111// (csel x, (add x, 1), !cc ...)
19112//
19113// The latter will get matched to a CSINC instruction.
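// For example (roughly): "r = x + (a < b)" becomes
//   cmp  a, b
//   cinc r, x, lt
// i.e. a single conditional increment instead of materialising the i1 value.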
19114static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
19115 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
19116 SDValue LHS = Op->getOperand(0);
19117 SDValue RHS = Op->getOperand(1);
19118 SetCCInfoAndKind InfoAndKind;
19119
19120 // If both operands are a SET_CC, then we don't want to perform this
19121 // folding and create another csel as this results in more instructions
19122 // (and higher register usage).
19123 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
19124 isSetCCOrZExtSetCC(RHS, InfoAndKind))
19125 return SDValue();
19126
19127 // If neither operand is a SET_CC, give up.
19128 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
19129 std::swap(LHS, RHS);
19130 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
19131 return SDValue();
19132 }
19133
19134 // FIXME: This could be generalized to work for FP comparisons.
19135 EVT CmpVT = InfoAndKind.IsAArch64
19136 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
19137 : InfoAndKind.Info.Generic.Opnd0->getValueType();
19138 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
19139 return SDValue();
19140
19141 SDValue CCVal;
19142 SDValue Cmp;
19143 SDLoc dl(Op);
19144 if (InfoAndKind.IsAArch64) {
19145 CCVal = DAG.getConstant(
19146 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
19147 MVT::i32);
19148 Cmp = *InfoAndKind.Info.AArch64.Cmp;
19149 } else
19150 Cmp = getAArch64Cmp(
19151 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
19152 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
19153 dl);
19154
19155 EVT VT = Op->getValueType(0);
19156 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
19157 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
19158}
19159
19160// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
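// E.g. add (extract_elt (UADDV v1), 0), (extract_elt (UADDV v2), 0)
//   --> extract_elt (UADDV (add v1, v2)), 0
// so only one cross-lane reduction is needed instead of two.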
19161static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
19162 EVT VT = N->getValueType(0);
19163 // Only scalar integer and vector types.
19164 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
19165 return SDValue();
19166
19167 SDValue LHS = N->getOperand(0);
19168 SDValue RHS = N->getOperand(1);
19169 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19170 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
19171 return SDValue();
19172
19173 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
19174 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
19175 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
19176 return SDValue();
19177
19178 SDValue Op1 = LHS->getOperand(0);
19179 SDValue Op2 = RHS->getOperand(0);
19180 EVT OpVT1 = Op1.getValueType();
19181 EVT OpVT2 = Op2.getValueType();
19182 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
19183 Op2.getOpcode() != AArch64ISD::UADDV ||
19184 OpVT1.getVectorElementType() != VT)
19185 return SDValue();
19186
19187 SDValue Val1 = Op1.getOperand(0);
19188 SDValue Val2 = Op2.getOperand(0);
19189 EVT ValVT = Val1->getValueType(0);
19190 SDLoc DL(N);
19191 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
19192 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
19193 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
19194 DAG.getConstant(0, DL, MVT::i64));
19195}
19196
19197/// Perform the scalar expression combine in the form of:
19198/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
19199/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
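/// For example, with c == 5: CSEL(5, 1, cc) + b becomes CSINC(b+5, b, cc),
/// i.e. b+5 when cc holds and b+1 otherwise, so no separate select is needed.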
19200static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
19201 EVT VT = N->getValueType(0);
19202 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
19203 return SDValue();
19204
19205 SDValue LHS = N->getOperand(0);
19206 SDValue RHS = N->getOperand(1);
19207
19208 // Handle commutativity.
19209 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19210 LHS.getOpcode() != AArch64ISD::CSNEG) {
19211 std::swap(LHS, RHS);
19212 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19213 LHS.getOpcode() != AArch64ISD::CSNEG) {
19214 return SDValue();
19215 }
19216 }
19217
19218 if (!LHS.hasOneUse())
19219 return SDValue();
19220
19221 AArch64CC::CondCode AArch64CC =
19222 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
19223
19224 // The CSEL should include a constant one operand, and the CSNEG should
19225 // include a one or negative-one operand.
19226 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
19227 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
19228 if (!CTVal || !CFVal)
19229 return SDValue();
19230
19231 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
19232 (CTVal->isOne() || CFVal->isOne())) &&
19233 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
19234 (CTVal->isOne() || CFVal->isAllOnes())))
19235 return SDValue();
19236
19237 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
19238 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
19239 !CFVal->isOne()) {
19240 std::swap(CTVal, CFVal);
19241 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19242 }
19243
19244 SDLoc DL(N);
19245 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
19246 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
19247 !CFVal->isAllOnes()) {
19248 APInt C = -1 * CFVal->getAPIntValue();
19249 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
19250 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
19251 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19252 }
19253
19254 // It might be neutral for larger constants, as the immediate needs to be
19255 // materialized in a register.
19256 APInt ADDC = CTVal->getAPIntValue();
19257 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19258 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
19259 return SDValue();
19260
19261 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
19262 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
19263 "Unexpected constant value");
19264
19265 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
19266 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
19267 SDValue Cmp = LHS.getOperand(3);
19268
19269 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
19270}
19271
19272// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
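// E.g. add (udot (zeroes), x, y), acc --> udot acc, x, y, folding the
// accumulator into the dot product and removing the extra vector add.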
19273static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
19274 EVT VT = N->getValueType(0);
19275 if (N->getOpcode() != ISD::ADD)
19276 return SDValue();
19277
19278 SDValue Dot = N->getOperand(0);
19279 SDValue A = N->getOperand(1);
19280 // Handle commutativity.
19281 auto isZeroDot = [](SDValue Dot) {
19282 return (Dot.getOpcode() == AArch64ISD::UDOT ||
19283 Dot.getOpcode() == AArch64ISD::SDOT) &&
19284 isZerosVector(Dot.getOperand(0).getNode());
19285 };
19286 if (!isZeroDot(Dot))
19287 std::swap(Dot, A);
19288 if (!isZeroDot(Dot))
19289 return SDValue();
19290
19291 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
19292 Dot.getOperand(2));
19293}
19294
19295static bool isNegatedInteger(SDValue Op) {
19296 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
19297}
19298
19299static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
19300 SDLoc DL(Op);
19301 EVT VT = Op.getValueType();
19302 SDValue Zero = DAG.getConstant(0, DL, VT);
19303 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
19304}
19305
19306// Try to fold
19307//
19308// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
19309//
19310// The folding helps csel to be matched with csneg without generating
19311// redundant neg instruction, which includes negation of the csel expansion
19312// of abs node lowered by lowerABS.
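// For example, neg (csel x, (neg y), cc) --> csel (neg x), y, cc, which can
// then be matched as a single csneg.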
19313static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
19314 if (!isNegatedInteger(SDValue(N, 0)))
19315 return SDValue();
19316
19317 SDValue CSel = N->getOperand(1);
19318 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
19319 return SDValue();
19320
19321 SDValue N0 = CSel.getOperand(0);
19322 SDValue N1 = CSel.getOperand(1);
19323
19324 // If neither of them is a negation, the folding is not worthwhile, as it
19325 // would introduce two additional negations while removing only one.
19326 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
19327 return SDValue();
19328
19329 SDValue N0N = getNegatedInteger(N0, DAG);
19330 SDValue N1N = getNegatedInteger(N1, DAG);
19331
19332 SDLoc DL(N);
19333 EVT VT = CSel.getValueType();
19334 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
19335 CSel.getOperand(3));
19336}
19337
19338// The basic add/sub long vector instructions have variants with "2" on the end
19339// which act on the high-half of their inputs. They are normally matched by
19340// patterns like:
19341//
19342// (add (zeroext (extract_high LHS)),
19343// (zeroext (extract_high RHS)))
19344// -> uaddl2 vD, vN, vM
19345//
19346// However, if one of the extracts is something like a duplicate, this
19347// instruction can still be used profitably. This function puts the DAG into a
19348// more appropriate form for those patterns to trigger.
19349static SDValue performAddSubLongCombine(SDNode *N,
19350 TargetLowering::DAGCombinerInfo &DCI) {
19351 SelectionDAG &DAG = DCI.DAG;
19352 if (DCI.isBeforeLegalizeOps())
19353 return SDValue();
19354
19355 MVT VT = N->getSimpleValueType(0);
19356 if (!VT.is128BitVector()) {
19357 if (N->getOpcode() == ISD::ADD)
19358 return performSetccAddFolding(N, DAG);
19359 return SDValue();
19360 }
19361
19362 // Make sure both branches are extended in the same way.
19363 SDValue LHS = N->getOperand(0);
19364 SDValue RHS = N->getOperand(1);
19365 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
19366 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
19367 LHS.getOpcode() != RHS.getOpcode())
19368 return SDValue();
19369
19370 unsigned ExtType = LHS.getOpcode();
19371
19372 // It's only worth doing if at least one of the inputs is already an
19373 // extract, but we don't know which it'll be so we have to try both.
19374 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
19375 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
19376 if (!RHS.getNode())
19377 return SDValue();
19378
19379 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
19380 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
19381 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
19382 if (!LHS.getNode())
19383 return SDValue();
19384
19385 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
19386 }
19387
19388 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
19389}
19390
19391static bool isCMP(SDValue Op) {
19392 return Op.getOpcode() == AArch64ISD::SUBS &&
19393 !Op.getNode()->hasAnyUseOfValue(0);
19394}
19395
19396// (CSEL 1 0 CC Cond) => CC
19397// (CSEL 0 1 CC Cond) => !CC
19398static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
19399 if (Op.getOpcode() != AArch64ISD::CSEL)
19400 return std::nullopt;
19401 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19402 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
19403 return std::nullopt;
19404 SDValue OpLHS = Op.getOperand(0);
19405 SDValue OpRHS = Op.getOperand(1);
19406 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
19407 return CC;
19408 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
19409 return getInvertedCondCode(CC);
19410
19411 return std::nullopt;
19412}
19413
19414// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
19415// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
19416static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
19417 SDValue CmpOp = Op->getOperand(2);
19418 if (!isCMP(CmpOp))
19419 return SDValue();
19420
19421 if (IsAdd) {
19422 if (!isOneConstant(CmpOp.getOperand(1)))
19423 return SDValue();
19424 } else {
19425 if (!isNullConstant(CmpOp.getOperand(0)))
19426 return SDValue();
19427 }
19428
19429 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
19430 auto CC = getCSETCondCode(CsetOp);
19431 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
19432 return SDValue();
19433
19434 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
19435 Op->getOperand(0), Op->getOperand(1),
19436 CsetOp.getOperand(3));
19437}
19438
19439// (ADC x 0 cond) => (CINC x HS cond)
19440static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
19441 SDValue LHS = N->getOperand(0);
19442 SDValue RHS = N->getOperand(1);
19443 SDValue Cond = N->getOperand(2);
19444
19445 if (!isNullConstant(RHS))
19446 return SDValue();
19447
19448 EVT VT = N->getValueType(0);
19449 SDLoc DL(N);
19450
19451 // (CINC x cc cond) <=> (CSINC x x !cc cond)
19452 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
19453 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
19454}
19455
19456// Transform vector add(zext i8 to i32, zext i8 to i32)
19457// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19458// This allows extra uses of saddl/uaddl at the lower vector widths, and
19459// fewer extends.
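// E.g. (v8i32 add (zero_extend (v8i8 a)), (zero_extend (v8i8 b))) becomes
// (sign_extend (v8i16 add (zero_extend a), (zero_extend b))), so the addition
// can typically be done with a single uaddl on the narrower type.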
19460static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
19461 EVT VT = N->getValueType(0);
19462 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19463 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
19464 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
19465 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
19466 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
19467 N->getOperand(0).getOperand(0).getValueType() !=
19468 N->getOperand(1).getOperand(0).getValueType())
19469 return SDValue();
19470
19471 SDValue N0 = N->getOperand(0).getOperand(0);
19472 SDValue N1 = N->getOperand(1).getOperand(0);
19473 EVT InVT = N0.getValueType();
19474
19475 EVT S1 = InVT.getScalarType();
19476 EVT S2 = VT.getScalarType();
19477 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19478 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19479 SDLoc DL(N);
19480 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19481 S2 == MVT::i32 ? MVT::i16 : MVT::i32,
19482 VT.getVectorElementCount());
19483 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19484 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19485 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
19486 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
19487 }
19488 return SDValue();
19489}
19490
19491static SDValue performBuildVectorCombine(SDNode *N,
19492 TargetLowering::DAGCombinerInfo &DCI,
19493 SelectionDAG &DAG) {
19494 SDLoc DL(N);
19495 EVT VT = N->getValueType(0);
19496
19497 if (VT == MVT::v4f16 || VT == MVT::v4bf16) {
19498 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
19499 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
19500 if (Elt0->getOpcode() == ISD::FP_ROUND &&
19501 Elt1->getOpcode() == ISD::FP_ROUND &&
19502 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19503 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19504 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
19505 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19506 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19507 // Constant index.
19508 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19509 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19510 Elt0->getOperand(0)->getOperand(0) ==
19511 Elt1->getOperand(0)->getOperand(0) &&
19512 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
19513 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
19514 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
19515 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
19516 SDValue HighLanes;
19517 if (Elt2->getOpcode() == ISD::UNDEF &&
19518 Elt3->getOpcode() == ISD::UNDEF) {
19519 HighLanes = DAG.getUNDEF(MVT::v2f32);
19520 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
19521 Elt3->getOpcode() == ISD::FP_ROUND &&
19522 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
19523 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
19524 Elt2->getConstantOperandVal(1) ==
19525 Elt3->getConstantOperandVal(1) &&
19526 Elt2->getOperand(0)->getOpcode() ==
19527 ISD::EXTRACT_VECTOR_ELT &&
19528 Elt3->getOperand(0)->getOpcode() ==
19529 ISD::EXTRACT_VECTOR_ELT &&
19530 // Constant index.
19531 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
19532 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
19533 Elt2->getOperand(0)->getOperand(0) ==
19534 Elt3->getOperand(0)->getOperand(0) &&
19535 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
19536 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
19537 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
19538 HighLanes =
19539 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
19540 }
19541 if (HighLanes) {
19542 SDValue DoubleToSingleSticky =
19543 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
19544 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
19545 DoubleToSingleSticky, HighLanes);
19546 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
19547 Elt0->getOperand(1));
19548 }
19549 }
19550 }
19551 }
19552
19553 if (VT == MVT::v2f64) {
19554 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19555 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
19556 Elt1->getOpcode() == ISD::FP_EXTEND &&
19557 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19558 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19559 Elt0->getOperand(0)->getOperand(0) ==
19560 Elt1->getOperand(0)->getOperand(0) &&
19561 // Constant index.
19562 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19563 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19564 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
19565 Elt1->getOperand(0)->getConstantOperandVal(1) &&
19566 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19567 // ResultType's known minimum vector length.
19568 Elt0->getOperand(0)->getConstantOperandVal(1) %
19569 VT.getVectorMinNumElements() ==
19570 0) {
19571 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
19572 if (SrcVec.getValueType() == MVT::v4f16 ||
19573 SrcVec.getValueType() == MVT::v4bf16) {
19574 SDValue HalfToSingle =
19575 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
19576 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
19577 SDValue Extract = DAG.getNode(
19578 ISD::EXTRACT_SUBVECTOR, DL, VT.changeVectorElementType(MVT::f32),
19579 HalfToSingle, SubvectorIdx);
19580 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
19581 }
19582 }
19583 }
19584
19585 // A build vector of two extracted elements is equivalent to an
19586 // extract subvector where the inner vector is any-extended to the
19587 // extract_vector_elt VT.
19588 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
19589 // (extract_elt_iXX_to_i32 vec Idx+1))
19590 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
19591
19592 // For now, only consider the v2i32 case, which arises as a result of
19593 // legalization.
19594 if (VT != MVT::v2i32)
19595 return SDValue();
19596
19597 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19598 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
19599 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19600 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19601 // Constant index.
19602 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19603 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19604 // Both EXTRACT_VECTOR_ELT from same vector...
19605 Elt0->getOperand(0) == Elt1->getOperand(0) &&
19606 // ... and contiguous. First element's index +1 == second element's index.
19607 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
19608 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19609 // ResultType's known minimum vector length.
19610 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
19611 SDValue VecToExtend = Elt0->getOperand(0);
19612 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
19613 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
19614 return SDValue();
19615
19616 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
19617
19618 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
19619 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
19620 SubvectorIdx);
19621 }
19622
19623 return SDValue();
19624}
19625
19626static SDValue performTruncateCombine(SDNode *N,
19627 SelectionDAG &DAG) {
19628 EVT VT = N->getValueType(0);
19629 SDValue N0 = N->getOperand(0);
19630 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
19631 N0.getOpcode() == AArch64ISD::DUP) {
19632 SDValue Op = N0.getOperand(0);
19633 if (VT.getScalarType() == MVT::i32 &&
19634 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
19635 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
19636 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
19637 }
19638
19639 return SDValue();
19640}
19641
19642// Check whether a node is an extend or shift operand.
19643static bool isExtendOrShiftOperand(SDValue N) {
19644 unsigned Opcode = N.getOpcode();
19645 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
19646 EVT SrcVT;
19647 if (Opcode == ISD::SIGN_EXTEND_INREG)
19648 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
19649 else
19650 SrcVT = N.getOperand(0).getValueType();
19651
19652 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
19653 } else if (Opcode == ISD::AND) {
19654 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
19655 if (!CSD)
19656 return false;
19657 uint64_t AndMask = CSD->getZExtValue();
19658 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
19659 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
19660 return isa<ConstantSDNode>(N.getOperand(1));
19661 }
19662
19663 return false;
19664}
19665
19666// (N - Y) + Z --> (Z - Y) + N
19667// when N is an extend or shift operand
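// For example, ((zext i8 w) - y) + z is rewritten to (z - y) + (zext i8 w),
// which lets the extend fold into the final add as an extended-register
// operand (e.g. add x0, x1, w2, uxtb).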
19668static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
19669 SelectionDAG &DAG) {
19670 auto IsOneUseExtend = [](SDValue N) {
19671 return N.hasOneUse() && isExtendOrShiftOperand(N);
19672 };
19673
19674 // DAGCombiner will revert the combination when Z is constant, causing an
19675 // infinite loop, so don't enable the combination when Z is constant.
19676 // If Z is a one-use shift by a constant, we also can't do the
19677 // optimization, as it would fall into the same infinite loop.
19678 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
19679 return SDValue();
19680
19681 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
19682 return SDValue();
19683
19684 SDValue Shift = SUB.getOperand(0);
19685 if (!IsOneUseExtend(Shift))
19686 return SDValue();
19687
19688 SDLoc DL(N);
19689 EVT VT = N->getValueType(0);
19690
19691 SDValue Y = SUB.getOperand(1);
19692 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
19693 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
19694}
19695
19696static SDValue performAddCombineForShiftedOperands(SDNode *N,
19697 SelectionDAG &DAG) {
19698 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
19699 // commutative.
19700 if (N->getOpcode() != ISD::ADD)
19701 return SDValue();
19702
19703 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
19704 // shifted register is only available for i32 and i64.
19705 EVT VT = N->getValueType(0);
19706 if (VT != MVT::i32 && VT != MVT::i64)
19707 return SDValue();
19708
19709 SDLoc DL(N);
19710 SDValue LHS = N->getOperand(0);
19711 SDValue RHS = N->getOperand(1);
19712
19713 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
19714 return Val;
19715 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
19716 return Val;
19717
19718 uint64_t LHSImm = 0, RHSImm = 0;
19719 // If both operands are shifted by an immediate and the shift amount is not
19720 // greater than 4 for one operand, swap LHS and RHS to put the operand with
19721 // the smaller shift amount on the RHS.
19722 //
19723 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
19724 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
19725 // with LSL (shift > 4). For other processors, the swap is a no-op for both
19726 // performance and correctness.
19727 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
19728 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
19729 RHSImm > 4 && LHS.hasOneUse())
19730 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
19731
19732 return SDValue();
19733}
19734
19735// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
19736// This reassociates it back to allow the creation of more mls instructions.
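// E.g. sub(x, add(mul(a, b), mul(c, d))) --> sub(sub(x, mul(a, b)), mul(c, d)),
// which can then be selected as two mls (multiply-subtract) instructions.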
19737static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
19738 if (N->getOpcode() != ISD::SUB)
19739 return SDValue();
19740
19741 SDValue Add = N->getOperand(1);
19742 SDValue X = N->getOperand(0);
19743 if (Add.getOpcode() != ISD::ADD)
19744 return SDValue();
19745
19746 if (!Add.hasOneUse())
19747 return SDValue();
19748 if (DAG.isConstantIntBuildVectorOrConstantInt(X))
19749 return SDValue();
19750
19751 SDValue M1 = Add.getOperand(0);
19752 SDValue M2 = Add.getOperand(1);
19753 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
19754 M1.getOpcode() != AArch64ISD::UMULL)
19755 return SDValue();
19756 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
19757 M2.getOpcode() != AArch64ISD::UMULL)
19758 return SDValue();
19759
19760 EVT VT = N->getValueType(0);
19761 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
19762 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
19763}
19764
19765// Combine into mla/mls.
19766// This works on the patterns of:
19767// add v1, (mul v2, v3)
19768// sub v1, (mul v2, v3)
19769// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
19770// It will transform the add/sub to a scalable version, so that we can
19771// make use of SVE's MLA/MLS that will be generated for that pattern
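// E.g. add(acc, extract_subvector(MUL_PRED(...), 0)) is rewritten on the
// scalable container type so a single MLA can be selected instead of a
// separate multiply and add.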
19772static SDValue
19773performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
19774 SelectionDAG &DAG = DCI.DAG;
19775 // Make sure that the types are legal
19776 if (!DCI.isAfterLegalizeDAG())
19777 return SDValue();
19778 // Before using SVE's features, check first if it's available.
19779 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
19780 return SDValue();
19781
19782 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
19783 return SDValue();
19784
19785 if (!N->getValueType(0).isFixedLengthVector())
19786 return SDValue();
19787
19788 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
19789 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19790 return SDValue();
19791
19792 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
19793 return SDValue();
19794
19795 SDValue MulValue = Op1->getOperand(0);
19796 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
19797 return SDValue();
19798
19799 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
19800 return SDValue();
19801
19802 EVT ScalableVT = MulValue.getValueType();
19803 if (!ScalableVT.isScalableVector())
19804 return SDValue();
19805
19806 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
19807 SDValue NewValue =
19808 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
19809 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
19810 };
19811
19812 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
19813 return res;
19814 else if (N->getOpcode() == ISD::ADD)
19815 return performOpt(N->getOperand(1), N->getOperand(0));
19816
19817 return SDValue();
19818}
19819
19820// Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
19821// help, for example, to produce ssra from sshr+add.
19822static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
19823 EVT VT = N->getValueType(0);
19824 if (VT != MVT::i64)
19825 return SDValue();
19826 SDValue Op0 = N->getOperand(0);
19827 SDValue Op1 = N->getOperand(1);
19828
19829 // At least one of the operands should be an extract, and the other should be
19830 // something that is easy to convert to v1i64 type (in this case a load).
19831 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19832 Op0.getOpcode() != ISD::LOAD)
19833 return SDValue();
19834 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19835 Op1.getOpcode() != ISD::LOAD)
19836 return SDValue();
19837
19838 SDLoc DL(N);
19839 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19840 Op0.getOperand(0).getValueType() == MVT::v1i64) {
19841 Op0 = Op0.getOperand(0);
19842 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
19843 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19844 Op1.getOperand(0).getValueType() == MVT::v1i64) {
19845 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
19846 Op1 = Op1.getOperand(0);
19847 } else
19848 return SDValue();
19849
19850 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
19851 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
19852 DAG.getConstant(0, DL, MVT::i64));
19853}
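// For illustration (an assumed input shape, not a verbatim test case):
//   i64 t = add (extract_vector_elt (v1i64 sshr), 0), (i64 load)
// is performed as a v1i64 NEON add whose result is extracted, so the
// shift+add pair can later be selected as a single SSRA.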
19854
19857 if (!BV->hasOneUse())
19858 return false;
19859 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
19860 if (!Ld || !Ld->isSimple())
19861 return false;
19862 Loads.push_back(Ld);
19863 return true;
19864 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
19866 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
19867 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
19868 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
19869 return false;
19870 Loads.push_back(Ld);
19871 }
19872 return true;
19873 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
19874 // Try to find a tree of shuffles and concats from how IR shuffles of loads
19875 // are lowered. Note that this only comes up because we do not always visit
19876 // operands before uses. After that is fixed this can be removed; in the
19877 // meantime this is fairly specific to the lowering we expect from IR.
19878 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
19879 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
19880 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
19881 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
19882 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
19883 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
19884 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
19885 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
19886 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
19887 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
19888 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
19889 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19890 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19891 B.getOperand(1).getNumOperands() != 4)
19892 return false;
19893 auto SV1 = cast<ShuffleVectorSDNode>(B);
19894 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
19895 int NumElts = B.getValueType().getVectorNumElements();
19896 int NumSubElts = NumElts / 4;
19897 for (int I = 0; I < NumSubElts; I++) {
19898 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
19899 if (SV1->getMaskElt(I) != I ||
19900 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19901 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
19902 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
19903 return false;
19904 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
19905 if (SV2->getMaskElt(I) != I ||
19906 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19907 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
19908 return false;
19909 }
19910 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
19911 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
19912 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
19913 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
19914 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
19915 !Ld2->isSimple() || !Ld3->isSimple())
19916 return false;
19917 Loads.push_back(Ld0);
19918 Loads.push_back(Ld1);
19919 Loads.push_back(Ld2);
19920 Loads.push_back(Ld3);
19921 return true;
19922 }
19923 return false;
19924}
19925
19926static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
19927 SelectionDAG &DAG,
19928 unsigned &NumSubLoads) {
19929 if (!Op0.hasOneUse() || !Op1.hasOneUse())
19930 return false;
19931
19932 SmallVector<LoadSDNode *> Loads0, Loads1;
19933 if (isLoadOrMultipleLoads(Op0, Loads0) &&
19934 isLoadOrMultipleLoads(Op1, Loads1)) {
19935 if (NumSubLoads && Loads0.size() != NumSubLoads)
19936 return false;
19937 NumSubLoads = Loads0.size();
19938 return Loads0.size() == Loads1.size() &&
19939 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
19940 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
19941 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
19942 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
19943 Size / 8, 1);
19944 });
19945 }
19946
19947 if (Op0.getOpcode() != Op1.getOpcode())
19948 return false;
19949
19950 switch (Op0.getOpcode()) {
19951 case ISD::ADD:
19952 case ISD::SUB:
19954 DAG, NumSubLoads) &&
19956 DAG, NumSubLoads);
19957 case ISD::SIGN_EXTEND:
19958 case ISD::ANY_EXTEND:
19959 case ISD::ZERO_EXTEND:
19960 EVT XVT = Op0.getOperand(0).getValueType();
19961 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
19962 XVT.getScalarSizeInBits() != 32)
19963 return false;
19965 DAG, NumSubLoads);
19966 }
19967 return false;
19968}
19969
19970// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
19971// into a single load of twice the size, from which we extract the bottom and
19972// top parts so that the shl can use a shll2 instruction. The two loads in that
19973// example can also be larger trees of instructions, which are identical except
19974// for the leaves which are all loads offset from the LHS, including
19975// buildvectors of multiple loads. For example the RHS tree could be
19976// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))
19977// Whilst it can be common for the larger loads to replace LDP instructions
19978// (which doesn't gain anything on its own), the larger loads can help create
19979// more efficient code, and in buildvectors prevent the need for ld1 lane
19980// inserts which can be slower than normal loads.
19981static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
19982 EVT VT = N->getValueType(0);
19983 if (!VT.isFixedLengthVector() ||
19984 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
19985 VT.getScalarSizeInBits() != 64))
19986 return SDValue();
19987
19988 SDValue Other = N->getOperand(0);
19989 SDValue Shift = N->getOperand(1);
19990 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
19991 std::swap(Shift, Other);
19992 APInt ShiftAmt;
19993 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
19994 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
19995 return SDValue();
19996
19997 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
19998 !ISD::isExtOpcode(Other.getOpcode()) ||
19999 Shift.getOperand(0).getOperand(0).getValueType() !=
20000 Other.getOperand(0).getValueType() ||
20001 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
20002 return SDValue();
20003
20004 SDValue Op0 = Other.getOperand(0);
20005 SDValue Op1 = Shift.getOperand(0).getOperand(0);
20006
20007 unsigned NumSubLoads = 0;
20008 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
20009 return SDValue();
20010
20011 // Attempt to rule out some unprofitable cases using heuristics (some working
20012 // around suboptimal code generation), notably if the extend would not be able
20013 // to use ushll2 instructions as the types are not large enough. Otherwise zips
20014 // will need to be created, which can increase the instruction count.
20015 unsigned NumElts = Op0.getValueType().getVectorNumElements();
20016 unsigned NumSubElts = NumElts / NumSubLoads;
20017 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
20018 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
20019 Op0.getValueType().getSizeInBits() < 128 &&
20021 return SDValue();
20022
20023 // Recreate the tree with the new combined loads.
20024 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
20025 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
20026 EVT DVT =
20028
20029 SmallVector<LoadSDNode *> Loads0, Loads1;
20030 if (isLoadOrMultipleLoads(Op0, Loads0) &&
20031 isLoadOrMultipleLoads(Op1, Loads1)) {
20032 EVT LoadVT = EVT::getVectorVT(
20033 *DAG.getContext(), Op0.getValueType().getScalarType(),
20034 Op0.getValueType().getVectorNumElements() / Loads0.size());
20035 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
20036
20037 SmallVector<SDValue> NewLoads;
20038 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
20039 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
20040 L0->getBasePtr(), L0->getPointerInfo(),
20041 L0->getOriginalAlign());
20042 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
20043 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
20044 NewLoads.push_back(Load);
20045 }
20046 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
20047 }
20048
20050 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
20051 Ops.push_back(GenCombinedTree(O0, O1, DAG));
20052 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
20053 };
20054 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
20055
20056 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
20057 int Hi = NumSubElts, Lo = 0;
20058 for (unsigned i = 0; i < NumSubLoads; i++) {
20059 for (unsigned j = 0; j < NumSubElts; j++) {
20060 LowMask[i * NumSubElts + j] = Lo++;
20061 HighMask[i * NumSubElts + j] = Hi++;
20062 }
20063 Lo += NumSubElts;
20064 Hi += NumSubElts;
20065 }
20066 SDLoc DL(N);
20067 SDValue Ext0, Ext1;
20068 // Extract the top and bottom lanes, then extend the result. Alternatively,
20069 // extend the result and then extract the lanes if the two operands match, as
20070 // that produces slightly smaller code.
20071 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
20073 NewOp, DAG.getConstant(0, DL, MVT::i64));
20074 SDValue SubH =
20075 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
20076 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20077 SDValue Extr0 =
20078 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
20079 SDValue Extr1 =
20080 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
20081 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
20082 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
20083 } else {
20085 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
20086 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20087 DAG.getConstant(0, DL, MVT::i64));
20088 SDValue SubH =
20089 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20090 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20091 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
20092 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
20093 }
20094 SDValue NShift =
20095 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
20096 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
20097}
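// A rough sketch of the simplest case handled above (assumed types, not a
// verbatim test): for consecutive 8-byte loads at p and p+8,
//   add (zext (v8i8 load p)), (shl (zext (v8i8 load p+8)), splat(C))
// becomes one v16i8 load of [p, p+16); the low half then feeds USHLL and the
// high half feeds USHLL2 before the shifted add is recreated.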
20098
20101 // Try to change sum of two reductions.
20102 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
20103 return Val;
20104 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
20105 return Val;
20106 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
20107 return Val;
20108 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
20109 return Val;
20111 return Val;
20113 return Val;
20114 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
20115 return Val;
20116 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
20117 return Val;
20118 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
20119 return Val;
20120
20121 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
20122 return Val;
20123
20124 return performAddSubLongCombine(N, DCI);
20125}
20126
20127// Massage DAGs which we can use the high-half "long" operations on into
20128// something isel will recognize better. E.g.
20129//
20130// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
20131//      (aarch64_neon_umull (extract_high (v2i64 vec))
20132//                          (extract_high (v2i64 (dup128 scalar))))
20133//
20136 SelectionDAG &DAG) {
20137 if (DCI.isBeforeLegalizeOps())
20138 return SDValue();
20139
20140 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
20141 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
20142 assert(LHS.getValueType().is64BitVector() &&
20143 RHS.getValueType().is64BitVector() &&
20144 "unexpected shape for long operation");
20145
20146 // Either node could be a DUP, but it's not worth doing both of them (you'd do
20147 // just as well using the non-high version), so look for a corresponding extract
20148 // operation on the other "wing".
20151 if (!RHS.getNode())
20152 return SDValue();
20155 if (!LHS.getNode())
20156 return SDValue();
20157 } else
20158 return SDValue();
20159
20160 if (IID == Intrinsic::not_intrinsic)
20161 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
20162
20163 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
20164 N->getOperand(0), LHS, RHS);
20165}
20166
20167static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
20168 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
20169 unsigned ElemBits = ElemTy.getSizeInBits();
20170
20171 int64_t ShiftAmount;
20172 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
20173 APInt SplatValue, SplatUndef;
20174 unsigned SplatBitSize;
20175 bool HasAnyUndefs;
20176 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
20177 HasAnyUndefs, ElemBits) ||
20178 SplatBitSize != ElemBits)
20179 return SDValue();
20180
20181 ShiftAmount = SplatValue.getSExtValue();
20182 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
20183 ShiftAmount = CVN->getSExtValue();
20184 } else
20185 return SDValue();
20186
20187 // If the shift amount is zero, remove the shift intrinsic.
20188 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
20189 return N->getOperand(1);
20190
20191 unsigned Opcode;
20192 bool IsRightShift;
20193 switch (IID) {
20194 default:
20195 llvm_unreachable("Unknown shift intrinsic");
20196 case Intrinsic::aarch64_neon_sqshl:
20197 Opcode = AArch64ISD::SQSHL_I;
20198 IsRightShift = false;
20199 break;
20200 case Intrinsic::aarch64_neon_uqshl:
20201 Opcode = AArch64ISD::UQSHL_I;
20202 IsRightShift = false;
20203 break;
20204 case Intrinsic::aarch64_neon_srshl:
20205 Opcode = AArch64ISD::SRSHR_I;
20206 IsRightShift = true;
20207 break;
20208 case Intrinsic::aarch64_neon_urshl:
20209 Opcode = AArch64ISD::URSHR_I;
20210 IsRightShift = true;
20211 break;
20212 case Intrinsic::aarch64_neon_sqshlu:
20213 Opcode = AArch64ISD::SQSHLU_I;
20214 IsRightShift = false;
20215 break;
20216 case Intrinsic::aarch64_neon_sshl:
20217 case Intrinsic::aarch64_neon_ushl:
20218 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
20219 // left shift for positive shift amounts. For negative shifts we can use a
20220 // VASHR/VLSHR as appropriate.
20221 if (ShiftAmount < 0) {
20222 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
20223 : AArch64ISD::VLSHR;
20224 ShiftAmount = -ShiftAmount;
20225 } else
20226 Opcode = AArch64ISD::VSHL;
20227 IsRightShift = false;
20228 break;
20229 }
20230
20231 EVT VT = N->getValueType(0);
20232 SDValue Op = N->getOperand(1);
20233 SDLoc dl(N);
20234 if (VT == MVT::i64) {
20235 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
20236 VT = MVT::v1i64;
20237 }
20238
20239 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
20240 Op = DAG.getNode(Opcode, dl, VT, Op,
20241 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
20242 if (N->getValueType(0) == MVT::i64)
20243 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20244 DAG.getConstant(0, dl, MVT::i64));
20245 return Op;
20246 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
20247 Op = DAG.getNode(Opcode, dl, VT, Op,
20248 DAG.getConstant(ShiftAmount, dl, MVT::i32));
20249 if (N->getValueType(0) == MVT::i64)
20250 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20251 DAG.getConstant(0, dl, MVT::i64));
20252 return Op;
20253 }
20254
20255 return SDValue();
20256}
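// Informal examples of the mapping above (shift amounts are arbitrary):
//   ushl  x, splat(3)   -->  VSHL    #3  (plain left shift)
//   ushl  x, splat(-3)  -->  VLSHR   #3  (negative amount becomes a right shift)
//   srshl x, splat(-3)  -->  SRSHR_I #3  (rounding right shift by immediate)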
20257
20258// The CRC32[BH] instructions ignore the high bits of their data operand. Since
20259// the intrinsics must be legal and take an i32, this means there's almost
20260// certainly going to be a zext in the DAG which we can eliminate.
20261static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
20262 SDValue AndN = N->getOperand(2);
20263 if (AndN.getOpcode() != ISD::AND)
20264 return SDValue();
20265
20266 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
20267 if (!CMask || CMask->getZExtValue() != Mask)
20268 return SDValue();
20269
20270 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
20271 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
20272}
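// For illustration (operand names are arbitrary):
//   crc32b(acc, and(x, 0xff))  -->  crc32b(acc, x)
// since CRC32B only ever reads the low 8 bits of its data operand.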
20273
20275 SelectionDAG &DAG) {
20276 SDLoc dl(N);
20277 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
20278 DAG.getNode(Opc, dl,
20279 N->getOperand(1).getSimpleValueType(),
20280 N->getOperand(1)),
20281 DAG.getConstant(0, dl, MVT::i64));
20282}
20283
20285 SDLoc DL(N);
20286 SDValue Op1 = N->getOperand(1);
20287 SDValue Op2 = N->getOperand(2);
20288 EVT ScalarTy = Op2.getValueType();
20289 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20290 ScalarTy = MVT::i32;
20291
20292 // Lower index_vector(base, step) to mul(step_vector(1), splat(step)) + splat(base).
20293 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
20294 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
20295 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
20296 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
20297 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
20298}
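// Worked example (values chosen for illustration): index(2, 3) becomes
//   step_vector(1) * splat(3) + splat(2)
// i.e. lanes 2, 5, 8, 11, ... across the scalable vector.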
20299
20301 SDLoc dl(N);
20302 SDValue Scalar = N->getOperand(3);
20303 EVT ScalarTy = Scalar.getValueType();
20304
20305 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20306 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
20307
20308 SDValue Passthru = N->getOperand(1);
20309 SDValue Pred = N->getOperand(2);
20310 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
20311 Pred, Scalar, Passthru);
20312}
20313
20315 SDLoc dl(N);
20316 LLVMContext &Ctx = *DAG.getContext();
20317 EVT VT = N->getValueType(0);
20318
20319 assert(VT.isScalableVector() && "Expected a scalable vector.");
20320
20321 // Current lowering only supports the SVE-ACLE types.
20323 return SDValue();
20324
20325 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
20326 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
20327 EVT ByteVT =
20328 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
20329
20330 // Convert everything to the domain of EXT (i.e. bytes).
20331 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
20332 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
20333 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
20334 DAG.getConstant(ElemSize, dl, MVT::i32));
20335
20336 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
20337 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
20338}
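// Worked example (types chosen for illustration): for an nxv4i32 ext with
// element index 3, ElemSize is 4, so the operands are bitcast to nxv16i8 and
// the index is scaled to a byte offset of 12 for the underlying EXT, with the
// result bitcast back to nxv4i32.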
20339
20342 SelectionDAG &DAG) {
20343 if (DCI.isBeforeLegalize())
20344 return SDValue();
20345
20346 SDValue Comparator = N->getOperand(3);
20347 if (Comparator.getOpcode() == AArch64ISD::DUP ||
20348 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
20349 unsigned IID = getIntrinsicID(N);
20350 EVT VT = N->getValueType(0);
20351 EVT CmpVT = N->getOperand(2).getValueType();
20352 SDValue Pred = N->getOperand(1);
20353 SDValue Imm;
20354 SDLoc DL(N);
20355
20356 switch (IID) {
20357 default:
20358 llvm_unreachable("Called with wrong intrinsic!");
20359 break;
20360
20361 // Signed comparisons
20362 case Intrinsic::aarch64_sve_cmpeq_wide:
20363 case Intrinsic::aarch64_sve_cmpne_wide:
20364 case Intrinsic::aarch64_sve_cmpge_wide:
20365 case Intrinsic::aarch64_sve_cmpgt_wide:
20366 case Intrinsic::aarch64_sve_cmplt_wide:
20367 case Intrinsic::aarch64_sve_cmple_wide: {
20368 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20369 int64_t ImmVal = CN->getSExtValue();
20370 if (ImmVal >= -16 && ImmVal <= 15)
20371 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20372 else
20373 return SDValue();
20374 }
20375 break;
20376 }
20377 // Unsigned comparisons
20378 case Intrinsic::aarch64_sve_cmphs_wide:
20379 case Intrinsic::aarch64_sve_cmphi_wide:
20380 case Intrinsic::aarch64_sve_cmplo_wide:
20381 case Intrinsic::aarch64_sve_cmpls_wide: {
20382 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20383 uint64_t ImmVal = CN->getZExtValue();
20384 if (ImmVal <= 127)
20385 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20386 else
20387 return SDValue();
20388 }
20389 break;
20390 }
20391 }
20392
20393 if (!Imm)
20394 return SDValue();
20395
20396 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
20397 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
20398 N->getOperand(2), Splat, DAG.getCondCode(CC));
20399 }
20400
20401 return SDValue();
20402}
20403
20406 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20407
20408 SDLoc DL(Op);
20409 assert(Op.getValueType().isScalableVector() &&
20410 TLI.isTypeLegal(Op.getValueType()) &&
20411 "Expected legal scalable vector type!");
20412 assert(Op.getValueType() == Pg.getValueType() &&
20413 "Expected same type for PTEST operands");
20414
20415 // Ensure target specific opcodes are using legal type.
20416 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
20417 SDValue TVal = DAG.getConstant(1, DL, OutVT);
20418 SDValue FVal = DAG.getConstant(0, DL, OutVT);
20419
20420 // Ensure operands have type nxv16i1.
20421 if (Op.getValueType() != MVT::nxv16i1) {
20424 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
20425 else
20426 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
20427 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
20428 }
20429
20430 // Set condition code (CC) flags.
20431 SDValue Test = DAG.getNode(
20433 DL, MVT::Other, Pg, Op);
20434
20435 // Convert CC to integer based on requested condition.
20436 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
20437 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
20438 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
20439 return DAG.getZExtOrTrunc(Res, DL, VT);
20440}
20441
20443 SelectionDAG &DAG) {
20444 SDLoc DL(N);
20445
20446 SDValue Pred = N->getOperand(1);
20447 SDValue VecToReduce = N->getOperand(2);
20448
20449 // NOTE: The integer reduction's result type is not always linked to the
20450 // operand's element type so we construct it from the intrinsic's result type.
20451 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
20452 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20453
20454 // SVE reductions set the whole vector register with the first element
20455 // containing the reduction result, which we'll now extract.
20456 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20457 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20458 Zero);
20459}
20460
20462 SelectionDAG &DAG) {
20463 SDLoc DL(N);
20464
20465 SDValue Pred = N->getOperand(1);
20466 SDValue VecToReduce = N->getOperand(2);
20467
20468 EVT ReduceVT = VecToReduce.getValueType();
20469 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20470
20471 // SVE reductions set the whole vector register with the first element
20472 // containing the reduction result, which we'll now extract.
20473 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20474 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20475 Zero);
20476}
20477
20479 SelectionDAG &DAG) {
20480 SDLoc DL(N);
20481
20482 SDValue Pred = N->getOperand(1);
20483 SDValue InitVal = N->getOperand(2);
20484 SDValue VecToReduce = N->getOperand(3);
20485 EVT ReduceVT = VecToReduce.getValueType();
20486
20487 // Ordered reductions use the first lane of the result vector as the
20488 // reduction's initial value.
20489 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20490 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
20491 DAG.getUNDEF(ReduceVT), InitVal, Zero);
20492
20493 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
20494
20495 // SVE reductions set the whole vector register with the first element
20496 // containing the reduction result, which we'll now extract.
20497 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20498 Zero);
20499}
20500
20501// If a merged operation has no inactive lanes we can relax it to a predicated
20502// or unpredicated operation, which potentially allows better isel (perhaps
20503// using immediate forms) or relaxing register reuse requirements.
20505 SelectionDAG &DAG, bool UnpredOp = false,
20506 bool SwapOperands = false) {
20507 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
20508 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
20509 SDValue Pg = N->getOperand(1);
20510 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
20511 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
20512
20513 // ISD way to specify an all active predicate.
20514 if (isAllActivePredicate(DAG, Pg)) {
20515 if (UnpredOp)
20516 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
20517
20518 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
20519 }
20520
20521 // FUTURE: SplatVector(true)
20522 return SDValue();
20523}
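// For illustration (an assumed input, using sqadd as the example): with an
// all-active governing predicate,
//   aarch64.sve.sqadd(ptrue, a, b)  -->  saddsat(a, b)
// i.e. the merged operation relaxes to an unpredicated node, which lets isel
// use immediate forms and drop the tied-operand constraint.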
20524
20527 const AArch64Subtarget *Subtarget) {
20528 SelectionDAG &DAG = DCI.DAG;
20529 unsigned IID = getIntrinsicID(N);
20530 switch (IID) {
20531 default:
20532 break;
20533 case Intrinsic::get_active_lane_mask: {
20534 SDValue Res = SDValue();
20535 EVT VT = N->getValueType(0);
20536 if (VT.isFixedLengthVector()) {
20537 // We can use the SVE whilelo instruction to lower this intrinsic by
20538 // creating the appropriate sequence of scalable vector operations and
20539 // then extracting a fixed-width subvector from the scalable vector.
20540
20541 SDLoc DL(N);
20542 SDValue ID =
20543 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
20544
20545 EVT WhileVT = EVT::getVectorVT(
20546 *DAG.getContext(), MVT::i1,
20548
20549 // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
20550 EVT PromVT = getPromotedVTForPredicate(WhileVT);
20551
20552 // Get the fixed-width equivalent of PromVT for extraction.
20553 EVT ExtVT =
20556
20557 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
20558 N->getOperand(1), N->getOperand(2));
20559 Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
20560 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
20561 DAG.getConstant(0, DL, MVT::i64));
20562 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
20563 }
20564 return Res;
20565 }
20566 case Intrinsic::aarch64_neon_vcvtfxs2fp:
20567 case Intrinsic::aarch64_neon_vcvtfxu2fp:
20568 return tryCombineFixedPointConvert(N, DCI, DAG);
20569 case Intrinsic::aarch64_neon_saddv:
20571 case Intrinsic::aarch64_neon_uaddv:
20573 case Intrinsic::aarch64_neon_sminv:
20575 case Intrinsic::aarch64_neon_uminv:
20577 case Intrinsic::aarch64_neon_smaxv:
20579 case Intrinsic::aarch64_neon_umaxv:
20581 case Intrinsic::aarch64_neon_fmax:
20582 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
20583 N->getOperand(1), N->getOperand(2));
20584 case Intrinsic::aarch64_neon_fmin:
20585 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
20586 N->getOperand(1), N->getOperand(2));
20587 case Intrinsic::aarch64_neon_fmaxnm:
20588 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
20589 N->getOperand(1), N->getOperand(2));
20590 case Intrinsic::aarch64_neon_fminnm:
20591 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
20592 N->getOperand(1), N->getOperand(2));
20593 case Intrinsic::aarch64_neon_smull:
20594 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
20595 N->getOperand(1), N->getOperand(2));
20596 case Intrinsic::aarch64_neon_umull:
20597 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
20598 N->getOperand(1), N->getOperand(2));
20599 case Intrinsic::aarch64_neon_pmull:
20600 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
20601 N->getOperand(1), N->getOperand(2));
20602 case Intrinsic::aarch64_neon_sqdmull:
20603 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
20604 case Intrinsic::aarch64_neon_sqshl:
20605 case Intrinsic::aarch64_neon_uqshl:
20606 case Intrinsic::aarch64_neon_sqshlu:
20607 case Intrinsic::aarch64_neon_srshl:
20608 case Intrinsic::aarch64_neon_urshl:
20609 case Intrinsic::aarch64_neon_sshl:
20610 case Intrinsic::aarch64_neon_ushl:
20611 return tryCombineShiftImm(IID, N, DAG);
20612 case Intrinsic::aarch64_neon_sabd:
20613 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20614 N->getOperand(1), N->getOperand(2));
20615 case Intrinsic::aarch64_neon_uabd:
20616 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20617 N->getOperand(1), N->getOperand(2));
20618 case Intrinsic::aarch64_crc32b:
20619 case Intrinsic::aarch64_crc32cb:
20620 return tryCombineCRC32(0xff, N, DAG);
20621 case Intrinsic::aarch64_crc32h:
20622 case Intrinsic::aarch64_crc32ch:
20623 return tryCombineCRC32(0xffff, N, DAG);
20624 case Intrinsic::aarch64_sve_saddv:
20625 // There is no i64 version of SADDV because the sign is irrelevant.
20626 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
20628 else
20630 case Intrinsic::aarch64_sve_uaddv:
20632 case Intrinsic::aarch64_sve_smaxv:
20634 case Intrinsic::aarch64_sve_umaxv:
20636 case Intrinsic::aarch64_sve_sminv:
20638 case Intrinsic::aarch64_sve_uminv:
20640 case Intrinsic::aarch64_sve_orv:
20642 case Intrinsic::aarch64_sve_eorv:
20644 case Intrinsic::aarch64_sve_andv:
20646 case Intrinsic::aarch64_sve_index:
20647 return LowerSVEIntrinsicIndex(N, DAG);
20648 case Intrinsic::aarch64_sve_dup:
20649 return LowerSVEIntrinsicDUP(N, DAG);
20650 case Intrinsic::aarch64_sve_dup_x:
20651 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
20652 N->getOperand(1));
20653 case Intrinsic::aarch64_sve_ext:
20654 return LowerSVEIntrinsicEXT(N, DAG);
20655 case Intrinsic::aarch64_sve_mul_u:
20656 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
20657 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20658 case Intrinsic::aarch64_sve_smulh_u:
20659 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
20660 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20661 case Intrinsic::aarch64_sve_umulh_u:
20662 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
20663 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20664 case Intrinsic::aarch64_sve_smin_u:
20665 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
20666 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20667 case Intrinsic::aarch64_sve_umin_u:
20668 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
20669 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20670 case Intrinsic::aarch64_sve_smax_u:
20671 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
20672 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20673 case Intrinsic::aarch64_sve_umax_u:
20674 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
20675 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20676 case Intrinsic::aarch64_sve_lsl_u:
20677 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
20678 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20679 case Intrinsic::aarch64_sve_lsr_u:
20680 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
20681 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20682 case Intrinsic::aarch64_sve_asr_u:
20683 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
20684 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20685 case Intrinsic::aarch64_sve_fadd_u:
20686 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
20687 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20688 case Intrinsic::aarch64_sve_fdiv_u:
20689 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
20690 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20691 case Intrinsic::aarch64_sve_fmax_u:
20692 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
20693 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20694 case Intrinsic::aarch64_sve_fmaxnm_u:
20695 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
20696 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20697 case Intrinsic::aarch64_sve_fmla_u:
20698 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
20699 N->getOperand(1), N->getOperand(3), N->getOperand(4),
20700 N->getOperand(2));
20701 case Intrinsic::aarch64_sve_fmin_u:
20702 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
20703 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20704 case Intrinsic::aarch64_sve_fminnm_u:
20705 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
20706 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20707 case Intrinsic::aarch64_sve_fmul_u:
20708 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
20709 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20710 case Intrinsic::aarch64_sve_fsub_u:
20711 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
20712 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20713 case Intrinsic::aarch64_sve_add_u:
20714 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
20715 N->getOperand(3));
20716 case Intrinsic::aarch64_sve_sub_u:
20717 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
20718 N->getOperand(3));
20719 case Intrinsic::aarch64_sve_subr:
20720 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
20721 case Intrinsic::aarch64_sve_and_u:
20722 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
20723 N->getOperand(3));
20724 case Intrinsic::aarch64_sve_bic_u:
20725 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
20726 N->getOperand(2), N->getOperand(3));
20727 case Intrinsic::aarch64_sve_eor_u:
20728 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20729 N->getOperand(3));
20730 case Intrinsic::aarch64_sve_orr_u:
20731 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20732 N->getOperand(3));
20733 case Intrinsic::aarch64_sve_sabd_u:
20734 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20735 N->getOperand(2), N->getOperand(3));
20736 case Intrinsic::aarch64_sve_uabd_u:
20737 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20738 N->getOperand(2), N->getOperand(3));
20739 case Intrinsic::aarch64_sve_sdiv_u:
20740 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
20741 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20742 case Intrinsic::aarch64_sve_udiv_u:
20743 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
20744 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20745 case Intrinsic::aarch64_sve_sqadd:
20746 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
20747 case Intrinsic::aarch64_sve_sqsub_u:
20748 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20749 N->getOperand(2), N->getOperand(3));
20750 case Intrinsic::aarch64_sve_uqadd:
20751 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
20752 case Intrinsic::aarch64_sve_uqsub_u:
20753 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20754 N->getOperand(2), N->getOperand(3));
20755 case Intrinsic::aarch64_sve_sqadd_x:
20756 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
20757 N->getOperand(1), N->getOperand(2));
20758 case Intrinsic::aarch64_sve_sqsub_x:
20759 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20760 N->getOperand(1), N->getOperand(2));
20761 case Intrinsic::aarch64_sve_uqadd_x:
20762 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
20763 N->getOperand(1), N->getOperand(2));
20764 case Intrinsic::aarch64_sve_uqsub_x:
20765 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20766 N->getOperand(1), N->getOperand(2));
20767 case Intrinsic::aarch64_sve_asrd:
20768 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
20769 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20770 case Intrinsic::aarch64_sve_cmphs:
20771 if (!N->getOperand(2).getValueType().isFloatingPoint())
20773 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20774 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
20775 break;
20776 case Intrinsic::aarch64_sve_cmphi:
20777 if (!N->getOperand(2).getValueType().isFloatingPoint())
20779 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20780 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
20781 break;
20782 case Intrinsic::aarch64_sve_fcmpge:
20783 case Intrinsic::aarch64_sve_cmpge:
20785 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20786 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
20787 break;
20788 case Intrinsic::aarch64_sve_fcmpgt:
20789 case Intrinsic::aarch64_sve_cmpgt:
20791 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20792 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
20793 break;
20794 case Intrinsic::aarch64_sve_fcmpeq:
20795 case Intrinsic::aarch64_sve_cmpeq:
20797 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20798 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
20799 break;
20800 case Intrinsic::aarch64_sve_fcmpne:
20801 case Intrinsic::aarch64_sve_cmpne:
20803 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20804 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
20805 break;
20806 case Intrinsic::aarch64_sve_fcmpuo:
20808 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20809 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
20810 break;
20811 case Intrinsic::aarch64_sve_fadda:
20813 case Intrinsic::aarch64_sve_faddv:
20815 case Intrinsic::aarch64_sve_fmaxnmv:
20817 case Intrinsic::aarch64_sve_fmaxv:
20819 case Intrinsic::aarch64_sve_fminnmv:
20821 case Intrinsic::aarch64_sve_fminv:
20823 case Intrinsic::aarch64_sve_sel:
20824 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
20825 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20826 case Intrinsic::aarch64_sve_cmpeq_wide:
20827 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
20828 case Intrinsic::aarch64_sve_cmpne_wide:
20829 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
20830 case Intrinsic::aarch64_sve_cmpge_wide:
20831 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
20832 case Intrinsic::aarch64_sve_cmpgt_wide:
20833 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
20834 case Intrinsic::aarch64_sve_cmplt_wide:
20835 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
20836 case Intrinsic::aarch64_sve_cmple_wide:
20837 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
20838 case Intrinsic::aarch64_sve_cmphs_wide:
20839 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
20840 case Intrinsic::aarch64_sve_cmphi_wide:
20841 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
20842 case Intrinsic::aarch64_sve_cmplo_wide:
20843 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
20844 case Intrinsic::aarch64_sve_cmpls_wide:
20845 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
20846 case Intrinsic::aarch64_sve_ptest_any:
20847 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20849 case Intrinsic::aarch64_sve_ptest_first:
20850 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20852 case Intrinsic::aarch64_sve_ptest_last:
20853 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20855 }
20856 return SDValue();
20857}
20858
20859static bool isCheapToExtend(const SDValue &N) {
20860 unsigned OC = N->getOpcode();
20861 return OC == ISD::LOAD || OC == ISD::MLOAD ||
20863}
20864
20865static SDValue
20867 SelectionDAG &DAG) {
20868 // If we have (sext (setcc A B)) and A and B are cheap to extend,
20869 // we can move the sext into the arguments and have the same result. For
20870 // example, if A and B are both loads, we can make those extending loads and
20871 // avoid an extra instruction. This pattern appears often in VLS code
20872 // generation where the inputs to the setcc have a different size to the
20873 // instruction that wants to use the result of the setcc.
20874 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
20875 N->getOperand(0)->getOpcode() == ISD::SETCC);
20876 const SDValue SetCC = N->getOperand(0);
20877
20878 const SDValue CCOp0 = SetCC.getOperand(0);
20879 const SDValue CCOp1 = SetCC.getOperand(1);
20880 if (!CCOp0->getValueType(0).isInteger() ||
20881 !CCOp1->getValueType(0).isInteger())
20882 return SDValue();
20883
20884 ISD::CondCode Code =
20885 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
20886
20887 ISD::NodeType ExtType =
20888 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
20889
20890 if (isCheapToExtend(SetCC.getOperand(0)) &&
20891 isCheapToExtend(SetCC.getOperand(1))) {
20892 const SDValue Ext1 =
20893 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
20894 const SDValue Ext2 =
20895 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
20896
20897 return DAG.getSetCC(
20898 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
20899 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
20900 }
20901
20902 return SDValue();
20903}
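// A minimal sketch of the intent (assumed types, not a verbatim test):
//   sext <8 x i16> (setcc (load <8 x i8> a), (load <8 x i8> b), setlt)
// becomes a setcc between two sign-extended (ultimately extending-load)
// operands, so no separate extension of the compare result is required.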
20904
20907 SelectionDAG &DAG) {
20908 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
20909 // we can convert that DUP into another extract_high (of a bigger DUP), which
20910 // helps the backend to decide that an sabdl2 would be useful, saving a real
20911 // extract_high operation.
20912 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
20913 (N->getOperand(0).getOpcode() == ISD::ABDU ||
20914 N->getOperand(0).getOpcode() == ISD::ABDS)) {
20915 SDNode *ABDNode = N->getOperand(0).getNode();
20916 SDValue NewABD =
20917 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
20918 if (!NewABD.getNode())
20919 return SDValue();
20920
20921 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
20922 }
20923
20924 if (N->getValueType(0).isFixedLengthVector() &&
20925 N->getOpcode() == ISD::SIGN_EXTEND &&
20926 N->getOperand(0)->getOpcode() == ISD::SETCC)
20927 return performSignExtendSetCCCombine(N, DCI, DAG);
20928
20929 return SDValue();
20930}
20931
20933 SDValue SplatVal, unsigned NumVecElts) {
20934 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
20935 Align OrigAlignment = St.getAlign();
20936 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
20937
20938 // Create scalar stores. This is at least as good as the code sequence for a
20939 // split unaligned store which is a dup.s, ext.b, and two stores.
20940 // Most of the time the three stores should be replaced by store pair
20941 // instructions (stp).
20942 SDLoc DL(&St);
20943 SDValue BasePtr = St.getBasePtr();
20944 uint64_t BaseOffset = 0;
20945
20946 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
20947 SDValue NewST1 =
20948 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
20949 OrigAlignment, St.getMemOperand()->getFlags());
20950
20951 // As this is in ISel, we will not merge this add, which may degrade results.
20952 if (BasePtr->getOpcode() == ISD::ADD &&
20953 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
20954 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
20955 BasePtr = BasePtr->getOperand(0);
20956 }
20957
20958 unsigned Offset = EltOffset;
20959 while (--NumVecElts) {
20960 Align Alignment = commonAlignment(OrigAlignment, Offset);
20961 SDValue OffsetPtr =
20962 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
20963 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
20964 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
20965 PtrInfo.getWithOffset(Offset), Alignment,
20966 St.getMemOperand()->getFlags());
20967 Offset += EltOffset;
20968 }
20969 return NewST1;
20970}
20971
20972// Returns an SVE type that ContentTy can be trivially sign or zero extended
20973// into.
20974static MVT getSVEContainerType(EVT ContentTy) {
20975 assert(ContentTy.isSimple() && "No SVE containers for extended types");
20976
20977 switch (ContentTy.getSimpleVT().SimpleTy) {
20978 default:
20979 llvm_unreachable("No known SVE container for this MVT type");
20980 case MVT::nxv2i8:
20981 case MVT::nxv2i16:
20982 case MVT::nxv2i32:
20983 case MVT::nxv2i64:
20984 case MVT::nxv2f32:
20985 case MVT::nxv2f64:
20986 return MVT::nxv2i64;
20987 case MVT::nxv4i8:
20988 case MVT::nxv4i16:
20989 case MVT::nxv4i32:
20990 case MVT::nxv4f32:
20991 return MVT::nxv4i32;
20992 case MVT::nxv8i8:
20993 case MVT::nxv8i16:
20994 case MVT::nxv8f16:
20995 case MVT::nxv8bf16:
20996 return MVT::nxv8i16;
20997 case MVT::nxv16i8:
20998 return MVT::nxv16i8;
20999 }
21000}
21001
21002static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
21003 SDLoc DL(N);
21004 EVT VT = N->getValueType(0);
21005
21007 return SDValue();
21008
21009 EVT ContainerVT = VT;
21010 if (ContainerVT.isInteger())
21011 ContainerVT = getSVEContainerType(ContainerVT);
21012
21013 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
21014 SDValue Ops[] = { N->getOperand(0), // Chain
21015 N->getOperand(2), // Pg
21016 N->getOperand(3), // Base
21017 DAG.getValueType(VT) };
21018
21019 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
21020 SDValue LoadChain = SDValue(Load.getNode(), 1);
21021
21022 if (ContainerVT.isInteger() && (VT != ContainerVT))
21023 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
21024
21025 return DAG.getMergeValues({ Load, LoadChain }, DL);
21026}
21027
21029 SDLoc DL(N);
21030 EVT VT = N->getValueType(0);
21031 EVT PtrTy = N->getOperand(3).getValueType();
21032
21033 EVT LoadVT = VT;
21034 if (VT.isFloatingPoint())
21035 LoadVT = VT.changeTypeToInteger();
21036
21037 auto *MINode = cast<MemIntrinsicSDNode>(N);
21038 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
21039 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
21040 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
21041 MINode->getOperand(2), PassThru,
21042 MINode->getMemoryVT(), MINode->getMemOperand(),
21044
21045 if (VT.isFloatingPoint()) {
21046 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
21047 return DAG.getMergeValues(Ops, DL);
21048 }
21049
21050 return L;
21051}
21052
21053template <unsigned Opcode>
21055 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
21057 "Unsupported opcode.");
21058 SDLoc DL(N);
21059 EVT VT = N->getValueType(0);
21060
21061 EVT LoadVT = VT;
21062 if (VT.isFloatingPoint())
21063 LoadVT = VT.changeTypeToInteger();
21064
21065 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
21066 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
21067 SDValue LoadChain = SDValue(Load.getNode(), 1);
21068
21069 if (VT.isFloatingPoint())
21070 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
21071
21072 return DAG.getMergeValues({Load, LoadChain}, DL);
21073}
21074
21076 SDLoc DL(N);
21077 SDValue Data = N->getOperand(2);
21078 EVT DataVT = Data.getValueType();
21079 EVT HwSrcVt = getSVEContainerType(DataVT);
21080 SDValue InputVT = DAG.getValueType(DataVT);
21081
21082 if (DataVT.isFloatingPoint())
21083 InputVT = DAG.getValueType(HwSrcVt);
21084
21085 SDValue SrcNew;
21086 if (Data.getValueType().isFloatingPoint())
21087 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
21088 else
21089 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
21090
21091 SDValue Ops[] = { N->getOperand(0), // Chain
21092 SrcNew,
21093 N->getOperand(4), // Base
21094 N->getOperand(3), // Pg
21095 InputVT
21096 };
21097
21098 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
21099}
21100
21102 SDLoc DL(N);
21103
21104 SDValue Data = N->getOperand(2);
21105 EVT DataVT = Data.getValueType();
21106 EVT PtrTy = N->getOperand(4).getValueType();
21107
21108 if (DataVT.isFloatingPoint())
21110
21111 auto *MINode = cast<MemIntrinsicSDNode>(N);
21112 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
21113 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
21114 MINode->getMemoryVT(), MINode->getMemOperand(),
21115 ISD::UNINDEXED, false, false);
21116}
21117
21118/// Replace a store of a splat of zeros with scalar stores of WZR/XZR. The
21119/// load store optimizer pass will merge them to store pair stores. This should
21120/// be better than a movi to create the vector zero followed by a vector store
21121/// if the zero constant is not re-used, since one instruction and one register
21122/// live range will be removed.
21123///
21124/// For example, the final generated code should be:
21125///
21126/// stp xzr, xzr, [x0]
21127///
21128/// instead of:
21129///
21130/// movi v0.2d, #0
21131/// str q0, [x0]
21132///
21134 SDValue StVal = St.getValue();
21135 EVT VT = StVal.getValueType();
21136
21137 // Avoid scalarizing zero splat stores for scalable vectors.
21138 if (VT.isScalableVector())
21139 return SDValue();
21140
21141 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
21142 // 2, 3 or 4 i32 elements.
21143 int NumVecElts = VT.getVectorNumElements();
21144 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
21145 VT.getVectorElementType().getSizeInBits() == 64) ||
21146 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
21147 VT.getVectorElementType().getSizeInBits() == 32)))
21148 return SDValue();
21149
21150 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
21151 return SDValue();
21152
21153 // If the zero constant has more than one use then the vector store could be
21154 // better since the constant mov will be amortized and stp q instructions
21155 // should be able to be formed.
21156 if (!StVal.hasOneUse())
21157 return SDValue();
21158
21159 // If the store is truncating then it's going down to i16 or smaller, which
21160 // means it can be implemented in a single store anyway.
21161 if (St.isTruncatingStore())
21162 return SDValue();
21163
21164 // If the immediate offset of the address operand is too large for the stp
21165 // instruction, then bail out.
21166 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
21167 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
21168 if (Offset < -512 || Offset > 504)
21169 return SDValue();
21170 }
21171
21172 for (int I = 0; I < NumVecElts; ++I) {
21173 SDValue EltVal = StVal.getOperand(I);
21174 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
21175 return SDValue();
21176 }
21177
21178 // Use a CopyFromReg WZR/XZR here to prevent
21179 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
21180 SDLoc DL(&St);
21181 unsigned ZeroReg;
21182 EVT ZeroVT;
21183 if (VT.getVectorElementType().getSizeInBits() == 32) {
21184 ZeroReg = AArch64::WZR;
21185 ZeroVT = MVT::i32;
21186 } else {
21187 ZeroReg = AArch64::XZR;
21188 ZeroVT = MVT::i64;
21189 }
21190 SDValue SplatVal =
21191 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
21192 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21193}
21194
21195/// Replace a store of a splat of a scalar with scalar stores of the scalar
21196/// value. The load store optimizer pass will merge them to store pair stores.
21197/// This has better performance than a splat of the scalar followed by a split
21198/// vector store. Even if the stores are not merged it is four stores vs a dup,
21199/// followed by an ext.b and two stores.
21201 SDValue StVal = St.getValue();
21202 EVT VT = StVal.getValueType();
21203
21204 // Don't replace floating point stores, they possibly won't be transformed to
21205 // stp because of the store pair suppress pass.
21206 if (VT.isFloatingPoint())
21207 return SDValue();
21208
21209 // We can express a splat as store pair(s) for 2 or 4 elements.
21210 unsigned NumVecElts = VT.getVectorNumElements();
21211 if (NumVecElts != 4 && NumVecElts != 2)
21212 return SDValue();
21213
21214 // If the store is truncating then it's going down to i16 or smaller, which
21215 // means it can be implemented in a single store anyway.
21216 if (St.isTruncatingStore())
21217 return SDValue();
21218
21219 // Check that this is a splat.
21220 // Make sure that each of the relevant vector element locations are inserted
21221 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
21222 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
21223 SDValue SplatVal;
21224 for (unsigned I = 0; I < NumVecElts; ++I) {
21225 // Check for insert vector elements.
21226 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
21227 return SDValue();
21228
21229 // Check that same value is inserted at each vector element.
21230 if (I == 0)
21231 SplatVal = StVal.getOperand(1);
21232 else if (StVal.getOperand(1) != SplatVal)
21233 return SDValue();
21234
21235 // Check insert element index.
21236 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
21237 if (!CIndex)
21238 return SDValue();
21239 uint64_t IndexVal = CIndex->getZExtValue();
21240 if (IndexVal >= NumVecElts)
21241 return SDValue();
21242 IndexNotInserted.reset(IndexVal);
21243
21244 StVal = StVal.getOperand(0);
21245 }
21246 // Check that all vector element locations were inserted to.
21247 if (IndexNotInserted.any())
21248 return SDValue();
21249
21250 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21251}
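// Illustrative outcome (assumed input): a v4i32 store of a splat of w1 to
// [x0] is emitted as four scalar stores, which the load/store optimiser can
// then pair into
//   stp w1, w1, [x0]
//   stp w1, w1, [x0, #8]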
21252
21254 SelectionDAG &DAG,
21255 const AArch64Subtarget *Subtarget) {
21256
21257 StoreSDNode *S = cast<StoreSDNode>(N);
21258 if (S->isVolatile() || S->isIndexed())
21259 return SDValue();
21260
21261 SDValue StVal = S->getValue();
21262 EVT VT = StVal.getValueType();
21263
21264 if (!VT.isFixedLengthVector())
21265 return SDValue();
21266
21267 // If we get a splat of zeros, convert this vector store to a store of
21268 // scalars. They will be merged into store pairs of xzr thereby removing one
21269 // instruction and one register.
21270 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
21271 return ReplacedZeroSplat;
21272
21273 // FIXME: The logic for deciding if an unaligned store should be split should
21274 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
21275 // a call to that function here.
21276
21277 if (!Subtarget->isMisaligned128StoreSlow())
21278 return SDValue();
21279
21280 // Don't split at -Oz.
21282 return SDValue();
21283
21284 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
21285 // those up regresses performance on micro-benchmarks and olden/bh.
21286 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
21287 return SDValue();
21288
21289 // Split unaligned 16B stores. They are terrible for performance.
21290 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
21291 // extensions can use this to mark that it does not want splitting to happen
21292 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
21293 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
21294 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
21295 S->getAlign() <= Align(2))
21296 return SDValue();
21297
21298 // If we get a splat of a scalar convert this vector store to a store of
21299 // scalars. They will be merged into store pairs thereby removing two
21300 // instructions.
21301 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
21302 return ReplacedSplat;
21303
21304 SDLoc DL(S);
21305
21306 // Split VT into two.
21307 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
21308 unsigned NumElts = HalfVT.getVectorNumElements();
21309 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21310 DAG.getConstant(0, DL, MVT::i64));
21311 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21312 DAG.getConstant(NumElts, DL, MVT::i64));
21313 SDValue BasePtr = S->getBasePtr();
21314 SDValue NewST1 =
21315 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
21316 S->getAlign(), S->getMemOperand()->getFlags());
21317 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
21318 DAG.getConstant(8, DL, MVT::i64));
21319 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
21320 S->getPointerInfo(), S->getAlign(),
21321 S->getMemOperand()->getFlags());
21322}
21323
21324static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
21325 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
21326
21327 // splice(pg, op1, undef) -> op1
21328 if (N->getOperand(2).isUndef())
21329 return N->getOperand(1);
21330
21331 return SDValue();
21332}
21333
21334static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
21335 const AArch64Subtarget *Subtarget) {
21336 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
21337 N->getOpcode() == AArch64ISD::UUNPKLO) &&
21338 "Unexpected Opcode!");
21339
21340 // uunpklo/hi undef -> undef
21341 if (N->getOperand(0).isUndef())
21342 return DAG.getUNDEF(N->getValueType(0));
21343
21344 // If this is a masked load followed by an UUNPKLO, fold this into a masked
21345 // extending load. We can do this even if this is already a masked
21346 // {z,}extload.
21347 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
21348 N->getOpcode() == AArch64ISD::UUNPKLO) {
21349 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
21350 SDValue Mask = MLD->getMask();
21351 SDLoc DL(N);
21352
21353 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
21354 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
21355 (MLD->getPassThru()->isUndef() ||
21356 isZerosVector(MLD->getPassThru().getNode()))) {
21357 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
21358 unsigned PgPattern = Mask->getConstantOperandVal(0);
21359 EVT VT = N->getValueType(0);
21360
21361 // Ensure we can double the size of the predicate pattern
21362 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
21363 if (NumElts &&
21364 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
21365 Mask =
21366 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
21367 SDValue PassThru = DAG.getConstant(0, DL, VT);
21368 SDValue NewLoad = DAG.getMaskedLoad(
21369 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
21370 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
21371 MLD->getAddressingMode(), ISD::ZEXTLOAD);
21372
21373 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
21374
21375 return NewLoad;
21376 }
21377 }
21378 }
21379
21380 return SDValue();
21381}
21382
21383static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N) {
21384 if (N->getOpcode() != AArch64ISD::UZP1)
21385 return false;
21386 SDValue Op0 = N->getOperand(0);
21387 EVT SrcVT = Op0->getValueType(0);
21388 EVT DstVT = N->getValueType(0);
21389 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
21390 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
21391 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
21392}
21393
21394// Try to combine rounding shifts where the operands come from an extend, and
21395// the result is truncated and combined into one vector.
21396// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
21397static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
21398 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
21399 SDValue Op0 = N->getOperand(0);
21400 SDValue Op1 = N->getOperand(1);
21401 EVT ResVT = N->getValueType(0);
21402
21403 unsigned RshOpc = Op0.getOpcode();
21404 if (RshOpc != AArch64ISD::RSHRNB_I)
21405 return SDValue();
21406
21407 // Same op code and imm value?
21408 SDValue ShiftValue = Op0.getOperand(1);
21409 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
21410 return SDValue();
21411
21412 // Same unextended operand value?
21413 SDValue Lo = Op0.getOperand(0);
21414 SDValue Hi = Op1.getOperand(0);
21415 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
21416 Hi.getOpcode() != AArch64ISD::UUNPKHI)
21417 return SDValue();
21418 SDValue OrigArg = Lo.getOperand(0);
21419 if (OrigArg != Hi.getOperand(0))
21420 return SDValue();
21421
21422 SDLoc DL(N);
21423 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
21424 getPredicateForVector(DAG, DL, ResVT), OrigArg,
21425 ShiftValue);
21426}
21427
21428// Try to simplify:
21429// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
21430// t2 = nxv8i16 srl(t1, ShiftValue)
21431// to
21432// t1 = nxv8i16 rshrnb(X, shiftvalue).
21433// rshrnb will zero the top half bits of each element. Therefore, this combine
21434// should only be performed when a following instruction with the rshrnb
21435// as an operand does not care about the top half of each element. For example,
21436// a uzp1 or a truncating store.
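// A rough example, assuming X : nxv8i16 and ShiftValue == 5:
//   t1 = nxv8i16 add(X, splat(1 << 4))
//   t2 = nxv8i16 srl(t1, splat(5))
// becomes rshrnb(X, 5); the narrowed results sit in the low half of each i16
// element with the top half zeroed, as required by the callers noted above.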
21437static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
21438 const AArch64Subtarget *Subtarget) {
21439 EVT VT = Srl->getValueType(0);
21440 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
21441 return SDValue();
21442
21443 EVT ResVT;
21444 if (VT == MVT::nxv8i16)
21445 ResVT = MVT::nxv16i8;
21446 else if (VT == MVT::nxv4i32)
21447 ResVT = MVT::nxv8i16;
21448 else if (VT == MVT::nxv2i64)
21449 ResVT = MVT::nxv4i32;
21450 else
21451 return SDValue();
21452
21453 SDLoc DL(Srl);
21454 unsigned ShiftValue;
21455 SDValue RShOperand;
21456 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
21457 return SDValue();
21458 SDValue Rshrnb = DAG.getNode(
21459 AArch64ISD::RSHRNB_I, DL, ResVT,
21460 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
21461 return DAG.getNode(ISD::BITCAST, DL, VT, Rshrnb);
21462}
21463
21464static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
21465 const AArch64Subtarget *Subtarget) {
21466 SDLoc DL(N);
21467 SDValue Op0 = N->getOperand(0);
21468 SDValue Op1 = N->getOperand(1);
21469 EVT ResVT = N->getValueType(0);
21470
21471 // uzp1(x, undef) -> concat(truncate(x), undef)
21472 if (Op1.getOpcode() == ISD::UNDEF) {
21473 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
21474 switch (ResVT.getSimpleVT().SimpleTy) {
21475 default:
21476 break;
21477 case MVT::v16i8:
21478 BCVT = MVT::v8i16;
21479 HalfVT = MVT::v8i8;
21480 break;
21481 case MVT::v8i16:
21482 BCVT = MVT::v4i32;
21483 HalfVT = MVT::v4i16;
21484 break;
21485 case MVT::v4i32:
21486 BCVT = MVT::v2i64;
21487 HalfVT = MVT::v2i32;
21488 break;
21489 }
21490 if (BCVT != MVT::Other) {
21491 SDValue BC = DAG.getBitcast(BCVT, Op0);
21492 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
21493 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
21494 DAG.getUNDEF(HalfVT));
21495 }
21496 }
21497
21498 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
21499 return Urshr;
21500
21501 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op0, DAG, Subtarget))
21502 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
21503
21504 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op1, DAG, Subtarget))
21505 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
21506
21507 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
21508 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
21509 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21510 SDValue X = Op0.getOperand(0).getOperand(0);
21511 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
21512 }
21513 }
21514
21515 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
21516 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
21517 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21518 SDValue Z = Op1.getOperand(0).getOperand(1);
21519 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
21520 }
21521 }
21522
21523 // These optimizations only work on little endian.
21524 if (!DAG.getDataLayout().isLittleEndian())
21525 return SDValue();
21526
21527 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
21528 // Example:
21529 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
21530 // to
21531 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
21532 if (isHalvingTruncateAndConcatOfLegalIntScalableType(N) &&
21533 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
21534 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
21535 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
21536 Op1.getOperand(0));
21537 }
21538 }
21539
21540 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
21541 return SDValue();
21542
21543 SDValue SourceOp0 = peekThroughBitcasts(Op0);
21544 SDValue SourceOp1 = peekThroughBitcasts(Op1);
21545
21546 // truncating uzp1(x, y) -> xtn(concat (x, y))
21547 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
21548 EVT Op0Ty = SourceOp0.getValueType();
21549 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
21550 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
21551 SDValue Concat =
21552 DAG.getNode(ISD::CONCAT_VECTORS, DL,
21553 Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
21554 SourceOp0, SourceOp1);
21555 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
21556 }
21557 }
21558
21559 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
21560 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
21561 SourceOp1.getOpcode() != ISD::TRUNCATE)
21562 return SDValue();
21563 SourceOp0 = SourceOp0.getOperand(0);
21564 SourceOp1 = SourceOp1.getOperand(0);
21565
21566 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
21567 !SourceOp0.getValueType().isSimple())
21568 return SDValue();
21569
21570 EVT ResultTy;
21571
21572 switch (SourceOp0.getSimpleValueType().SimpleTy) {
21573 case MVT::v2i64:
21574 ResultTy = MVT::v4i32;
21575 break;
21576 case MVT::v4i32:
21577 ResultTy = MVT::v8i16;
21578 break;
21579 case MVT::v8i16:
21580 ResultTy = MVT::v16i8;
21581 break;
21582 default:
21583 return SDValue();
21584 }
21585
21586 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
21587 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
21588 SDValue UzpResult =
21589 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
21590
21591 EVT BitcastResultTy;
21592
21593 switch (ResVT.getSimpleVT().SimpleTy) {
21594 case MVT::v2i32:
21595 BitcastResultTy = MVT::v2i64;
21596 break;
21597 case MVT::v4i16:
21598 BitcastResultTy = MVT::v4i32;
21599 break;
21600 case MVT::v8i8:
21601 BitcastResultTy = MVT::v8i16;
21602 break;
21603 default:
21604 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
21605 }
21606
21607 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
21608 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
21609}
21610
21611static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
21612 unsigned Opc = N->getOpcode();
21613
21614 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
21615 Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
21616 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
21617 Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
21618 "Invalid opcode.");
21619
21620 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
21621 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
21622 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
21623 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
21624 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
21625 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
21626 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
21627 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
21628
21629 SDLoc DL(N);
21630 SDValue Chain = N->getOperand(0);
21631 SDValue Pg = N->getOperand(1);
21632 SDValue Base = N->getOperand(2);
21633 SDValue Offset = N->getOperand(3);
21634 SDValue Ty = N->getOperand(4);
21635
21636 EVT ResVT = N->getValueType(0);
21637
21638 const auto OffsetOpc = Offset.getOpcode();
21639 const bool OffsetIsZExt =
21640 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
21641 const bool OffsetIsSExt =
21642 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
21643
21644 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
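  // For example, a gather whose 64-bit offsets are produced by sign-extending
  // 32-bit offsets under the same predicate can instead use the SXTW form of
  // the gather on the unextended offsets, removing the separate extend node.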
21645 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
21646 SDValue ExtPg = Offset.getOperand(0);
21647 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
21648 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
21649
21650 // If the predicate for the sign- or zero-extended offset is the
21651 // same as the predicate used for this load and the sign-/zero-extension
21652 // was from 32 bits, fold the extension into the gather.
21653 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
21654 SDValue UnextendedOffset = Offset.getOperand(1);
21655
21656 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
21657 if (Signed)
21658 NewOpc = getSignExtendedGatherOpcode(NewOpc);
21659
21660 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
21661 {Chain, Pg, Base, UnextendedOffset, Ty});
21662 }
21663 }
21664
21665 return SDValue();
21666}
21667
21668/// Optimize a vector shift instruction and its operand if the shifted-out
21669/// bits are not used.
21670static SDValue performVectorShiftCombine(SDNode *N,
21671 const AArch64TargetLowering &TLI,
21672 TargetLowering::DAGCombinerInfo &DCI) {
21673 assert(N->getOpcode() == AArch64ISD::VASHR ||
21674 N->getOpcode() == AArch64ISD::VLSHR);
21675
21676 SDValue Op = N->getOperand(0);
21677 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
21678
21679 unsigned ShiftImm = N->getConstantOperandVal(1);
21680 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
21681
21682 // Remove sign_extend_inreg (ashr(shl(x))) based on the number of sign bits.
21683 if (N->getOpcode() == AArch64ISD::VASHR &&
21684 Op.getOpcode() == AArch64ISD::VSHL &&
21685 N->getOperand(1) == Op.getOperand(1))
21686 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
21687 return Op.getOperand(0);
21688
21689 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
21690 APInt DemandedMask = ~ShiftedOutBits;
21691
21692 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
21693 return SDValue(N, 0);
21694
21695 return SDValue();
21696}
21697
21698static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
21699 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
21700 // This transform works in partnership with performSetCCPunpkCombine to
21701 // remove unnecessary transfer of predicates into standard registers and back
21702 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
21703 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
21704 MVT::i1) {
21705 SDValue CC = N->getOperand(0)->getOperand(0);
21706 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
21707 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
21708 DAG.getVectorIdxConstant(0, SDLoc(N)));
21709 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
21710 }
21711
21712 return SDValue();
21713}
21714
21715/// Target-specific DAG combine function for post-increment LD1 (lane) and
21716/// post-increment LD1R.
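/// For example, a scalar load that feeds a DUP (or an INSERT_VECTOR_ELT lane)
/// and whose address is also incremented by the element size can be selected
/// as a single post-increment form such as "ld1r { v0.4s }, [x0], #4".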
21717static SDValue performPostLD1Combine(SDNode *N,
21718 TargetLowering::DAGCombinerInfo &DCI,
21719 bool IsLaneOp) {
21720 if (DCI.isBeforeLegalizeOps())
21721 return SDValue();
21722
21723 SelectionDAG &DAG = DCI.DAG;
21724 EVT VT = N->getValueType(0);
21725
21726 if (!VT.is128BitVector() && !VT.is64BitVector())
21727 return SDValue();
21728
21729 unsigned LoadIdx = IsLaneOp ? 1 : 0;
21730 SDNode *LD = N->getOperand(LoadIdx).getNode();
21731 // If it is not a LOAD, we cannot do this combine.
21732 if (LD->getOpcode() != ISD::LOAD)
21733 return SDValue();
21734
21735 // The vector lane must be a constant in the LD1LANE opcode.
21736 SDValue Lane;
21737 if (IsLaneOp) {
21738 Lane = N->getOperand(2);
21739 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
21740 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
21741 return SDValue();
21742 }
21743
21744 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
21745 EVT MemVT = LoadSDN->getMemoryVT();
21746 // Check if memory operand is the same type as the vector element.
21747 if (MemVT != VT.getVectorElementType())
21748 return SDValue();
21749
21750 // Check if there are other uses. If so, do not combine as it will introduce
21751 // an extra load.
21752 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
21753 ++UI) {
21754 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
21755 continue;
21756 if (*UI != N)
21757 return SDValue();
21758 }
21759
21760 // If there is one use and it can splat the value, prefer that operation.
21761 // TODO: This could be expanded to more operations if they reliably use the
21762 // index variants.
21763 if (N->hasOneUse()) {
21764 unsigned UseOpc = N->use_begin()->getOpcode();
21765 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
21766 return SDValue();
21767 }
21768
21769 SDValue Addr = LD->getOperand(1);
21770 SDValue Vector = N->getOperand(0);
21771 // Search for a use of the address operand that is an increment.
21772 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
21773 Addr.getNode()->use_end(); UI != UE; ++UI) {
21774 SDNode *User = *UI;
21775 if (User->getOpcode() != ISD::ADD
21776 || UI.getUse().getResNo() != Addr.getResNo())
21777 continue;
21778
21779 // If the increment is a constant, it must match the memory ref size.
21780 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
21781 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
21782 uint32_t IncVal = CInc->getZExtValue();
21783 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
21784 if (IncVal != NumBytes)
21785 continue;
21786 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
21787 }
21788
21789 // To avoid creating a cycle, make sure that neither the load nor the add
21790 // is a predecessor of the other or of the Vector.
21791 SmallPtrSet<const SDNode *, 32> Visited;
21792 SmallVector<const SDNode *, 16> Worklist;
21793 Visited.insert(Addr.getNode());
21794 Worklist.push_back(User);
21795 Worklist.push_back(LD);
21796 Worklist.push_back(Vector.getNode());
21797 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
21798 SDNode::hasPredecessorHelper(User, Visited, Worklist))
21799 continue;
21800
21801 SmallVector<SDValue, 8> Ops;
21802 Ops.push_back(LD->getOperand(0)); // Chain
21803 if (IsLaneOp) {
21804 Ops.push_back(Vector); // The vector to be inserted
21805 Ops.push_back(Lane); // The lane to be inserted in the vector
21806 }
21807 Ops.push_back(Addr);
21808 Ops.push_back(Inc);
21809
21810 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
21811 SDVTList SDTys = DAG.getVTList(Tys);
21812 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
21813 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
21814 MemVT,
21815 LoadSDN->getMemOperand());
21816
21817 // Update the uses.
21818 SDValue NewResults[] = {
21819 SDValue(LD, 0), // The result of load
21820 SDValue(UpdN.getNode(), 2) // Chain
21821 };
21822 DCI.CombineTo(LD, NewResults);
21823 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
21824 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
21825
21826 break;
21827 }
21828 return SDValue();
21829}
21830
21831/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
21832/// address translation.
21833static bool performTBISimplification(SDValue Addr,
21834 TargetLowering::DAGCombinerInfo &DCI,
21835 SelectionDAG &DAG) {
21836 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
21837 KnownBits Known;
21838 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
21839 !DCI.isBeforeLegalizeOps());
21840 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21841 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
21842 DCI.CommitTargetLoweringOpt(TLO);
21843 return true;
21844 }
21845 return false;
21846}
21847
21848static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
21849 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
21850 "Expected STORE dag node in input!");
21851
21852 if (auto Store = dyn_cast<StoreSDNode>(N)) {
21853 if (!Store->isTruncatingStore() || Store->isIndexed())
21854 return SDValue();
21855 SDValue Ext = Store->getValue();
21856 auto ExtOpCode = Ext.getOpcode();
21857 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
21858 ExtOpCode != ISD::ANY_EXTEND)
21859 return SDValue();
21860 SDValue Orig = Ext->getOperand(0);
21861 if (Store->getMemoryVT() != Orig.getValueType())
21862 return SDValue();
21863 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
21864 Store->getBasePtr(), Store->getMemOperand());
21865 }
21866
21867 return SDValue();
21868}
21869
21870// A custom combine to lower load <3 x i8> as the more efficient sequence
21871// below:
21872// ldrb wX, [x0, #2]
21873// ldrh wY, [x0]
21874// orr wX, wY, wX, lsl #16
21875// fmov s0, wX
21876//
21877// Note that an alternative sequence with even fewer (although usually more
21878// complex/expensive) instructions would be:
21879// ld1r.4h { v0 }, [x0], #2
21880// ld1.b { v0 }[2], [x0]
21881//
21882// Generating this sequence unfortunately results in noticeably worse codegen
21883// for code that extends the loaded v3i8, due to legalization breaking vector
21884// shuffle detection in a way that is very difficult to work around.
21885// TODO: Revisit once v3i8 legalization has been improved in general.
21886static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
21887 EVT MemVT = LD->getMemoryVT();
21888 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
21889 LD->getOriginalAlign() >= 4)
21890 return SDValue();
21891
21892 SDLoc DL(LD);
21893 MachineFunction &MF = DAG.getMachineFunction();
21894 SDValue Chain = LD->getChain();
21895 SDValue BasePtr = LD->getBasePtr();
21896 MachineMemOperand *MMO = LD->getMemOperand();
21897 assert(LD->getOffset().isUndef() && "undef offset expected");
21898
21899 // Load 2 x i8, then 1 x i8.
21900 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
21901 TypeSize Offset2 = TypeSize::getFixed(2);
21902 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
21903 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
21904 MF.getMachineMemOperand(MMO, 2, 1));
21905
21906 // Extend to i32.
21907 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
21908 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
21909
21910 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
21911 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
21912 DAG.getConstant(16, DL, MVT::i32));
21913 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
21914 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
21915
21916 // Extract v3i8 again.
21917 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
21918 DAG.getConstant(0, DL, MVT::i64));
21919 SDValue TokenFactor = DAG.getNode(
21920 ISD::TokenFactor, DL, MVT::Other,
21921 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
21922 return DAG.getMergeValues({Extract, TokenFactor}, DL);
21923}
21924
21925// Perform TBI simplification if supported by the target and try to break up
21926// non-temporal loads larger than 256 bits for odd types so that 256-bit LDNP
21927// (Q-register pair) load instructions can be selected.
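// For example (sketch), a non-temporal load of v12i32 (384 bits) becomes one
// 256-bit v8i32 load plus a 128-bit v4i32 load widened into an undef v8i32;
// the pieces are concatenated and the original v12i32 is extracted back out.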
21928static SDValue performLOADCombine(SDNode *N,
21929 TargetLowering::DAGCombinerInfo &DCI,
21930 SelectionDAG &DAG,
21931 const AArch64Subtarget *Subtarget) {
21932 if (Subtarget->supportsAddressTopByteIgnored())
21933 performTBISimplification(N->getOperand(1), DCI, DAG);
21934
21935 LoadSDNode *LD = cast<LoadSDNode>(N);
21936 if (LD->isVolatile() || !Subtarget->isLittleEndian())
21937 return SDValue(N, 0);
21938
21939 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
21940 return Res;
21941
21942 if (!LD->isNonTemporal())
21943 return SDValue(N, 0);
21944
21945 EVT MemVT = LD->getMemoryVT();
21946 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
21947 MemVT.getSizeInBits() % 256 == 0 ||
21948 256 % MemVT.getScalarSizeInBits() != 0)
21949 return SDValue(N, 0);
21950
21951 SDLoc DL(LD);
21952 SDValue Chain = LD->getChain();
21953 SDValue BasePtr = LD->getBasePtr();
21954 SDNodeFlags Flags = LD->getFlags();
21955 SmallVector<SDValue, 4> LoadOps;
21956 SmallVector<SDValue, 4> LoadOpsChain;
21957 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
21958 // and a final scalar/vector load smaller than 256 bits. This way we can
21959 // utilize 256-bit loads and reduce the number of load instructions generated.
21960 MVT NewVT =
21961 MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
21962 256 / MemVT.getVectorElementType().getSizeInBits());
21963 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
21964 // Create all 256-bit loads starting from offset 0 up to (Num256Loads - 1) * 32.
21965 for (unsigned I = 0; I < Num256Loads; I++) {
21966 unsigned PtrOffset = I * 32;
21967 SDValue NewPtr = DAG.getMemBasePlusOffset(
21968 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
21969 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
21970 SDValue NewLoad = DAG.getLoad(
21971 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
21972 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
21973 LoadOps.push_back(NewLoad);
21974 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
21975 }
21976
21977 // Process the remaining bits of the load operation.
21978 // This is done by creating an UNDEF vector to match the size of the
21979 // 256-bit loads and inserting the remaining load into it. We extract the
21980 // original load type at the end using an EXTRACT_SUBVECTOR instruction.
21981 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
21982 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
21983 MVT RemainingVT = MVT::getVectorVT(
21984 MemVT.getVectorElementType().getSimpleVT(),
21985 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
21986 SDValue NewPtr = DAG.getMemBasePlusOffset(
21987 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
21988 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
21989 SDValue RemainingLoad =
21990 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
21991 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
21992 LD->getMemOperand()->getFlags(), LD->getAAInfo());
21993 SDValue UndefVector = DAG.getUNDEF(NewVT);
21994 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
21995 SDValue ExtendedRemainingLoad =
21996 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
21997 {UndefVector, RemainingLoad, InsertIdx});
21998 LoadOps.push_back(ExtendedRemainingLoad);
21999 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
22000 EVT ConcatVT =
22001 EVT::getVectorVT(*DAG.getContext(), NewVT.getVectorElementType(),
22002 LoadOps.size() * NewVT.getVectorNumElements());
22003 SDValue ConcatVectors =
22004 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
22005 // Extract the original vector type size.
22006 SDValue ExtractSubVector =
22007 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
22008 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
22009 SDValue TokenFactor =
22010 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
22011 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
22012}
22013
22014static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
22015 EVT VecVT = Op.getValueType();
22016 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
22017 "Need boolean vector type.");
22018
22019 if (Depth > 3)
22020 return EVT();
22021
22022 // We can get the base type from a vector compare or truncate.
22023 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
22024 return Op.getOperand(0).getValueType();
22025
22026 // If an operand is a bool vector, continue looking.
22027 EVT BaseVT;
22028 for (SDValue Operand : Op->op_values()) {
22029 if (Operand.getValueType() != VecVT)
22030 continue;
22031
22032 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
22033 if (!BaseVT.isSimple())
22034 BaseVT = OperandVT;
22035 else if (OperandVT != BaseVT)
22036 return EVT();
22037 }
22038
22039 return BaseVT;
22040}
22041
22042// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
22043// iN, we can use a trick that extracts the i^th bit from the i^th element and
22044// then performs a vector add to get a scalar bitmask. This requires that each
22045// element's bits are either all 1 or all 0.
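// Worked example: for a v4i1 input <1,0,1,1>, the elements are sign-extended
// to all-ones/all-zero lanes, ANDed with the constant vector <1,2,4,8>, and a
// VECREDUCE_ADD of the result yields the scalar bitmask 0b1101 (13).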
22046static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
22047 SDLoc DL(N);
22048 SDValue ComparisonResult(N, 0);
22049 EVT VecVT = ComparisonResult.getValueType();
22050 assert(VecVT.isVector() && "Must be a vector type");
22051
22052 unsigned NumElts = VecVT.getVectorNumElements();
22053 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
22054 return SDValue();
22055
22056 if (VecVT.getVectorElementType() != MVT::i1 &&
22057 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
22058 return SDValue();
22059
22060 // If we can find the original types to work on instead of a vector of i1,
22061 // we can avoid extend/extract conversion instructions.
22062 if (VecVT.getVectorElementType() == MVT::i1) {
22063 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
22064 if (!VecVT.isSimple()) {
22065 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
22066 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
22067 }
22068 }
22069 VecVT = VecVT.changeVectorElementTypeToInteger();
22070
22071 // Large vectors don't map directly to this conversion, so to avoid too many
22072 // edge cases, we don't apply it here. The conversion will likely still be
22073 // applied later via multiple smaller vectors, whose results are concatenated.
22074 if (VecVT.getSizeInBits() > 128)
22075 return SDValue();
22076
22077 // Ensure that all elements' bits are either 0s or 1s.
22078 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
22079
22080 SmallVector<SDValue, 16> MaskConstants;
22081 if (VecVT == MVT::v16i8) {
22082 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
22083 // per entry. We split it into two halves, apply the mask, zip the halves to
22084 // create 8x 16-bit values, and then perform the vector reduce.
22085 for (unsigned Half = 0; Half < 2; ++Half) {
22086 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
22087 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
22088 }
22089 }
22090 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22091 SDValue RepresentativeBits =
22092 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22093
22094 SDValue UpperRepresentativeBits =
22095 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
22096 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
22097 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
22098 RepresentativeBits, UpperRepresentativeBits);
22099 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
22100 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
22101 }
22102
22103 // All other vector sizes.
22104 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
22105 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
22106 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
22107 }
22108
22109 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22110 SDValue RepresentativeBits =
22111 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22112 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
22113 NumElts, VecVT.getVectorElementType().getSizeInBits()));
22114 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
22115}
22116
22117static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
22118 StoreSDNode *Store) {
22119 if (!Store->isTruncatingStore())
22120 return SDValue();
22121
22122 SDLoc DL(Store);
22123 SDValue VecOp = Store->getValue();
22124 EVT VT = VecOp.getValueType();
22125 EVT MemVT = Store->getMemoryVT();
22126
22127 if (!MemVT.isVector() || !VT.isVector() ||
22128 MemVT.getVectorElementType() != MVT::i1)
22129 return SDValue();
22130
22131 // If we are storing a vector that we are currently building, let
22132 // `scalarizeVectorStore()` handle this more efficiently.
22133 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
22134 return SDValue();
22135
22136 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
22137 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
22138 if (!VectorBits)
22139 return SDValue();
22140
22141 EVT StoreVT =
22142 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
22143 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
22144 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
22145 Store->getMemOperand());
22146}
22147
22148static bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
22149 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
22150 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
22151 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
22152}
22153
22154// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
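// The wide value is bitcast to bytes below and the three live lanes are then
// written with individual byte stores at offsets 2, 1 and 0 from the base
// pointer, avoiding an illegal v3i8 store type.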
22155static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
22156 const AArch64Subtarget *Subtarget) {
22157 SDValue Value = ST->getValue();
22158 EVT ValueVT = Value.getValueType();
22159
22160 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
22161 Value.getOpcode() != ISD::TRUNCATE ||
22162 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
22163 return SDValue();
22164
22165 assert(ST->getOffset().isUndef() && "undef offset expected");
22166 SDLoc DL(ST);
22167 auto WideVT = EVT::getVectorVT(
22168 *DAG.getContext(),
22169 Value->getOperand(0).getValueType().getVectorElementType(), 4);
22170 SDValue UndefVector = DAG.getUNDEF(WideVT);
22171 SDValue WideTrunc = DAG.getNode(
22172 ISD::INSERT_SUBVECTOR, DL, WideVT,
22173 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
22174 SDValue Cast = DAG.getNode(
22175 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
22176 WideTrunc);
22177
22178 MachineFunction &MF = DAG.getMachineFunction();
22179 SDValue Chain = ST->getChain();
22180 MachineMemOperand *MMO = ST->getMemOperand();
22181 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
22182 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22183 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
22184 TypeSize Offset2 = TypeSize::getFixed(2);
22185 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
22186 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
22187
22188 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22189 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
22190 TypeSize Offset1 = TypeSize::getFixed(1);
22191 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
22192 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
22193
22194 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22195 DAG.getConstant(0, DL, MVT::i64));
22196 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
22197 MF.getMachineMemOperand(MMO, 0, 1));
22198 return Chain;
22199}
22200
22201static SDValue performSTORECombine(SDNode *N,
22202 TargetLowering::DAGCombinerInfo &DCI,
22203 SelectionDAG &DAG,
22204 const AArch64Subtarget *Subtarget) {
22205 StoreSDNode *ST = cast<StoreSDNode>(N);
22206 SDValue Chain = ST->getChain();
22207 SDValue Value = ST->getValue();
22208 SDValue Ptr = ST->getBasePtr();
22209 EVT ValueVT = Value.getValueType();
22210
22211 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
22212 EVT EltVT = VT.getVectorElementType();
22213 return EltVT == MVT::f32 || EltVT == MVT::f64;
22214 };
22215
22216 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
22217 return Res;
22218
22219 // If this is an FP_ROUND followed by a store, fold this into a truncating
22220 // store. We can do this even if this is already a truncstore.
22221 // We purposefully don't care about legality of the nodes here as we know
22222 // they can be split down into something legal.
22223 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
22224 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
22225 Subtarget->useSVEForFixedLengthVectors() &&
22226 ValueVT.isFixedLengthVector() &&
22227 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
22228 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
22229 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
22230 ST->getMemoryVT(), ST->getMemOperand());
22231
22232 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
22233 return Split;
22234
22235 if (Subtarget->supportsAddressTopByteIgnored() &&
22236 performTBISimplification(N->getOperand(2), DCI, DAG))
22237 return SDValue(N, 0);
22238
22239 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
22240 return Store;
22241
22242 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
22243 return Store;
22244
22245 if (ST->isTruncatingStore()) {
22246 EVT StoreVT = ST->getMemoryVT();
22247 if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
22248 return SDValue();
22249 if (SDValue Rshrnb =
22250 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
22251 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
22252 StoreVT, ST->getMemOperand());
22253 }
22254 }
22255
22256 return SDValue();
22257}
22258
22259static SDValue performMSTORECombine(SDNode *N,
22260 TargetLowering::DAGCombinerInfo &DCI,
22261 SelectionDAG &DAG,
22262 const AArch64Subtarget *Subtarget) {
22263 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
22264 SDValue Value = MST->getValue();
22265 SDValue Mask = MST->getMask();
22266 SDLoc DL(N);
22267
22268 // If this is a UZP1 followed by a masked store, fold this into a masked
22269 // truncating store. We can do this even if this is already a masked
22270 // truncstore.
22271 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
22272 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22273 Value.getValueType().isInteger()) {
22274 Value = Value.getOperand(0);
22275 if (Value.getOpcode() == ISD::BITCAST) {
22276 EVT HalfVT =
22277 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
22278 EVT InVT = Value.getOperand(0).getValueType();
22279
22280 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
22281 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22282 unsigned PgPattern = Mask->getConstantOperandVal(0);
22283
22284 // Ensure we can double the size of the predicate pattern
22285 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
22286 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
22287 MinSVESize) {
22288 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
22289 PgPattern);
22290 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
22291 MST->getBasePtr(), MST->getOffset(), Mask,
22292 MST->getMemoryVT(), MST->getMemOperand(),
22293 MST->getAddressingMode(),
22294 /*IsTruncating=*/true);
22295 }
22296 }
22297 }
22298 }
22299
22300 if (MST->isTruncatingStore()) {
22301 EVT ValueVT = Value->getValueType(0);
22302 EVT MemVT = MST->getMemoryVT();
22303 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
22304 return SDValue();
22305 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
22306 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
22307 MST->getOffset(), MST->getMask(),
22308 MST->getMemoryVT(), MST->getMemOperand(),
22309 MST->getAddressingMode(), true);
22310 }
22311 }
22312
22313 return SDValue();
22314}
22315
22316/// \return true if part of the index was folded into the Base.
22317static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
22318 SDLoc DL, SelectionDAG &DAG) {
22319 // This function assumes a vector of i64 indices.
22320 EVT IndexVT = Index.getValueType();
22321 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
22322 return false;
22323
22324 // Simplify:
22325 // BasePtr = Ptr
22326 // Index = X + splat(Offset)
22327 // ->
22328 // BasePtr = Ptr + Offset * scale.
22329 // Index = X
22330 if (Index.getOpcode() == ISD::ADD) {
22331 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
22332 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22333 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22334 Index = Index.getOperand(0);
22335 return true;
22336 }
22337 }
22338
22339 // Simplify:
22340 // BasePtr = Ptr
22341 // Index = (X + splat(Offset)) << splat(Shift)
22342 // ->
22343 // BasePtr = Ptr + (Offset << Shift) * scale)
22344 // Index = X << splat(shift)
22345 if (Index.getOpcode() == ISD::SHL &&
22346 Index.getOperand(0).getOpcode() == ISD::ADD) {
22347 SDValue Add = Index.getOperand(0);
22348 SDValue ShiftOp = Index.getOperand(1);
22349 SDValue OffsetOp = Add.getOperand(1);
22350 if (auto Shift = DAG.getSplatValue(ShiftOp))
22351 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
22352 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
22353 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22354 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22355 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
22356 Add.getOperand(0), ShiftOp);
22357 return true;
22358 }
22359 }
22360
22361 return false;
22362}
22363
22364// Analyse the specified address returning true if a more optimal addressing
22365// mode is available. When returning true all parameters are updated to reflect
22366// their recommended values.
22367static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
22368 SDValue &BasePtr, SDValue &Index,
22369 SelectionDAG &DAG) {
22370 // Try to iteratively fold parts of the index into the base pointer to
22371 // simplify the index as much as possible.
22372 bool Changed = false;
22373 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
22374 Changed = true;
22375
22376 // Only consider element types that are pointer sized as smaller types can
22377 // be easily promoted.
22378 EVT IndexVT = Index.getValueType();
22379 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
22380 return Changed;
22381
22382 // Can indices be trivially shrunk?
22383 EVT DataVT = N->getOperand(1).getValueType();
22384 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it
22385 // will later be re-extended to 64 bits in legalization
22386 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
22387 return Changed;
22388 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
22389 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22390 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
22391 return true;
22392 }
22393
22394 // Match:
22395 // Index = step(const)
22396 int64_t Stride = 0;
22397 if (Index.getOpcode() == ISD::STEP_VECTOR) {
22398 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
22399 }
22400 // Match:
22401 // Index = step(const) << shift(const)
22402 else if (Index.getOpcode() == ISD::SHL &&
22403 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
22404 SDValue RHS = Index.getOperand(1);
22405 if (auto *Shift =
22406 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
22407 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
22408 Stride = Step << Shift->getZExtValue();
22409 }
22410 }
22411
22412 // Return early because no supported pattern is found.
22413 if (Stride == 0)
22414 return Changed;
22415
22416 if (Stride < std::numeric_limits<int32_t>::min() ||
22417 Stride > std::numeric_limits<int32_t>::max())
22418 return Changed;
22419
22420 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
22421 unsigned MaxVScale =
22422 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
22423 int64_t LastElementOffset =
22424 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
22425
22426 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
22427 LastElementOffset > std::numeric_limits<int32_t>::max())
22428 return Changed;
22429
22430 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22431 // The stride is not scaled explicitly by 'Scale', because that scaling
22432 // happens in the gather/scatter addressing mode.
22433 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride));
22434 return true;
22435}
22436
22437static SDValue performMaskedGatherScatterCombine(
22438 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
22439 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
22440 assert(MGS && "Can only combine gather load or scatter store nodes");
22441
22442 if (!DCI.isBeforeLegalize())
22443 return SDValue();
22444
22445 SDLoc DL(MGS);
22446 SDValue Chain = MGS->getChain();
22447 SDValue Scale = MGS->getScale();
22448 SDValue Index = MGS->getIndex();
22449 SDValue Mask = MGS->getMask();
22450 SDValue BasePtr = MGS->getBasePtr();
22451 ISD::MemIndexType IndexType = MGS->getIndexType();
22452
22453 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
22454 return SDValue();
22455
22456 // A more optimal index was found above; here we change the operation's
22457 // IndexType to allow the use of an Index that's more legalisation friendly.
22458 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
22459 SDValue PassThru = MGT->getPassThru();
22460 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
22461 return DAG.getMaskedGather(
22462 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
22463 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
22464 }
22465 auto *MSC = cast<MaskedScatterSDNode>(MGS);
22466 SDValue Data = MSC->getValue();
22467 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
22468 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
22469 Ops, MSC->getMemOperand(), IndexType,
22470 MSC->isTruncatingStore());
22471}
22472
22473/// Target-specific DAG combine function for NEON load/store intrinsics
22474/// to merge base address updates.
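/// For example, an @llvm.aarch64.neon.ld2 of two v4i32 vectors followed by an
/// add of #32 to the same base pointer can be selected as the post-indexed
/// form "ld2 { v0.4s, v1.4s }, [x0], #32" (LD2post), and similarly for the
/// other structured load/store intrinsics handled below.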
22475static SDValue performNEONPostLDSTCombine(SDNode *N,
22476 TargetLowering::DAGCombinerInfo &DCI,
22477 SelectionDAG &DAG) {
22478 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
22479 return SDValue();
22480
22481 unsigned AddrOpIdx = N->getNumOperands() - 1;
22482 SDValue Addr = N->getOperand(AddrOpIdx);
22483
22484 // Search for a use of the address operand that is an increment.
22485 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
22486 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
22487 SDNode *User = *UI;
22488 if (User->getOpcode() != ISD::ADD ||
22489 UI.getUse().getResNo() != Addr.getResNo())
22490 continue;
22491
22492 // Check that the add is independent of the load/store. Otherwise, folding
22493 // it would create a cycle.
22494 SmallPtrSet<const SDNode *, 32> Visited;
22495 SmallVector<const SDNode *, 16> Worklist;
22496 Visited.insert(Addr.getNode());
22497 Worklist.push_back(N);
22498 Worklist.push_back(User);
22499 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
22500 SDNode::hasPredecessorHelper(User, Visited, Worklist))
22501 continue;
22502
22503 // Find the new opcode for the updating load/store.
22504 bool IsStore = false;
22505 bool IsLaneOp = false;
22506 bool IsDupOp = false;
22507 unsigned NewOpc = 0;
22508 unsigned NumVecs = 0;
22509 unsigned IntNo = N->getConstantOperandVal(1);
22510 switch (IntNo) {
22511 default: llvm_unreachable("unexpected intrinsic for Neon base update");
22512 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
22513 NumVecs = 2; break;
22514 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
22515 NumVecs = 3; break;
22516 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
22517 NumVecs = 4; break;
22518 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
22519 NumVecs = 2; IsStore = true; break;
22520 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
22521 NumVecs = 3; IsStore = true; break;
22522 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
22523 NumVecs = 4; IsStore = true; break;
22524 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
22525 NumVecs = 2; break;
22526 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
22527 NumVecs = 3; break;
22528 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
22529 NumVecs = 4; break;
22530 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
22531 NumVecs = 2; IsStore = true; break;
22532 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
22533 NumVecs = 3; IsStore = true; break;
22534 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
22535 NumVecs = 4; IsStore = true; break;
22536 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
22537 NumVecs = 2; IsDupOp = true; break;
22538 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
22539 NumVecs = 3; IsDupOp = true; break;
22540 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
22541 NumVecs = 4; IsDupOp = true; break;
22542 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
22543 NumVecs = 2; IsLaneOp = true; break;
22544 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
22545 NumVecs = 3; IsLaneOp = true; break;
22546 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
22547 NumVecs = 4; IsLaneOp = true; break;
22548 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
22549 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
22550 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
22551 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
22552 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
22553 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
22554 }
22555
22556 EVT VecTy;
22557 if (IsStore)
22558 VecTy = N->getOperand(2).getValueType();
22559 else
22560 VecTy = N->getValueType(0);
22561
22562 // If the increment is a constant, it must match the memory ref size.
22563 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
22564 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
22565 uint32_t IncVal = CInc->getZExtValue();
22566 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
22567 if (IsLaneOp || IsDupOp)
22568 NumBytes /= VecTy.getVectorNumElements();
22569 if (IncVal != NumBytes)
22570 continue;
22571 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
22572 }
22573 SmallVector<SDValue, 8> Ops;
22574 Ops.push_back(N->getOperand(0)); // Incoming chain
22575 // Load-lane and store operations have a vector list as input.
22576 if (IsLaneOp || IsStore)
22577 for (unsigned i = 2; i < AddrOpIdx; ++i)
22578 Ops.push_back(N->getOperand(i));
22579 Ops.push_back(Addr); // Base register
22580 Ops.push_back(Inc);
22581
22582 // Return Types.
22583 EVT Tys[6];
22584 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
22585 unsigned n;
22586 for (n = 0; n < NumResultVecs; ++n)
22587 Tys[n] = VecTy;
22588 Tys[n++] = MVT::i64; // Type of write back register
22589 Tys[n] = MVT::Other; // Type of the chain
22590 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
22591
22592 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
22593 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
22594 MemInt->getMemoryVT(),
22595 MemInt->getMemOperand());
22596
22597 // Update the uses.
22598 std::vector<SDValue> NewResults;
22599 for (unsigned i = 0; i < NumResultVecs; ++i) {
22600 NewResults.push_back(SDValue(UpdN.getNode(), i));
22601 }
22602 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
22603 DCI.CombineTo(N, NewResults);
22604 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
22605
22606 break;
22607 }
22608 return SDValue();
22609}
22610
22611// Checks to see if the value is the prescribed width and returns information
22612// about its extension mode.
22613static
22614bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
22615 ExtType = ISD::NON_EXTLOAD;
22616 switch(V.getNode()->getOpcode()) {
22617 default:
22618 return false;
22619 case ISD::LOAD: {
22620 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
22621 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
22622 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
22623 ExtType = LoadNode->getExtensionType();
22624 return true;
22625 }
22626 return false;
22627 }
22628 case ISD::AssertSext: {
22629 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
22630 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22631 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22632 ExtType = ISD::SEXTLOAD;
22633 return true;
22634 }
22635 return false;
22636 }
22637 case ISD::AssertZext: {
22638 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
22639 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22640 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22641 ExtType = ISD::ZEXTLOAD;
22642 return true;
22643 }
22644 return false;
22645 }
22646 case ISD::Constant:
22647 case ISD::TargetConstant: {
22648 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
22649 1LL << (width - 1);
22650 }
22651 }
22652
22653 return true;
22654}
22655
22656// This function does a whole lot of voodoo to determine if the tests are
22657// equivalent without and with a mask. Essentially what happens is that given a
22658// DAG resembling:
22659//
22660// +-------------+ +-------------+ +-------------+ +-------------+
22661// | Input | | AddConstant | | CompConstant| | CC |
22662// +-------------+ +-------------+ +-------------+ +-------------+
22663// | | | |
22664// V V | +----------+
22665// +-------------+ +----+ | |
22666// | ADD | |0xff| | |
22667// +-------------+ +----+ | |
22668// | | | |
22669// V V | |
22670// +-------------+ | |
22671// | AND | | |
22672// +-------------+ | |
22673// | | |
22674// +-----+ | |
22675// | | |
22676// V V V
22677// +-------------+
22678// | CMP |
22679// +-------------+
22680//
22681// The AND node may be safely removed for some combinations of inputs. In
22682// particular we need to take into account the extension type of the Input,
22683// the exact values of AddConstant, CompConstant, and CC, along with the
22684// nominal width of the input (this can work for any width of input; the
22685// above graph is specific to 8 bits).
22686//
22687// The specific equations were worked out by generating output tables for each
22688// AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The
22689// problem was simplified by working with 4-bit inputs, which means we only
22690// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
22691// extension (8,15), 8 patterns unique to sign extension (-8,-1), and 8
22692// patterns present in both extensions (0,7). For every distinct set of
22693// AddConstant and CompConstant bit patterns we can consider the masked and
22694// unmasked versions to be equivalent if the result of this function is true for
22695// all 16 distinct bit patterns for the current extension type of Input (w0).
22696//
22697// sub w8, w0, w1
22698// and w10, w8, #0x0f
22699// cmp w8, w2
22700// cset w9, AArch64CC
22701// cmp w10, w2
22702// cset w11, AArch64CC
22703// cmp w9, w11
22704// cset w0, eq
22705// ret
22706//
22707// Since the above function shows when the outputs are equivalent it defines
22708// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
22709// would be expensive to run during compiles. The equations below were written
22710// in a test harness that confirmed they gave outputs equivalent to the above
22711// function for all inputs, so they can be used to determine if the removal is
22712// legal instead.
22713//
22714// isEquivalentMaskless() is the test for whether the AND can be removed,
22715// factored out of the DAG recognition since the DAG can take several forms.
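// For example, with a zero-extended i8 input the DAG
//   cmp (and (add w0, #AddConstant), #0xff), #CompConstant
// may drop the AND and compare (add w0, #AddConstant) directly, but only when
// isEquivalentMaskless() below proves the flags agree for every possible
// value of w0 under the given condition code.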
22716
22717static bool isEquivalentMaskless(unsigned CC, unsigned width,
22718 ISD::LoadExtType ExtType, int AddConstant,
22719 int CompConstant) {
22720 // By being careful about our equations and only writing them in terms of
22721 // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
22722 // make them generally applicable to all bit widths.
22723 int MaxUInt = (1 << width);
22724
22725 // For the purposes of these comparisons sign extending the type is
22726 // equivalent to zero extending the add and displacing it by half the integer
22727 // width. Provided we are careful and make sure our equations are valid over
22728 // the whole range we can just adjust the input and avoid writing equations
22729 // for sign extended inputs.
22730 if (ExtType == ISD::SEXTLOAD)
22731 AddConstant -= (1 << (width-1));
22732
22733 switch(CC) {
22734 case AArch64CC::LE:
22735 case AArch64CC::GT:
22736 if ((AddConstant == 0) ||
22737 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
22738 (AddConstant >= 0 && CompConstant < 0) ||
22739 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
22740 return true;
22741 break;
22742 case AArch64CC::LT:
22743 case AArch64CC::GE:
22744 if ((AddConstant == 0) ||
22745 (AddConstant >= 0 && CompConstant <= 0) ||
22746 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
22747 return true;
22748 break;
22749 case AArch64CC::HI:
22750 case AArch64CC::LS:
22751 if ((AddConstant >= 0 && CompConstant < 0) ||
22752 (AddConstant <= 0 && CompConstant >= -1 &&
22753 CompConstant < AddConstant + MaxUInt))
22754 return true;
22755 break;
22756 case AArch64CC::PL:
22757 case AArch64CC::MI:
22758 if ((AddConstant == 0) ||
22759 (AddConstant > 0 && CompConstant <= 0) ||
22760 (AddConstant < 0 && CompConstant <= AddConstant))
22761 return true;
22762 break;
22763 case AArch64CC::LO:
22764 case AArch64CC::HS:
22765 if ((AddConstant >= 0 && CompConstant <= 0) ||
22766 (AddConstant <= 0 && CompConstant >= 0 &&
22767 CompConstant <= AddConstant + MaxUInt))
22768 return true;
22769 break;
22770 case AArch64CC::EQ:
22771 case AArch64CC::NE:
22772 if ((AddConstant > 0 && CompConstant < 0) ||
22773 (AddConstant < 0 && CompConstant >= 0 &&
22774 CompConstant < AddConstant + MaxUInt) ||
22775 (AddConstant >= 0 && CompConstant >= 0 &&
22776 CompConstant >= AddConstant) ||
22777 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
22778 return true;
22779 break;
22780 case AArch64CC::VS:
22781 case AArch64CC::VC:
22782 case AArch64CC::AL:
22783 case AArch64CC::NV:
22784 return true;
22785 case AArch64CC::Invalid:
22786 break;
22787 }
22788
22789 return false;
22790}
22791
22792// (X & C) >u Mask --> (X & (C & ~Mask)) != 0
22793// (X & C) <u Pow2 --> (X & (C & ~(Pow2-1))) == 0
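// For example, "(X & 0xf0) >u 7" is the same as "(X & 0xf0) != 0", since
// X & 0xf0 is always a multiple of 16; the SUBS is replaced by an ANDS with
// the reduced constant and the condition code becomes NE (EQ for the
// power-of-two <u case).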
22794static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
22795 SDNode *AndNode, SelectionDAG &DAG,
22796 unsigned CCIndex, unsigned CmpIndex,
22797 unsigned CC) {
22798 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
22799 if (!SubsC)
22800 return SDValue();
22801
22802 APInt SubsAP = SubsC->getAPIntValue();
22803 if (CC == AArch64CC::HI) {
22804 if (!SubsAP.isMask())
22805 return SDValue();
22806 } else if (CC == AArch64CC::LO) {
22807 if (!SubsAP.isPowerOf2())
22808 return SDValue();
22809 } else
22810 return SDValue();
22811
22812 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
22813 if (!AndC)
22814 return SDValue();
22815
22816 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
22817
22818 SDLoc DL(N);
22819 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
22820 SDValue ANDS = DAG.getNode(
22821 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
22822 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
22823 SDValue AArch64_CC =
22824 DAG.getConstant(CC, DL,
22825 N->getOperand(CCIndex)->getValueType(0));
22826
22827 // For now, only performCSELCombine and performBRCONDCombine call this
22828 // function, and both pass 2 for CCIndex and 3 for CmpIndex on nodes with 4
22829 // operands, so just initialize the operands directly to keep the code simple.
22830 // If another caller ever passes different CCIndex/CmpIndex values, this will
22831 // need to be rewritten with a loop over the operands.
22832 // TODO: Do we need to assert that the number of operands is 4 here?
22833 assert((CCIndex == 2 && CmpIndex == 3) &&
22834 "Expected CCIndex to be 2 and CmpIndex to be 3.");
22835 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
22836 ANDS.getValue(1)};
22837 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
22838}
22839
22840static
22841SDValue performCONDCombine(SDNode *N,
22842 TargetLowering::DAGCombinerInfo &DCI,
22843 SelectionDAG &DAG, unsigned CCIndex,
22844 unsigned CmpIndex) {
22845 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
22846 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
22847 unsigned CondOpcode = SubsNode->getOpcode();
22848
22849 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0))
22850 return SDValue();
22851
22852 // There is a SUBS feeding this condition. Is it fed by a mask we can
22853 // use?
22854
22855 SDNode *AndNode = SubsNode->getOperand(0).getNode();
22856 unsigned MaskBits = 0;
22857
22858 if (AndNode->getOpcode() != ISD::AND)
22859 return SDValue();
22860
22861 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
22862 CmpIndex, CC))
22863 return Val;
22864
22865 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
22866 uint32_t CNV = CN->getZExtValue();
22867 if (CNV == 255)
22868 MaskBits = 8;
22869 else if (CNV == 65535)
22870 MaskBits = 16;
22871 }
22872
22873 if (!MaskBits)
22874 return SDValue();
22875
22876 SDValue AddValue = AndNode->getOperand(0);
22877
22878 if (AddValue.getOpcode() != ISD::ADD)
22879 return SDValue();
22880
22881 // The basic dag structure is correct, grab the inputs and validate them.
22882
22883 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
22884 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
22885 SDValue SubsInputValue = SubsNode->getOperand(1);
22886
22887 // The mask is present and the provenance of all the values is a smaller type,
22888 // let's see if the mask is superfluous.
22889
22890 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
22891 !isa<ConstantSDNode>(SubsInputValue.getNode()))
22892 return SDValue();
22893
22894 ISD::LoadExtType ExtType;
22895
22896 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
22897 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
22898 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
22899 return SDValue();
22900
22901 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
22902 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
22903 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
22904 return SDValue();
22905
22906 // The AND is not necessary, remove it.
22907
22908 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
22909 SubsNode->getValueType(1));
22910 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
22911
22912 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
22913 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
22914
22915 return SDValue(N, 0);
22916}
22917
22918// Optimize compare with zero and branch.
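// For example, a conditional branch on (SUBS x, 0) that only uses the flags:
//   (brcond NE (SUBS x, 0)) dest  -->  (CBNZ x, dest)
// and the EQ form becomes (CBZ x, dest).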
22919static SDValue performBRCONDCombine(SDNode *N,
22920 TargetLowering::DAGCombinerInfo &DCI,
22921 SelectionDAG &DAG) {
22922 MachineFunction &MF = DAG.getMachineFunction();
22923 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
22924 // will not be produced, as they are conditional branch instructions that do
22925 // not set flags.
22926 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
22927 return SDValue();
22928
22929 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
22930 N = NV.getNode();
22931 SDValue Chain = N->getOperand(0);
22932 SDValue Dest = N->getOperand(1);
22933 SDValue CCVal = N->getOperand(2);
22934 SDValue Cmp = N->getOperand(3);
22935
22936 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
22937 unsigned CC = CCVal->getAsZExtVal();
22938 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
22939 return SDValue();
22940
22941 unsigned CmpOpc = Cmp.getOpcode();
22942 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
22943 return SDValue();
22944
22945 // Only attempt folding if there is only one use of the flag and no use of the
22946 // value.
22947 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
22948 return SDValue();
22949
22950 SDValue LHS = Cmp.getOperand(0);
22951 SDValue RHS = Cmp.getOperand(1);
22952
22953 assert(LHS.getValueType() == RHS.getValueType() &&
22954 "Expected the value type to be the same for both operands!");
22955 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
22956 return SDValue();
22957
22958 if (isNullConstant(LHS))
22959 std::swap(LHS, RHS);
22960
22961 if (!isNullConstant(RHS))
22962 return SDValue();
22963
22964 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
22965 LHS.getOpcode() == ISD::SRL)
22966 return SDValue();
22967
22968 // Fold the compare into the branch instruction.
22969 SDValue BR;
22970 if (CC == AArch64CC::EQ)
22971 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
22972 else
22973 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
22974
22975 // Do not add new nodes to DAG combiner worklist.
22976 DCI.CombineTo(N, BR, false);
22977
22978 return SDValue();
22979}
22980
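// Fold a CSEL of a CTTZ against zero into a mask of the CTTZ result:
//   CSEL 0, cttz(X), eq(X, 0)  -->  cttz(X) & (bitwidth - 1)
// This works because ISD::CTTZ returns the bit width for a zero input, e.g.
// for i32: cttz(0) == 32 and 32 & 31 == 0, while for X != 0 the result is at
// most 31, so the AND leaves it unchanged.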
22981static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
22982 unsigned CC = N->getConstantOperandVal(2);
22983 SDValue SUBS = N->getOperand(3);
22984 SDValue Zero, CTTZ;
22985
22986 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
22987 Zero = N->getOperand(0);
22988 CTTZ = N->getOperand(1);
22989 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
22990 Zero = N->getOperand(1);
22991 CTTZ = N->getOperand(0);
22992 } else
22993 return SDValue();
22994
22995 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
22996 (CTTZ.getOpcode() == ISD::TRUNCATE &&
22997 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
22998 return SDValue();
22999
23000 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
23001 "Illegal type in CTTZ folding");
23002
23003 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
23004 return SDValue();
23005
23006 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
23007 ? CTTZ.getOperand(0).getOperand(0)
23008 : CTTZ.getOperand(0);
23009
23010 if (X != SUBS.getOperand(0))
23011 return SDValue();
23012
23013 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
23014 ? CTTZ.getOperand(0).getValueSizeInBits()
23015 : CTTZ.getValueSizeInBits();
23016 SDValue BitWidthMinusOne =
23017 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
23018 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
23019 BitWidthMinusOne);
23020}
23021
23022// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
23023// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
23024// Where x and y are constants and x != y
23025
23026// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
23027// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
23028// Where x and y are constants and x != y
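//
// For example, with the inner node (CSEL 1, 2, GE, cond):
//   (CSEL l, r, EQ, (CMP (CSEL 1, 2, GE, cond), 1))  =>  (CSEL l, r, GE, cond)
// because the inner CSEL equals 1 exactly when GE holds, and
//   (CSEL l, r, EQ, (CMP (CSEL 1, 2, GE, cond), 2))  =>  (CSEL l, r, LT, cond)
// because it equals 2 exactly when GE does not hold.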
23029static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
23030 SDValue L = Op->getOperand(0);
23031 SDValue R = Op->getOperand(1);
23032 AArch64CC::CondCode OpCC =
23033 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
23034
23035 SDValue OpCmp = Op->getOperand(3);
23036 if (!isCMP(OpCmp))
23037 return SDValue();
23038
23039 SDValue CmpLHS = OpCmp.getOperand(0);
23040 SDValue CmpRHS = OpCmp.getOperand(1);
23041
23042 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
23043 std::swap(CmpLHS, CmpRHS);
23044 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
23045 return SDValue();
23046
23047 SDValue X = CmpLHS->getOperand(0);
23048 SDValue Y = CmpLHS->getOperand(1);
23049 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
23050 return SDValue();
23051 }
23052
23053 // If one of the constants is an opaque constant, the x and y SDNodes can
23054 // still be distinct even when the underlying values are equal, so compare
23055 // the APInts here to make sure the code is correct.
23056 ConstantSDNode *CX = cast<ConstantSDNode>(X);
23057 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
23058 if (CX->getAPIntValue() == CY->getAPIntValue())
23059 return SDValue();
23060
23061 AArch64CC::CondCode CC =
23062 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
23063 SDValue Cond = CmpLHS->getOperand(3);
23064
23065 if (CmpRHS == Y)
23066 CC = AArch64CC::getInvertedCondCode(CC);
23067 else if (CmpRHS != X)
23068 return SDValue();
23069
23070 if (OpCC == AArch64CC::NE)
23071 CC = AArch64CC::getInvertedCondCode(CC);
23072 else if (OpCC != AArch64CC::EQ)
23073 return SDValue();
23074
23075 SDLoc DL(Op);
23076 EVT VT = Op->getValueType(0);
23077
23078 SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
23079 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
23080}
23081
23082// Optimize CSEL instructions
23083static SDValue performCSELCombine(SDNode *N,
23084 TargetLowering::DAGCombinerInfo &DCI,
23085 SelectionDAG &DAG) {
23086 // CSEL x, x, cc -> x
23087 if (N->getOperand(0) == N->getOperand(1))
23088 return N->getOperand(0);
23089
23090 if (SDValue R = foldCSELOfCSEL(N, DAG))
23091 return R;
23092
23093 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
23094 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
23095 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
23096 return Folded;
23097
23098 return performCONDCombine(N, DCI, DAG, 2, 3);
23099}
23100
23101// Try to re-use an already extended operand of a vector SetCC feeding an
23102// extended select. Doing so avoids requiring another full extension of the
23103// SET_CC result when lowering the select.
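//
// For example, if (sign_extend X to v8i16) already exists in the DAG and every
// use of (setcc v8i8 X, splat(C), setlt) is a v8i16 vselect, it is cheaper to
// compare the widened values directly, (setcc (sext X), splat(sext C), setlt),
// so lowering the selects no longer needs a second full extension of the mask.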
23104static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
23105 EVT Op0MVT = Op->getOperand(0).getValueType();
23106 if (!Op0MVT.isVector() || Op->use_empty())
23107 return SDValue();
23108
23109 // Make sure that all uses of Op are VSELECTs with matching result types where
23110 // the result type has a larger element type than the SetCC operand.
23111 SDNode *FirstUse = *Op->use_begin();
23112 if (FirstUse->getOpcode() != ISD::VSELECT)
23113 return SDValue();
23114 EVT UseMVT = FirstUse->getValueType(0);
23115 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
23116 return SDValue();
23117 if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
23118 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
23119 }))
23120 return SDValue();
23121
23122 APInt V;
23123 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
23124 return SDValue();
23125
23126 SDLoc DL(Op);
23127 SDValue Op0ExtV;
23128 SDValue Op1ExtV;
23129 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
23130 // Check if the first operand of the SET_CC is already extended. If it is,
23131 // split the SET_CC and re-use the extended version of the operand.
23132 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
23133 Op->getOperand(0));
23134 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
23135 Op->getOperand(0));
23136 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23137 Op0ExtV = SDValue(Op0SExt, 0);
23138 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
23139 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23140 Op0ExtV = SDValue(Op0ZExt, 0);
23141 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
23142 } else
23143 return SDValue();
23144
23145 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
23146 Op0ExtV, Op1ExtV, Op->getOperand(2));
23147}
23148
23149static SDValue
23151 SelectionDAG &DAG) {
23152 SDValue Vec = N->getOperand(0);
23153 if (DCI.isBeforeLegalize() &&
23154 Vec.getValueType().getVectorElementType() == MVT::i1 &&
23157 SDLoc DL(N);
23158 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
23159 DAG);
23160 }
23161
23162 return SDValue();
23163}
23164
23165static SDValue performSETCCCombine(SDNode *N,
23166 TargetLowering::DAGCombinerInfo &DCI,
23167 SelectionDAG &DAG) {
23168 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
23169 SDValue LHS = N->getOperand(0);
23170 SDValue RHS = N->getOperand(1);
23171 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
23172 SDLoc DL(N);
23173 EVT VT = N->getValueType(0);
23174
23175 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
23176 return V;
23177
23178 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
23179 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
23180 LHS->getOpcode() == AArch64ISD::CSEL &&
23181 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
23182 LHS->hasOneUse()) {
23183 // Invert CSEL's condition.
23184 auto OldCond =
23185 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
23186 auto NewCond = getInvertedCondCode(OldCond);
23187
23188 // csel 0, 1, !cond, X
23189 SDValue CSEL =
23190 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
23191 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
23192 LHS.getOperand(3));
23193 return DAG.getZExtOrTrunc(CSEL, DL, VT);
23194 }
23195
23196 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
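 // For example, for i32: ((x >> 4) != 0) is equivalent to
 // ((x & 0xfffffff0) != 0).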
23197 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
23198 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
23199 LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() &&
23200 LHS->hasOneUse()) {
23201 EVT TstVT = LHS->getValueType(0);
23202 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
23203 // this pattern will get better opt in emitComparison
23204 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
23205 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
23206 DAG.getConstant(TstImm, DL, TstVT));
23207 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
23208 }
23209 }
23210
23211 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
23212 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
23213 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
23214 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
23215 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
23216 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
23217 (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
23218 LHS->getOpcode() == ISD::BITCAST) {
23219 EVT ToVT = LHS->getValueType(0);
23220 EVT FromVT = LHS->getOperand(0).getValueType();
23221 if (FromVT.isFixedLengthVector() &&
23222 FromVT.getVectorElementType() == MVT::i1) {
23223 bool IsNull = isNullConstant(RHS);
23224 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
23225 DL, MVT::i1, LHS->getOperand(0));
23226 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
23227 LHS);
23228 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
23229 }
23230 }
23231
23232 // Try to perform the memcmp when the result is tested for [in]equality with 0
23233 if (SDValue V = performOrXorChainCombine(N, DAG))
23234 return V;
23235
23236 return SDValue();
23237}
23238
23239// Replace a flag-setting operator (eg ANDS) with the generic version
23240// (eg AND) if the flag is unused.
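// For example, (ANDS x, y) whose NZCV result has no uses becomes (AND x, y),
// and if an identical (AND x, y) already exists elsewhere in the DAG, its uses
// are combined into the value result of this flag-setting node instead.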
23243 unsigned GenericOpcode) {
23244 SDLoc DL(N);
23245 SDValue LHS = N->getOperand(0);
23246 SDValue RHS = N->getOperand(1);
23247 EVT VT = N->getValueType(0);
23248
23249 // If the flag result isn't used, convert back to a generic opcode.
23250 if (!N->hasAnyUseOfValue(1)) {
23251 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
23252 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
23253 DL);
23254 }
23255
23256 // Combine identical generic nodes into this node, re-using the result.
23257 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
23258 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
23259 DCI.CombineTo(Generic, SDValue(N, 0));
23260
23261 return SDValue();
23262}
23263
23264static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
23265 // setcc_merge_zero pred
23266 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
23267 // => extract_subvector (inner setcc_merge_zero)
23268 SDValue Pred = N->getOperand(0);
23269 SDValue LHS = N->getOperand(1);
23270 SDValue RHS = N->getOperand(2);
23271 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23272
23273 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
23274 LHS->getOpcode() != ISD::SIGN_EXTEND)
23275 return SDValue();
23276
23277 SDValue Extract = LHS->getOperand(0);
23278 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
23279 Extract->getValueType(0) != N->getValueType(0) ||
23280 Extract->getConstantOperandVal(1) != 0)
23281 return SDValue();
23282
23283 SDValue InnerSetCC = Extract->getOperand(0);
23284 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
23285 return SDValue();
23286
23287 // By this point we've effectively got
23288 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
23289 // lanes are already zero then the trunc(sext()) sequence is redundant and we
23290 // can operate on A directly.
23291 SDValue InnerPred = InnerSetCC.getOperand(0);
23292 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
23293 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
23294 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
23295 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
23296 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
23297 return Extract;
23298
23299 return SDValue();
23300}
23301
23302static SDValue
23304 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23305 "Unexpected opcode!");
23306
23307 SelectionDAG &DAG = DCI.DAG;
23308 SDValue Pred = N->getOperand(0);
23309 SDValue LHS = N->getOperand(1);
23310 SDValue RHS = N->getOperand(2);
23311 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23312
23313 if (SDValue V = performSetCCPunpkCombine(N, DAG))
23314 return V;
23315
23316 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
23317 LHS->getOpcode() == ISD::SIGN_EXTEND &&
23318 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
23319 // setcc_merge_zero(
23320 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
23321 // => setcc_merge_zero(pred, ...)
23322 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23323 LHS->getOperand(0)->getOperand(0) == Pred)
23324 return LHS->getOperand(0);
23325
23326 // setcc_merge_zero(
23327 // all_active, extend(nxvNi1 ...), != splat(0))
23328 // -> nxvNi1 ...
23329 if (isAllActivePredicate(DAG, Pred))
23330 return LHS->getOperand(0);
23331
23332 // setcc_merge_zero(
23333 // pred, extend(nxvNi1 ...), != splat(0))
23334 // -> nxvNi1 and(pred, ...)
23335 if (DCI.isAfterLegalizeDAG())
23336 // Do this after legalization to allow more folds on setcc_merge_zero
23337 // to be recognized.
23338 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
23339 LHS->getOperand(0), Pred);
23340 }
23341
23342 return SDValue();
23343}
23344
23345// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
23346// as well as whether the test should be inverted. This code is required to
23347// catch these cases (as opposed to standard dag combines) because
23348// AArch64ISD::TBZ is matched during legalization.
23349static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
23350 SelectionDAG &DAG) {
23351
23352 if (!Op->hasOneUse())
23353 return Op;
23354
23355 // We don't handle undef/constant-fold cases below, as they should have
23356 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
23357 // etc.)
23358
23359 // (tbz (trunc x), b) -> (tbz x, b)
23360 // This case is just here to enable more of the below cases to be caught.
23361 if (Op->getOpcode() == ISD::TRUNCATE &&
23362 Bit < Op->getValueType(0).getSizeInBits()) {
23363 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23364 }
23365
23366 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
23367 if (Op->getOpcode() == ISD::ANY_EXTEND &&
23368 Bit < Op->getOperand(0).getValueSizeInBits()) {
23369 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23370 }
23371
23372 if (Op->getNumOperands() != 2)
23373 return Op;
23374
23375 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
23376 if (!C)
23377 return Op;
23378
23379 switch (Op->getOpcode()) {
23380 default:
23381 return Op;
23382
23383 // (tbz (and x, m), b) -> (tbz x, b)
23384 case ISD::AND:
23385 if ((C->getZExtValue() >> Bit) & 1)
23386 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23387 return Op;
23388
23389 // (tbz (shl x, c), b) -> (tbz x, b-c)
23390 case ISD::SHL:
23391 if (C->getZExtValue() <= Bit &&
23392 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23393 Bit = Bit - C->getZExtValue();
23394 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23395 }
23396 return Op;
23397
23398 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
23399 case ISD::SRA:
23400 Bit = Bit + C->getZExtValue();
23401 if (Bit >= Op->getValueType(0).getSizeInBits())
23402 Bit = Op->getValueType(0).getSizeInBits() - 1;
23403 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23404
23405 // (tbz (srl x, c), b) -> (tbz x, b+c)
23406 case ISD::SRL:
23407 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23408 Bit = Bit + C->getZExtValue();
23409 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23410 }
23411 return Op;
23412
23413 // (tbz (xor x, -1), b) -> (tbnz x, b)
23414 case ISD::XOR:
23415 if ((C->getZExtValue() >> Bit) & 1)
23416 Invert = !Invert;
23417 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23418 }
23419}
23420
23421// Optimize test single bit zero/non-zero and branch.
23424 SelectionDAG &DAG) {
23425 unsigned Bit = N->getConstantOperandVal(2);
23426 bool Invert = false;
23427 SDValue TestSrc = N->getOperand(1);
23428 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
23429
23430 if (TestSrc == NewTestSrc)
23431 return SDValue();
23432
23433 unsigned NewOpc = N->getOpcode();
23434 if (Invert) {
23435 if (NewOpc == AArch64ISD::TBZ)
23436 NewOpc = AArch64ISD::TBNZ;
23437 else {
23438 assert(NewOpc == AArch64ISD::TBNZ);
23439 NewOpc = AArch64ISD::TBZ;
23440 }
23441 }
23442
23443 SDLoc DL(N);
23444 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
23445 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
23446}
23447
23448// Swap vselect operands where doing so may allow a predicated operation to
23449// subsume the `sel`.
23450//
23451// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
23452// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
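//
// For example,
//   (vselect (setcc setgt x y) a (fadd a b))
//     => (vselect (setcc setle x y) (fadd a b) a)
// which may later allow the fadd to be selected as a predicated FADD that
// already produces `a` in the inactive lanes.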
23453static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
23454 auto SelectA = N->getOperand(1);
23455 auto SelectB = N->getOperand(2);
23456 auto NTy = N->getValueType(0);
23457
23458 if (!NTy.isScalableVector())
23459 return SDValue();
23460 SDValue SetCC = N->getOperand(0);
23461 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
23462 return SDValue();
23463
23464 switch (SelectB.getOpcode()) {
23465 default:
23466 return SDValue();
23467 case ISD::FMUL:
23468 case ISD::FSUB:
23469 case ISD::FADD:
23470 break;
23471 }
23472 if (SelectA != SelectB.getOperand(0))
23473 return SDValue();
23474
23475 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
23476 ISD::CondCode InverseCC =
23477 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
23478 auto InverseSetCC =
23479 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
23480 SetCC.getOperand(1), InverseCC);
23481
23482 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
23483 {InverseSetCC, SelectB, SelectA});
23484}
23485
23486// vselect (v1i1 setcc) ->
23487// vselect (v1iXX setcc) (XX is the size of the compared operand type)
23488// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
23489// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
23490// such VSELECT.
23492 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
23493 return SwapResult;
23494
23495 SDValue N0 = N->getOperand(0);
23496 EVT CCVT = N0.getValueType();
23497
23498 if (isAllActivePredicate(DAG, N0))
23499 return N->getOperand(1);
23500
23501 if (isAllInactivePredicate(N0))
23502 return N->getOperand(2);
23503
23504 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
23505 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
23506 // supported types.
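 // For example, for i32 lanes: (lhs > -1) ? 1 : -1 is the same as
 // ((lhs asr 31) | 1), since the arithmetic shift yields 0 for non-negative
 // lanes and -1 (all ones) for negative lanes.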
23507 SDValue SetCC = N->getOperand(0);
23508 if (SetCC.getOpcode() == ISD::SETCC &&
23509 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
23510 SDValue CmpLHS = SetCC.getOperand(0);
23511 EVT VT = CmpLHS.getValueType();
23512 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
23513 SDNode *SplatLHS = N->getOperand(1).getNode();
23514 SDNode *SplatRHS = N->getOperand(2).getNode();
23515 APInt SplatLHSVal;
23516 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
23517 VT.isSimple() &&
23518 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
23519 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
23520 VT.getSimpleVT().SimpleTy) &&
23521 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
23522 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
23523 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
23524 unsigned NumElts = VT.getVectorNumElements();
23525 SmallVector<SDValue> Ops(
23526 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
23527 VT.getScalarType()));
23528 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
23529
23530 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
23531 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
23532 return Or;
23533 }
23534 }
23535
23536 EVT CmpVT = N0.getOperand(0).getValueType();
23537 if (N0.getOpcode() != ISD::SETCC ||
23539 CCVT.getVectorElementType() != MVT::i1 ||
23541 return SDValue();
23542
23543 EVT ResVT = N->getValueType(0);
23544 // Only combine when the result type is of the same size as the compared
23545 // operands.
23546 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
23547 return SDValue();
23548
23549 SDValue IfTrue = N->getOperand(1);
23550 SDValue IfFalse = N->getOperand(2);
23551 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
23552 N0.getOperand(0), N0.getOperand(1),
23553 cast<CondCodeSDNode>(N0.getOperand(2))->get());
23554 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
23555 IfTrue, IfFalse);
23556}
23557
23558/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
23559/// the compare-mask instructions rather than going via NZCV, even if LHS and
23560/// RHS are really scalar. This replaces any scalar setcc in the above pattern
23561/// with a vector one followed by a DUP shuffle on the result.
23564 SelectionDAG &DAG = DCI.DAG;
23565 SDValue N0 = N->getOperand(0);
23566 EVT ResVT = N->getValueType(0);
23567
23568 if (N0.getOpcode() != ISD::SETCC)
23569 return SDValue();
23570
23571 if (ResVT.isScalableVT())
23572 return SDValue();
23573
23574 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
23575 // scalar SetCCResultType. We also don't expect vectors, because we assume
23576 // that selects fed by vector SETCCs are canonicalized to VSELECT.
23577 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
23578 "Scalar-SETCC feeding SELECT has unexpected result type!");
23579
23580 // If NumMaskElts == 0, the comparison is larger than the select result. The
23581 // largest real NEON comparison is 64-bits per lane, which means the result is
23582 // at most 32-bits and an illegal vector. Just bail out for now.
23583 EVT SrcVT = N0.getOperand(0).getValueType();
23584
23585 // Don't try to do this optimization when the setcc itself has i1 operands.
23586 // There are no legal vectors of i1, so this would be pointless. v1f16 is
23587 // ruled out to prevent the creation of setcc that need to be scalarized.
23588 if (SrcVT == MVT::i1 ||
23589 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
23590 return SDValue();
23591
23592 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
23593 if (!ResVT.isVector() || NumMaskElts == 0)
23594 return SDValue();
23595
23596 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
23597 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
23598
23599 // Also bail out if the vector CCVT isn't the same size as ResVT.
23600 // This can happen if the SETCC operand size doesn't divide the ResVT size
23601 // (e.g., f64 vs v3f32).
23602 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
23603 return SDValue();
23604
23605 // Make sure we didn't create illegal types, if we're not supposed to.
23606 assert(DCI.isBeforeLegalize() ||
23607 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
23608
23609 // First perform a vector comparison, where lane 0 is the one we're interested
23610 // in.
23611 SDLoc DL(N0);
23612 SDValue LHS =
23613 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
23614 SDValue RHS =
23615 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
23616 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
23617
23618 // Now duplicate the comparison mask we want across all other lanes.
23619 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
23620 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
23621 Mask = DAG.getNode(ISD::BITCAST, DL,
23622 ResVT.changeVectorElementTypeToInteger(), Mask);
23623
23624 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
23625}
23626
23629 EVT VT = N->getValueType(0);
23630 SDLoc DL(N);
23631 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
23632 // 128bit vector version.
23633 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
23634 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
23635 SmallVector<SDValue> Ops(N->ops());
23636 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
23637 DCI.DAG.getVTList(LVT), Ops)) {
23638 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
23639 DCI.DAG.getConstant(0, DL, MVT::i64));
23640 }
23641 }
23642
23643 if (N->getOpcode() == AArch64ISD::DUP) {
23644 if (DCI.isAfterLegalizeDAG()) {
23645 // If scalar dup's operand is extract_vector_elt, try to combine them into
23646 // duplane. For example,
23647 //
23648 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
23649 // t18: v4i32 = AArch64ISD::DUP t21
23650 // ==>
23651 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
23652 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
23653 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23654 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
23655 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
23656 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
23657 EXTRACT_VEC_ELT.getOperand(1));
23658 }
23659 }
23660 }
23661
23662 return performPostLD1Combine(N, DCI, false);
23663 }
23664
23665 return SDValue();
23666}
23667
23668/// Get rid of unnecessary NVCASTs (that don't change the type).
23670 if (N->getValueType(0) == N->getOperand(0).getValueType())
23671 return N->getOperand(0);
23672 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
23673 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
23674 N->getOperand(0).getOperand(0));
23675
23676 return SDValue();
23677}
23678
23679// If all users of the globaladdr are of the form (globaladdr + constant), find
23680// the smallest constant, fold it into the globaladdr's offset and rewrite the
23681// globaladdr as (globaladdr + constant) - constant.
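//
// For example, if g is only ever used as (g + 8) and (g + 12), the global is
// rewritten as ((globaladdr g, offset 8) - 8); the adds can then fold so the
// first use becomes the addressed global itself and the second a (+ 4) from it.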
23682static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
23683 const AArch64Subtarget *Subtarget,
23684 const TargetMachine &TM) {
23685 auto *GN = cast<GlobalAddressSDNode>(N);
23686 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
23687 AArch64II::MO_NO_FLAG)
23688 return SDValue();
23689
23690 uint64_t MinOffset = -1ull;
23691 for (SDNode *N : GN->uses()) {
23692 if (N->getOpcode() != ISD::ADD)
23693 return SDValue();
23694 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
23695 if (!C)
23696 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
23697 if (!C)
23698 return SDValue();
23699 MinOffset = std::min(MinOffset, C->getZExtValue());
23700 }
23701 uint64_t Offset = MinOffset + GN->getOffset();
23702
23703 // Require that the new offset is larger than the existing one. Otherwise, we
23704 // can end up oscillating between two possible DAGs, for example,
23705 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
23706 if (Offset <= uint64_t(GN->getOffset()))
23707 return SDValue();
23708
23709 // Check whether folding this offset is legal. It must not go out of bounds of
23710 // the referenced object to avoid violating the code model, and must be
23711 // smaller than 2^20 because this is the largest offset expressible in all
23712 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
23713 // stores an immediate signed 21 bit offset.)
23714 //
23715 // This check also prevents us from folding negative offsets, which will end
23716 // up being treated in the same way as large positive ones. They could also
23717 // cause code model violations, and aren't really common enough to matter.
23718 if (Offset >= (1 << 20))
23719 return SDValue();
23720
23721 const GlobalValue *GV = GN->getGlobal();
23722 Type *T = GV->getValueType();
23723 if (!T->isSized() ||
23725 return SDValue();
23726
23727 SDLoc DL(GN);
23728 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
23729 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
23730 DAG.getConstant(MinOffset, DL, MVT::i64));
23731}
23732
23734 const AArch64Subtarget *Subtarget) {
23735 SDValue BR = N->getOperand(0);
23736 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
23737 !BR.getValueType().isScalarInteger())
23738 return SDValue();
23739
23740 SDLoc DL(N);
23741 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
23742}
23743
23744// Turns the vector of indices into a vector of byte offsets by scaling Offset
23745// by (BitWidth / 8).
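// For example, with 32-bit elements the indices <0, 1, 2, 3> become the byte
// offsets <0, 4, 8, 12>, i.e. a left shift by log2(32 / 8) == 2.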
23747 SDLoc DL, unsigned BitWidth) {
23748 assert(Offset.getValueType().isScalableVector() &&
23749 "This method is only for scalable vectors of offsets");
23750
23751 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
23752 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
23753
23754 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
23755}
23756
23757/// Check if the value of \p OffsetInBytes can be used as an immediate for
23758/// the gather load/prefetch and scatter store instructions with vector base and
23759/// immediate offset addressing mode:
23760///
23761/// [<Zn>.[S|D]{, #<imm>}]
23762///
23763/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
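///
/// For example, for 32-bit elements (4 bytes) the valid immediates are
/// 0, 4, 8, ..., 124, and for 64-bit elements (8 bytes) they are 0, 8, ..., 248.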
23764inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
23765 unsigned ScalarSizeInBytes) {
23766 // The immediate is not a multiple of the scalar size.
23767 if (OffsetInBytes % ScalarSizeInBytes)
23768 return false;
23769
23770 // The immediate is out of range.
23771 if (OffsetInBytes / ScalarSizeInBytes > 31)
23772 return false;
23773
23774 return true;
23775}
23776
23777/// Check if the value of \p Offset represents a valid immediate for the SVE
23778/// gather load/prefetch and scatter store instructions with vector base and
23779/// immediate offset addressing mode:
23780///
23781/// [<Zn>.[S|D]{, #<imm>}]
23782///
23783/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
23784inline static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
23785 unsigned ScalarSizeInBytes) {
23786 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
23787 return OffsetConst && isValidImmForSVEVecImmAddrMode(
23788 OffsetConst->getZExtValue(), ScalarSizeInBytes);
23789}
23790
23792 unsigned Opcode,
23793 bool OnlyPackedOffsets = true) {
23794 const SDValue Src = N->getOperand(2);
23795 const EVT SrcVT = Src->getValueType(0);
23796 assert(SrcVT.isScalableVector() &&
23797 "Scatter stores are only possible for SVE vectors");
23798
23799 SDLoc DL(N);
23800 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
23801
23802 // Make sure that source data will fit into an SVE register
23804 return SDValue();
23805
23806 // For FPs, ACLE only supports _packed_ single and double precision types.
23807 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
23808 if (SrcElVT.isFloatingPoint())
23809 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
23810 ((Opcode != AArch64ISD::SST1Q_PRED &&
23811 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
23812 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
23813 return SDValue();
23814
23815 // Depending on the addressing mode, this is either a pointer or a vector of
23816 // pointers (that fits into one register)
23817 SDValue Base = N->getOperand(4);
23818 // Depending on the addressing mode, this is either a single offset or a
23819 // vector of offsets (that fits into one register)
23820 SDValue Offset = N->getOperand(5);
23821
23822 // For "scalar + vector of indices", just scale the indices. This only
23823 // applies to non-temporal scatters because there's no instruction that takes
23824 // indices.
23825 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
23826 Offset =
23828 Opcode = AArch64ISD::SSTNT1_PRED;
23829 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
23830 Offset =
23832 Opcode = AArch64ISD::SST1Q_PRED;
23833 }
23834
23835 // In the case of non-temporal scatter stores there's only one SVE instruction
23836 // per data-size: "scalar + vector", i.e.
23837 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
23838 // Since we do have intrinsics that allow the arguments to be in a different
23839 // order, we may need to swap them to match the spec.
23840 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
23841 Offset.getValueType().isVector())
23842 std::swap(Base, Offset);
23843
23844 // SST1_IMM requires that the offset is an immediate that is:
23845 // * a multiple of #SizeInBytes,
23846 // * in the range [0, 31 x #SizeInBytes],
23847 // where #SizeInBytes is the size in bytes of the stored items. For
23848 // immediates outside that range and non-immediate scalar offsets use SST1 or
23849 // SST1_UXTW instead.
23850 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
23851 if (!isValidImmForSVEVecImmAddrMode(Offset,
23852 SrcVT.getScalarSizeInBits() / 8)) {
23853 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23854 Opcode = AArch64ISD::SST1_UXTW_PRED;
23855 else
23856 Opcode = AArch64ISD::SST1_PRED;
23857
23859 }
23860 }
23861
23862 auto &TLI = DAG.getTargetLoweringInfo();
23863 if (!TLI.isTypeLegal(Base.getValueType()))
23864 return SDValue();
23865
23866 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
23867 // vectors. These are implicitly sign- (sxtw) or zero- (zxtw) extended to
23868 // nxv2i64. Legalize accordingly.
23869 if (!OnlyPackedOffsets &&
23870 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23871 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23872
23873 if (!TLI.isTypeLegal(Offset.getValueType()))
23874 return SDValue();
23875
23876 // Source value type that is representable in hardware
23877 EVT HwSrcVt = getSVEContainerType(SrcVT);
23878
23879 // Keep the original type of the input data to store - this is needed to be
23880 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
23881 // FP values we want the integer equivalent, so just use HwSrcVt.
23882 SDValue InputVT = DAG.getValueType(SrcVT);
23883 if (SrcVT.isFloatingPoint())
23884 InputVT = DAG.getValueType(HwSrcVt);
23885
23886 SDVTList VTs = DAG.getVTList(MVT::Other);
23887 SDValue SrcNew;
23888
23889 if (Src.getValueType().isFloatingPoint())
23890 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
23891 else
23892 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
23893
23894 SDValue Ops[] = {N->getOperand(0), // Chain
23895 SrcNew,
23896 N->getOperand(3), // Pg
23897 Base,
23898 Offset,
23899 InputVT};
23900
23901 return DAG.getNode(Opcode, DL, VTs, Ops);
23902}
23903
23905 unsigned Opcode,
23906 bool OnlyPackedOffsets = true) {
23907 const EVT RetVT = N->getValueType(0);
23908 assert(RetVT.isScalableVector() &&
23909 "Gather loads are only possible for SVE vectors");
23910
23911 SDLoc DL(N);
23912
23913 // Make sure that the loaded data will fit into an SVE register
23915 return SDValue();
23916
23917 // Depending on the addressing mode, this is either a pointer or a vector of
23918 // pointers (that fits into one register)
23919 SDValue Base = N->getOperand(3);
23920 // Depending on the addressing mode, this is either a single offset or a
23921 // vector of offsets (that fits into one register)
23922 SDValue Offset = N->getOperand(4);
23923
23924 // For "scalar + vector of indices", scale the indices to obtain unscaled
23925 // offsets. This applies to non-temporal and quadword gathers, which do not
23926 // have an addressing mode with scaled offset.
23929 RetVT.getScalarSizeInBits());
23931 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
23933 RetVT.getScalarSizeInBits());
23935 }
23936
23937 // In the case of non-temporal gather loads and quadword gather loads there's
23938 // only one addressing mode : "vector + scalar", e.g.
23939 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
23940 // Since we do have intrinsics that allow the arguments to be in a different
23941 // order, we may need to swap them to match the spec.
23942 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
23943 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
23944 Offset.getValueType().isVector())
23945 std::swap(Base, Offset);
23946
23947 // GLD{FF}1_IMM requires that the offset is an immediate that is:
23948 // * a multiple of #SizeInBytes,
23949 // * in the range [0, 31 x #SizeInBytes],
23950 // where #SizeInBytes is the size in bytes of the loaded items. For
23951 // immediates outside that range and non-immediate scalar offsets use
23952 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
23953 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
23954 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
23955 if (!isValidImmForSVEVecImmAddrMode(Offset,
23956 RetVT.getScalarSizeInBits() / 8)) {
23957 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23958 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23959 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
23960 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
23961 else
23962 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23963 ? AArch64ISD::GLD1_MERGE_ZERO
23964 : AArch64ISD::GLDFF1_MERGE_ZERO;
23965
23966 std::swap(Base, Offset);
23967 }
23968 }
23969
23970 auto &TLI = DAG.getTargetLoweringInfo();
23971 if (!TLI.isTypeLegal(Base.getValueType()))
23972 return SDValue();
23973
23974 // Some gather load variants allow unpacked offsets, but only as nxv2i32
23975 // vectors. These are implicitly sign- (sxtw) or zero- (zxtw) extended to
23976 // nxv2i64. Legalize accordingly.
23977 if (!OnlyPackedOffsets &&
23978 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23979 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23980
23981 // Return value type that is representable in hardware
23982 EVT HwRetVt = getSVEContainerType(RetVT);
23983
23984 // Keep the original output value type around - this is needed to be able to
23985 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
23986 // values we want the integer equivalent, so just use HwRetVT.
23987 SDValue OutVT = DAG.getValueType(RetVT);
23988 if (RetVT.isFloatingPoint())
23989 OutVT = DAG.getValueType(HwRetVt);
23990
23991 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
23992 SDValue Ops[] = {N->getOperand(0), // Chain
23993 N->getOperand(2), // Pg
23994 Base, Offset, OutVT};
23995
23996 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
23997 SDValue LoadChain = SDValue(Load.getNode(), 1);
23998
23999 if (RetVT.isInteger() && (RetVT != HwRetVt))
24000 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
24001
24002 // If the original return value was FP, bitcast accordingly. Doing it here
24003 // means that we can avoid adding TableGen patterns for FPs.
24004 if (RetVT.isFloatingPoint())
24005 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
24006
24007 return DAG.getMergeValues({Load, LoadChain}, DL);
24008}
24009
24010static SDValue
24012 SelectionDAG &DAG) {
24013 SDLoc DL(N);
24014 SDValue Src = N->getOperand(0);
24015 unsigned Opc = Src->getOpcode();
24016
24017 // Sign extend of an unsigned unpack -> signed unpack
24018 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
24019
24020 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
24022
24023 // Push the sign extend to the operand of the unpack
24024 // This is necessary where, for example, the operand of the unpack
24025 // is another unpack:
24026 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
24027 // ->
24028 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
24029 // ->
24030 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
24031 SDValue ExtOp = Src->getOperand(0);
24032 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
24033 EVT EltTy = VT.getVectorElementType();
24034 (void)EltTy;
24035
24036 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
24037 "Sign extending from an invalid type");
24038
24039 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
24040
24042 ExtOp, DAG.getValueType(ExtVT));
24043
24044 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
24045 }
24046
24047 if (DCI.isBeforeLegalizeOps())
24048 return SDValue();
24049
24051 return SDValue();
24052
24053 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
24054 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
24055 unsigned NewOpc;
24056 unsigned MemVTOpNum = 4;
24057 switch (Opc) {
24060 MemVTOpNum = 3;
24061 break;
24064 MemVTOpNum = 3;
24065 break;
24068 MemVTOpNum = 3;
24069 break;
24072 break;
24075 break;
24078 break;
24081 break;
24084 break;
24087 break;
24090 break;
24093 break;
24096 break;
24099 break;
24102 break;
24105 break;
24108 break;
24111 break;
24114 break;
24115 default:
24116 return SDValue();
24117 }
24118
24119 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
24120 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
24121
24122 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
24123 return SDValue();
24124
24125 EVT DstVT = N->getValueType(0);
24126 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
24127
24129 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
24130 Ops.push_back(Src->getOperand(I));
24131
24132 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
24133 DCI.CombineTo(N, ExtLoad);
24134 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
24135
24136 // Return N so it doesn't get rechecked
24137 return SDValue(N, 0);
24138}
24139
24140/// Legalize the gather prefetch (scalar + vector addressing mode) when the
24141/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
24142/// != nxv2i32) do not need legalization.
24144 const unsigned OffsetPos = 4;
24145 SDValue Offset = N->getOperand(OffsetPos);
24146
24147 // Not an unpacked vector, bail out.
24148 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
24149 return SDValue();
24150
24151 // Extend the unpacked offset vector to 64-bit lanes.
24152 SDLoc DL(N);
24153 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
24154 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24155 // Replace the offset operand with the 64-bit one.
24156 Ops[OffsetPos] = Offset;
24157
24158 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24159}
24160
24161/// Combines a node carrying the intrinsic
24162/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
24163/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
24164/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
24165/// sve gather prefetch instruction with vector plus immediate addressing mode.
24167 unsigned ScalarSizeInBytes) {
24168 const unsigned ImmPos = 4, OffsetPos = 3;
24169 // No need to combine the node if the immediate is valid...
24170 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
24171 return SDValue();
24172
24173 // ...otherwise swap the offset base with the offset...
24174 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24175 std::swap(Ops[ImmPos], Ops[OffsetPos]);
24176 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
24177 // `aarch64_sve_prfb_gather_uxtw_index`.
24178 SDLoc DL(N);
24179 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
24180 MVT::i64);
24181
24182 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24183}
24184
24185// Return true if the vector operation can guarantee only the first lane of its
24186// result contains data, with all bits in other lanes set to zero.
24188 switch (Op.getOpcode()) {
24189 default:
24190 return false;
24206 return true;
24207 }
24208}
24209
24211 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
24212 SDValue InsertVec = N->getOperand(0);
24213 SDValue InsertElt = N->getOperand(1);
24214 SDValue InsertIdx = N->getOperand(2);
24215
24216 // We only care about inserts into the first element...
24217 if (!isNullConstant(InsertIdx))
24218 return SDValue();
24219 // ...of a zero'd vector...
24221 return SDValue();
24222 // ...where the inserted data was previously extracted...
24223 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24224 return SDValue();
24225
24226 SDValue ExtractVec = InsertElt.getOperand(0);
24227 SDValue ExtractIdx = InsertElt.getOperand(1);
24228
24229 // ...from the first element of a vector.
24230 if (!isNullConstant(ExtractIdx))
24231 return SDValue();
24232
24233 // If we get here we are effectively trying to zero lanes 1-N of a vector.
24234
24235 // Ensure there's no type conversion going on.
24236 if (N->getValueType(0) != ExtractVec.getValueType())
24237 return SDValue();
24238
24239 if (!isLanes1toNKnownZero(ExtractVec))
24240 return SDValue();
24241
24242 // The explicit zeroing is redundant.
24243 return ExtractVec;
24244}
24245
24246static SDValue
24249 return Res;
24250
24251 return performPostLD1Combine(N, DCI, true);
24252}
24253
24255 EVT Ty = N->getValueType(0);
24256 if (Ty.isInteger())
24257 return SDValue();
24258
24261 if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
24263 return SDValue();
24264
24265 SDLoc DL(N);
24266 SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
24267 DL, ExtIntTy);
24268 SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
24269 DL, ExtIntTy);
24270 SDValue Idx = N->getOperand(2);
24271 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
24272 SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
24273 return DAG.getBitcast(Ty, Trunc);
24274}
24275
24278 const AArch64Subtarget *Subtarget) {
24279 SDValue N0 = N->getOperand(0);
24280 EVT VT = N->getValueType(0);
24281
24282 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
24283 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
24284 return SDValue();
24285
24286 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
24287 EVT EltVT = VT.getVectorElementType();
24288 return EltVT == MVT::f32 || EltVT == MVT::f64;
24289 };
24290
24291 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
24292 // We purposefully don't care about legality of the nodes here as we know
24293 // they can be split down into something legal.
24294 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
24295 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
24296 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
24297 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
24298 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
24299 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
24300 LN0->getChain(), LN0->getBasePtr(),
24301 N0.getValueType(), LN0->getMemOperand());
24302 DCI.CombineTo(N, ExtLoad);
24303 DCI.CombineTo(
24304 N0.getNode(),
24305 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
24306 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
24307 ExtLoad.getValue(1));
24308 return SDValue(N, 0); // Return N so it doesn't get rechecked!
24309 }
24310
24311 return SDValue();
24312}
24313
24315 const AArch64Subtarget *Subtarget) {
24316 EVT VT = N->getValueType(0);
24317
24318 // Don't expand for NEON, SVE2 or SME
24319 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
24320 return SDValue();
24321
24322 SDLoc DL(N);
24323
24324 SDValue Mask = N->getOperand(0);
24325 SDValue In1 = N->getOperand(1);
24326 SDValue In2 = N->getOperand(2);
24327
24328 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
24329 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
24330 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
24331 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
24332}
24333
24335 EVT VT = N->getValueType(0);
24336
24337 SDValue Insert = N->getOperand(0);
24338 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
24339 return SDValue();
24340
24341 if (!Insert.getOperand(0).isUndef())
24342 return SDValue();
24343
24344 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
24345 uint64_t IdxDupLane = N->getConstantOperandVal(1);
24346 if (IdxInsert != 0 || IdxDupLane != 0)
24347 return SDValue();
24348
24349 SDValue Bitcast = Insert.getOperand(1);
24350 if (Bitcast.getOpcode() != ISD::BITCAST)
24351 return SDValue();
24352
24353 SDValue Subvec = Bitcast.getOperand(0);
24354 EVT SubvecVT = Subvec.getValueType();
24355 if (!SubvecVT.is128BitVector())
24356 return SDValue();
24357 EVT NewSubvecVT =
24359
24360 SDLoc DL(N);
24361 SDValue NewInsert =
24362 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
24363 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
24364 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
24365 NewInsert, N->getOperand(1));
24366 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
24367}
24368
24369// Try to combine mull with uzp1.
24372 SelectionDAG &DAG) {
24373 if (DCI.isBeforeLegalizeOps())
24374 return SDValue();
24375
24376 SDValue LHS = N->getOperand(0);
24377 SDValue RHS = N->getOperand(1);
24378
24379 SDValue ExtractHigh;
24380 SDValue ExtractLow;
24381 SDValue TruncHigh;
24382 SDValue TruncLow;
24383 SDLoc DL(N);
24384
24385 // Check the operands are trunc and extract_high.
24387 RHS.getOpcode() == ISD::TRUNCATE) {
24388 TruncHigh = RHS;
24389 if (LHS.getOpcode() == ISD::BITCAST)
24390 ExtractHigh = LHS.getOperand(0);
24391 else
24392 ExtractHigh = LHS;
24394 LHS.getOpcode() == ISD::TRUNCATE) {
24395 TruncHigh = LHS;
24396 if (LHS.getOpcode() == ISD::BITCAST)
24397 ExtractHigh = RHS.getOperand(0);
24398 else
24399 ExtractHigh = RHS;
24400 } else
24401 return SDValue();
24402
24403 // If the truncate's operand is a BUILD_VECTOR with DUP (or any splat), do not
24404 // combine the op with uzp1, as that regresses
24405 // test/CodeGen/AArch64/aarch64-smull.ll.
24406 SDValue TruncHighOp = TruncHigh.getOperand(0);
24407 EVT TruncHighOpVT = TruncHighOp.getValueType();
24408 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
24409 DAG.isSplatValue(TruncHighOp, false))
24410 return SDValue();
24411
24412 // Check there is other extract_high with same source vector.
24413 // For example,
24414 //
24415 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
24416 // t12: v4i16 = truncate t11
24417 // t31: v4i32 = AArch64ISD::SMULL t18, t12
24418 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
24419 // t16: v4i16 = truncate t15
24420 // t30: v4i32 = AArch64ISD::SMULL t23, t1
24421 //
24422 // This dagcombine assumes the two extract_high nodes use the same source
24423 // vector in order to detect the pair of MULLs. If they use different source
24424 // vectors, this code will not work.
24425 bool HasFoundMULLow = true;
24426 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
24427 if (ExtractHighSrcVec->use_size() != 2)
24428 HasFoundMULLow = false;
24429
24430 // Find ExtractLow.
24431 for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) {
24432 if (User == ExtractHigh.getNode())
24433 continue;
24434
24435 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
24437 HasFoundMULLow = false;
24438 break;
24439 }
24440
24441 ExtractLow.setNode(User);
24442 }
24443
24444 if (!ExtractLow || !ExtractLow->hasOneUse())
24445 HasFoundMULLow = false;
24446
24447 // Check ExtractLow's user.
24448 if (HasFoundMULLow) {
24449 SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin();
24450 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
24451 HasFoundMULLow = false;
24452 } else {
24453 if (ExtractLowUser->getOperand(0) == ExtractLow) {
24454 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
24455 TruncLow = ExtractLowUser->getOperand(1);
24456 else
24457 HasFoundMULLow = false;
24458 } else {
24459 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
24460 TruncLow = ExtractLowUser->getOperand(0);
24461 else
24462 HasFoundMULLow = false;
24463 }
24464 }
24465 }
24466
24467 // If the truncate's operand is a DUP or another splat value, do not combine
24468 // the op with uzp1; doing so causes regressions in
24469 // test/CodeGen/AArch64/aarch64-smull.ll.
24470 EVT TruncHighVT = TruncHigh.getValueType();
24471 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
24472 SDValue TruncLowOp =
24473 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
24474 EVT TruncLowOpVT = TruncLowOp.getValueType();
24475 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
24476 DAG.isSplatValue(TruncLowOp, false)))
24477 return SDValue();
24478
24479 // Create uzp1, extract_high and extract_low.
24480 if (TruncHighOpVT != UZP1VT)
24481 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
24482 if (TruncLowOpVT != UZP1VT)
24483 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
24484
24485 SDValue UZP1 =
24486 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
24487 SDValue HighIdxCst =
24488 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
24489 SDValue NewTruncHigh =
24490 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
24491 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
24492
24493 if (HasFoundMULLow) {
24494 EVT TruncLowVT = TruncLow.getValueType();
24495 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
24496 UZP1, ExtractLow.getOperand(1));
24497 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
24498 }
24499
24500 return SDValue(N, 0);
24501}
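// Editor's note (illustrative, not part of the original source): the net effect of
// tryCombineMULLWithUZP1 is to replace the two independent truncates feeding a
// mull pair with a single UZP1 whose halves are then re-extracted, roughly
//
//   mull(extract_high(t2), truncate(a))         uzp1(a', b')
//   mull(extract_low (t2), truncate(b))   ==>   mull(extract_high(t2), extract_high(uzp1))
//                                               mull(extract_low (t2), extract_low (uzp1))
//
// If only the high half is found, the low half of the UZP1 is filled with undef.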
24502
24505 SelectionDAG &DAG) {
24506 if (SDValue Val =
24508 return Val;
24509
24510 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
24511 return Val;
24512
24513 return SDValue();
24514}
24515
24516static SDValue
24518 SelectionDAG &DAG) {
24519 // Let's do the transform below.
24520 //
24521 // t34: v4i32 = AArch64ISD::UADDLV t2
24522 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
24523 // t7: i64 = zero_extend t35
24524 // t20: v1i64 = scalar_to_vector t7
24525 // ==>
24526 // t34: v4i32 = AArch64ISD::UADDLV t2
24527 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
24528 // t40: v1i64 = AArch64ISD::NVCAST t39
24529 if (DCI.isBeforeLegalizeOps())
24530 return SDValue();
24531
24532 EVT VT = N->getValueType(0);
24533 if (VT != MVT::v1i64)
24534 return SDValue();
24535
24536 SDValue ZEXT = N->getOperand(0);
24537 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
24538 return SDValue();
24539
24540 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
24541 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
24542 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
24543 return SDValue();
24544
24545 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
24546 return SDValue();
24547
24548 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
24549 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
24550 UADDLV.getValueType() != MVT::v4i32 ||
24551 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
24552 return SDValue();
24553
24554 // Let's generate a new sequence with AArch64ISD::NVCAST.
24555 SDLoc DL(N);
24556 SDValue EXTRACT_SUBVEC =
24557 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
24558 DAG.getConstant(0, DL, MVT::i64));
24559 SDValue NVCAST =
24560 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
24561
24562 return NVCAST;
24563}
24564
24566 DAGCombinerInfo &DCI) const {
24567 SelectionDAG &DAG = DCI.DAG;
24568 switch (N->getOpcode()) {
24569 default:
24570 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
24571 break;
24572 case ISD::VECREDUCE_AND:
24573 case ISD::VECREDUCE_OR:
24574 case ISD::VECREDUCE_XOR:
24575 return performVecReduceBitwiseCombine(N, DCI, DAG);
24576 case ISD::ADD:
24577 case ISD::SUB:
24578 return performAddSubCombine(N, DCI);
24579 case ISD::BUILD_VECTOR:
24580 return performBuildVectorCombine(N, DCI, DAG);
24581 case ISD::TRUNCATE:
24582 return performTruncateCombine(N, DAG);
24583 case AArch64ISD::ANDS:
24584 return performFlagSettingCombine(N, DCI, ISD::AND);
24585 case AArch64ISD::ADC:
24586 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
24587 return R;
24588 return foldADCToCINC(N, DAG);
24589 case AArch64ISD::SBC:
24590 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
24591 case AArch64ISD::ADCS:
24592 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
24593 return R;
24595 case AArch64ISD::SBCS:
24596 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
24597 return R;
24599 case AArch64ISD::BICi: {
24601 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
24602 APInt DemandedElts =
24603 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
24604
24606 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
24607 return SDValue();
24608
24609 break;
24610 }
24611 case ISD::XOR:
24612 return performXorCombine(N, DAG, DCI, Subtarget);
24613 case ISD::MUL:
24614 return performMulCombine(N, DAG, DCI, Subtarget);
24615 case ISD::SINT_TO_FP:
24616 case ISD::UINT_TO_FP:
24617 return performIntToFpCombine(N, DAG, Subtarget);
24618 case ISD::FP_TO_SINT:
24619 case ISD::FP_TO_UINT:
24622 return performFpToIntCombine(N, DAG, DCI, Subtarget);
24623 case ISD::FDIV:
24624 return performFDivCombine(N, DAG, DCI, Subtarget);
24625 case ISD::OR:
24626 return performORCombine(N, DCI, Subtarget, *this);
24627 case ISD::AND:
24628 return performANDCombine(N, DCI);
24629 case ISD::FADD:
24630 return performFADDCombine(N, DCI);
24632 return performIntrinsicCombine(N, DCI, Subtarget);
24633 case ISD::ANY_EXTEND:
24634 case ISD::ZERO_EXTEND:
24635 case ISD::SIGN_EXTEND:
24636 return performExtendCombine(N, DCI, DAG);
24638 return performSignExtendInRegCombine(N, DCI, DAG);
24640 return performConcatVectorsCombine(N, DCI, DAG);
24642 return performExtractSubvectorCombine(N, DCI, DAG);
24644 return performInsertSubvectorCombine(N, DCI, DAG);
24645 case ISD::SELECT:
24646 return performSelectCombine(N, DCI);
24647 case ISD::VSELECT:
24648 return performVSelectCombine(N, DCI.DAG);
24649 case ISD::SETCC:
24650 return performSETCCCombine(N, DCI, DAG);
24651 case ISD::LOAD:
24652 return performLOADCombine(N, DCI, DAG, Subtarget);
24653 case ISD::STORE:
24654 return performSTORECombine(N, DCI, DAG, Subtarget);
24655 case ISD::MSTORE:
24656 return performMSTORECombine(N, DCI, DAG, Subtarget);
24657 case ISD::MGATHER:
24658 case ISD::MSCATTER:
24659 return performMaskedGatherScatterCombine(N, DCI, DAG);
24660 case ISD::VECTOR_SPLICE:
24661 return performSVESpliceCombine(N, DAG);
24662 case ISD::FP_EXTEND:
24663 return performFPExtendCombine(N, DAG, DCI, Subtarget);
24664 case AArch64ISD::BRCOND:
24665 return performBRCONDCombine(N, DCI, DAG);
24666 case AArch64ISD::TBNZ:
24667 case AArch64ISD::TBZ:
24668 return performTBZCombine(N, DCI, DAG);
24669 case AArch64ISD::CSEL:
24670 return performCSELCombine(N, DCI, DAG);
24671 case AArch64ISD::DUP:
24676 return performDUPCombine(N, DCI);
24678 return performDupLane128Combine(N, DAG);
24679 case AArch64ISD::NVCAST:
24680 return performNVCASTCombine(N, DAG);
24681 case AArch64ISD::SPLICE:
24682 return performSpliceCombine(N, DAG);
24685 return performUnpackCombine(N, DAG, Subtarget);
24686 case AArch64ISD::UZP1:
24687 return performUzpCombine(N, DAG, Subtarget);
24689 return performSetccMergeZeroCombine(N, DCI);
24706 return performGLD1Combine(N, DAG);
24707 case AArch64ISD::VASHR:
24708 case AArch64ISD::VLSHR:
24709 return performVectorShiftCombine(N, *this, DCI);
24711 return performSunpkloCombine(N, DAG);
24712 case AArch64ISD::BSP:
24713 return performBSPExpandForSVE(N, DAG, Subtarget);
24715 return performInsertVectorEltCombine(N, DCI);
24717 return performExtractVectorEltCombine(N, DCI, Subtarget);
24718 case ISD::VECREDUCE_ADD:
24719 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
24720 case AArch64ISD::UADDV:
24721 return performUADDVCombine(N, DAG);
24722 case AArch64ISD::SMULL:
24723 case AArch64ISD::UMULL:
24724 case AArch64ISD::PMULL:
24725 return performMULLCombine(N, DCI, DAG);
24728 switch (N->getConstantOperandVal(1)) {
24729 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
24730 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
24731 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
24732 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
24733 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
24734 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
24735 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
24736 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
24737 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
24738 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
24739 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
24740 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
24741 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
24742 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
24743 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
24744 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
24746 case Intrinsic::aarch64_neon_ld2:
24747 case Intrinsic::aarch64_neon_ld3:
24748 case Intrinsic::aarch64_neon_ld4:
24749 case Intrinsic::aarch64_neon_ld1x2:
24750 case Intrinsic::aarch64_neon_ld1x3:
24751 case Intrinsic::aarch64_neon_ld1x4:
24752 case Intrinsic::aarch64_neon_ld2lane:
24753 case Intrinsic::aarch64_neon_ld3lane:
24754 case Intrinsic::aarch64_neon_ld4lane:
24755 case Intrinsic::aarch64_neon_ld2r:
24756 case Intrinsic::aarch64_neon_ld3r:
24757 case Intrinsic::aarch64_neon_ld4r:
24758 case Intrinsic::aarch64_neon_st2:
24759 case Intrinsic::aarch64_neon_st3:
24760 case Intrinsic::aarch64_neon_st4:
24761 case Intrinsic::aarch64_neon_st1x2:
24762 case Intrinsic::aarch64_neon_st1x3:
24763 case Intrinsic::aarch64_neon_st1x4:
24764 case Intrinsic::aarch64_neon_st2lane:
24765 case Intrinsic::aarch64_neon_st3lane:
24766 case Intrinsic::aarch64_neon_st4lane:
24767 return performNEONPostLDSTCombine(N, DCI, DAG);
24768 case Intrinsic::aarch64_sve_ldnt1:
24769 return performLDNT1Combine(N, DAG);
24770 case Intrinsic::aarch64_sve_ld1rq:
24771 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
24772 case Intrinsic::aarch64_sve_ld1ro:
24773 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
24774 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
24776 case Intrinsic::aarch64_sve_ldnt1_gather:
24778 case Intrinsic::aarch64_sve_ldnt1_gather_index:
24779 return performGatherLoadCombine(N, DAG,
24781 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
24783 case Intrinsic::aarch64_sve_ld1:
24785 case Intrinsic::aarch64_sve_ldnf1:
24787 case Intrinsic::aarch64_sve_ldff1:
24789 case Intrinsic::aarch64_sve_st1:
24790 return performST1Combine(N, DAG);
24791 case Intrinsic::aarch64_sve_stnt1:
24792 return performSTNT1Combine(N, DAG);
24793 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
24795 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
24797 case Intrinsic::aarch64_sve_stnt1_scatter:
24799 case Intrinsic::aarch64_sve_stnt1_scatter_index:
24801 case Intrinsic::aarch64_sve_ld1_gather:
24803 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
24804 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
24806 case Intrinsic::aarch64_sve_ld1q_gather_index:
24807 return performGatherLoadCombine(N, DAG,
24809 case Intrinsic::aarch64_sve_ld1_gather_index:
24810 return performGatherLoadCombine(N, DAG,
24812 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
24814 /*OnlyPackedOffsets=*/false);
24815 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
24817 /*OnlyPackedOffsets=*/false);
24818 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
24819 return performGatherLoadCombine(N, DAG,
24821 /*OnlyPackedOffsets=*/false);
24822 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
24823 return performGatherLoadCombine(N, DAG,
24825 /*OnlyPackedOffsets=*/false);
24826 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
24828 case Intrinsic::aarch64_sve_ldff1_gather:
24830 case Intrinsic::aarch64_sve_ldff1_gather_index:
24831 return performGatherLoadCombine(N, DAG,
24833 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
24834 return performGatherLoadCombine(N, DAG,
24836 /*OnlyPackedOffsets=*/false);
24837 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
24838 return performGatherLoadCombine(N, DAG,
24840 /*OnlyPackedOffsets=*/false);
24841 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
24842 return performGatherLoadCombine(N, DAG,
24844 /*OnlyPackedOffsets=*/false);
24845 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
24846 return performGatherLoadCombine(N, DAG,
24848 /*OnlyPackedOffsets=*/false);
24849 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
24850 return performGatherLoadCombine(N, DAG,
24852 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
24853 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
24855 case Intrinsic::aarch64_sve_st1q_scatter_index:
24857 case Intrinsic::aarch64_sve_st1_scatter:
24859 case Intrinsic::aarch64_sve_st1_scatter_index:
24861 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
24863 /*OnlyPackedOffsets=*/false);
24864 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
24866 /*OnlyPackedOffsets=*/false);
24867 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
24868 return performScatterStoreCombine(N, DAG,
24870 /*OnlyPackedOffsets=*/false);
24871 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
24872 return performScatterStoreCombine(N, DAG,
24874 /*OnlyPackedOffsets=*/false);
24875 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
24877 case Intrinsic::aarch64_rndr:
24878 case Intrinsic::aarch64_rndrrs: {
24879 unsigned IntrinsicID = N->getConstantOperandVal(1);
24880 auto Register =
24881 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
24882 : AArch64SysReg::RNDRRS);
24883 SDLoc DL(N);
24884 SDValue A = DAG.getNode(
24885 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
24886 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
24887 SDValue B = DAG.getNode(
24888 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
24889 DAG.getConstant(0, DL, MVT::i32),
24890 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
24891 return DAG.getMergeValues(
24892 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
24893 }
24894 case Intrinsic::aarch64_sme_ldr_zt:
24896 DAG.getVTList(MVT::Other), N->getOperand(0),
24897 N->getOperand(2), N->getOperand(3));
24898 case Intrinsic::aarch64_sme_str_zt:
24899 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
24900 DAG.getVTList(MVT::Other), N->getOperand(0),
24901 N->getOperand(2), N->getOperand(3));
24902 default:
24903 break;
24904 }
24905 break;
24906 case ISD::GlobalAddress:
24907 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
24908 case ISD::CTLZ:
24909 return performCTLZCombine(N, DAG, Subtarget);
24911 return performScalarToVectorCombine(N, DCI, DAG);
24912 }
24913 return SDValue();
24914}
24915
24916 // Check if the return value is used only as a return value, as otherwise
24917// we can't perform a tail-call. In particular, we need to check for
24918// target ISD nodes that are returns and any other "odd" constructs
24919// that the generic analysis code won't necessarily catch.
24920bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
24921 SDValue &Chain) const {
24922 if (N->getNumValues() != 1)
24923 return false;
24924 if (!N->hasNUsesOfValue(1, 0))
24925 return false;
24926
24927 SDValue TCChain = Chain;
24928 SDNode *Copy = *N->use_begin();
24929 if (Copy->getOpcode() == ISD::CopyToReg) {
24930 // If the copy has a glue operand, we conservatively assume it isn't safe to
24931 // perform a tail call.
24932 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
24933 MVT::Glue)
24934 return false;
24935 TCChain = Copy->getOperand(0);
24936 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
24937 return false;
24938
24939 bool HasRet = false;
24940 for (SDNode *Node : Copy->uses()) {
24941 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
24942 return false;
24943 HasRet = true;
24944 }
24945
24946 if (!HasRet)
24947 return false;
24948
24949 Chain = TCChain;
24950 return true;
24951}
24952
24953 // Return whether an instruction can potentially be optimized to a tail
24954// call. This will cause the optimizers to attempt to move, or duplicate,
24955// return instructions to help enable tail call optimizations for this
24956// instruction.
24957bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
24958 return CI->isTailCall();
24959}
24960
24961bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
24962 Register Offset, bool IsPre,
24963 MachineRegisterInfo &MRI) const {
24964 auto CstOffset = getIConstantVRegVal(Offset, MRI);
24965 if (!CstOffset || CstOffset->isZero())
24966 return false;
24967
24968 // All of the indexed addressing mode instructions take a signed 9 bit
24969 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
24970 // encodes the sign/indexing direction.
24971 return isInt<9>(CstOffset->getSExtValue());
24972}
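// Editor's sketch (not part of the original source): isInt<9>() accepts offsets
// in [-256, 255], which is exactly the immediate range of the AArch64 pre/post-
// indexed load/store forms, e.g. (with llvm/Support/MathExtras.h in scope):
//
//   static_assert(isInt<9>(255) && isInt<9>(-256) && !isInt<9>(256),
//                 "indexed addressing immediates are signed 9-bit");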
24973
24974bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
24975 SDValue &Base,
24976 SDValue &Offset,
24977 SelectionDAG &DAG) const {
24978 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
24979 return false;
24980
24981 // Non-null if there is exactly one user of the loaded value (ignoring chain).
24982 SDNode *ValOnlyUser = nullptr;
24983 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
24984 ++UI) {
24985 if (UI.getUse().getResNo() == 1)
24986 continue; // Ignore chain.
24987 if (ValOnlyUser == nullptr)
24988 ValOnlyUser = *UI;
24989 else {
24990 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
24991 break;
24992 }
24993 }
24994
24995 auto IsUndefOrZero = [](SDValue V) {
24996 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
24997 };
24998
24999 // If the only user of the value is a scalable vector splat, it is
25000 // preferable to do a replicating load (ld1r*).
25001 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
25002 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
25003 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
25004 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
25005 return false;
25006
25007 Base = Op->getOperand(0);
25008 // All of the indexed addressing mode instructions take a signed
25009 // 9 bit immediate offset.
25010 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
25011 int64_t RHSC = RHS->getSExtValue();
25012 if (Op->getOpcode() == ISD::SUB)
25013 RHSC = -(uint64_t)RHSC;
25014 if (!isInt<9>(RHSC))
25015 return false;
25016 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
25017 // when dealing with subtraction.
25018 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
25019 return true;
25020 }
25021 return false;
25022}
25023
25024bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
25025 SDValue &Offset,
25027 SelectionDAG &DAG) const {
25028 EVT VT;
25029 SDValue Ptr;
25030 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
25031 VT = LD->getMemoryVT();
25032 Ptr = LD->getBasePtr();
25033 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
25034 VT = ST->getMemoryVT();
25035 Ptr = ST->getBasePtr();
25036 } else
25037 return false;
25038
25039 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
25040 return false;
25041 AM = ISD::PRE_INC;
25042 return true;
25043}
25044
25045bool AArch64TargetLowering::getPostIndexedAddressParts(
25047 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
25048 EVT VT;
25049 SDValue Ptr;
25050 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
25051 VT = LD->getMemoryVT();
25052 Ptr = LD->getBasePtr();
25053 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
25054 VT = ST->getMemoryVT();
25055 Ptr = ST->getBasePtr();
25056 } else
25057 return false;
25058
25059 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
25060 return false;
25061 // Post-indexing updates the base, so it's not a valid transform
25062 // if that's not the same as the load's pointer.
25063 if (Ptr != Base)
25064 return false;
25065 AM = ISD::POST_INC;
25066 return true;
25067}
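// Editor's sketch (illustrative, not part of the original source): the kind of
// source these hooks target is a simple strided walk, e.g.
//
//   long sum(const int *p, long n) {
//     long s = 0;
//     for (long i = 0; i != n; ++i)
//       s += p[i];                 // load + pointer advance
//     return s;
//   }
//
// where the pointer increment can be folded into a pre/post-indexed load with
// writeback instead of a separate add, provided the offset fits in the signed
// 9-bit immediate checked above.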
25068
25071 SelectionDAG &DAG) {
25072 SDLoc DL(N);
25073 SDValue Op = N->getOperand(0);
25074 EVT VT = N->getValueType(0);
25075 [[maybe_unused]] EVT SrcVT = Op.getValueType();
25076 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25077 "Must be bool vector.");
25078
25079 // Special handling for Clang's __builtin_convertvector. For vectors with <8
25080 // elements, it adds a vector concatenation with undef(s). If we encounter
25081 // this here, we can skip the concat.
25082 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
25083 bool AllUndef = true;
25084 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
25085 AllUndef &= Op.getOperand(I).isUndef();
25086
25087 if (AllUndef)
25088 Op = Op.getOperand(0);
25089 }
25090
25091 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
25092 if (VectorBits)
25093 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
25094}
25095
25098 SelectionDAG &DAG, EVT ExtendVT,
25099 EVT CastVT) {
25100 SDLoc DL(N);
25101 SDValue Op = N->getOperand(0);
25102 EVT VT = N->getValueType(0);
25103
25104 // Use SCALAR_TO_VECTOR for lane zero
25105 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
25106 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
25107 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
25108 Results.push_back(
25109 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
25110}
25111
25112void AArch64TargetLowering::ReplaceBITCASTResults(
25114 SDLoc DL(N);
25115 SDValue Op = N->getOperand(0);
25116 EVT VT = N->getValueType(0);
25117 EVT SrcVT = Op.getValueType();
25118
25119 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
25120 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
25121 return;
25122 }
25123
25124 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
25125 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
25126 return;
25127 }
25128
25129 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
25130 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
25131 return;
25132 }
25133
25134 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
25135 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
25136 "Expected fp->int bitcast!");
25137
25138 // Bitcasting between unpacked vector types of different element counts is
25139 // not a NOP because the live elements are laid out differently.
25140 // 01234567
25141 // e.g. nxv2i32 = XX??XX??
25142 // nxv4f16 = X?X?X?X?
25143 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
25144 return;
25145
25146 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
25147 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
25148 return;
25149 }
25150
25151 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25152 !VT.isVector())
25153 return replaceBoolVectorBitcast(N, Results, DAG);
25154
25155 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
25156 return;
25157
25158 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
25159 DAG.getUNDEF(MVT::i32), Op);
25160 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
25161 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
25162}
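// Editor's note (illustrative, not part of the original source): the tail of
// ReplaceBITCASTResults handles bitcasting f16/bf16 to the illegal result type
// i16: the half value is placed in the low lane of an f32 register via the hsub
// subregister insert, bitcast to i32, and then truncated to i16.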
25163
25165 SelectionDAG &DAG,
25166 const AArch64Subtarget *Subtarget) {
25167 EVT VT = N->getValueType(0);
25168 if (!VT.is256BitVector() ||
25170 !N->getFlags().hasAllowReassociation()) ||
25171 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
25172 VT.getScalarType() == MVT::bf16)
25173 return;
25174
25175 SDValue X = N->getOperand(0);
25176 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
25177 if (!Shuf) {
25178 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
25179 X = N->getOperand(1);
25180 if (!Shuf)
25181 return;
25182 }
25183
25184 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
25185 return;
25186
25187 // Check the mask is 1,0,3,2,5,4,...
25188 ArrayRef<int> Mask = Shuf->getMask();
25189 for (int I = 0, E = Mask.size(); I < E; I++)
25190 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
25191 return;
25192
25193 SDLoc DL(N);
25194 auto LoHi = DAG.SplitVector(X, DL);
25195 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
25196 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
25197 LoHi.first, LoHi.second);
25198
25199 // Shuffle the elements back into order.
25200 SmallVector<int> NMask;
25201 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
25202 NMask.push_back(I);
25203 NMask.push_back(I);
25204 }
25205 Results.push_back(
25206 DAG.getVectorShuffle(VT, DL,
25207 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
25208 DAG.getUNDEF(LoHi.first.getValueType())),
25209 DAG.getUNDEF(VT), NMask));
25210}
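// Editor's sketch (assumptions: Clang vector extensions and reassociation enabled,
// e.g. -ffast-math; not part of the original source): a 256-bit pattern of the
// shape recognised above is
//
//   typedef float v8f32 __attribute__((vector_size(32)));
//   v8f32 pairwise(v8f32 x) {
//     return x + __builtin_shufflevector(x, x, 1, 0, 3, 2, 5, 4, 7, 6);
//   }
//
// which ReplaceAddWithADDP rewrites into a single 128-bit ADDP of the two halves
// followed by a shuffle that duplicates each pairwise sum into both lanes of the
// pair.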
25211
25214 SelectionDAG &DAG, unsigned InterOp,
25215 unsigned AcrossOp) {
25216 EVT LoVT, HiVT;
25217 SDValue Lo, Hi;
25218 SDLoc dl(N);
25219 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
25220 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25221 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
25222 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
25223 Results.push_back(SplitVal);
25224}
25225
25226void AArch64TargetLowering::ReplaceExtractSubVectorResults(
25228 SDValue In = N->getOperand(0);
25229 EVT InVT = In.getValueType();
25230
25231 // Common code will handle these just fine.
25232 if (!InVT.isScalableVector() || !InVT.isInteger())
25233 return;
25234
25235 SDLoc DL(N);
25236 EVT VT = N->getValueType(0);
25237
25238 // The following checks bail if this is not a halving operation.
25239
25240 ElementCount ResEC = VT.getVectorElementCount();
25241
25242 if (InVT.getVectorElementCount() != (ResEC * 2))
25243 return;
25244
25245 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
25246 if (!CIndex)
25247 return;
25248
25249 unsigned Index = CIndex->getZExtValue();
25250 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
25251 return;
25252
25253 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
25254 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
25255
25256 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
25257 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
25258}
25259
25260// Create an even/odd pair of X registers holding integer value V.
25262 SDLoc dl(V.getNode());
25263 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
25264 if (DAG.getDataLayout().isBigEndian())
25265 std::swap (VLo, VHi);
25266 SDValue RegClass =
25267 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
25268 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
25269 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
25270 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
25271 return SDValue(
25272 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
25273}
25274
25277 SelectionDAG &DAG,
25278 const AArch64Subtarget *Subtarget) {
25279 assert(N->getValueType(0) == MVT::i128 &&
25280 "AtomicCmpSwap on types less than 128 should be legal");
25281
25282 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25283 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
25284 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
25285 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
25286 SDValue Ops[] = {
25287 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
25288 createGPRPairNode(DAG, N->getOperand(3)), // Store value
25289 N->getOperand(1), // Ptr
25290 N->getOperand(0), // Chain in
25291 };
25292
25293 unsigned Opcode;
25294 switch (MemOp->getMergedOrdering()) {
25296 Opcode = AArch64::CASPX;
25297 break;
25299 Opcode = AArch64::CASPAX;
25300 break;
25302 Opcode = AArch64::CASPLX;
25303 break;
25306 Opcode = AArch64::CASPALX;
25307 break;
25308 default:
25309 llvm_unreachable("Unexpected ordering!");
25310 }
25311
25312 MachineSDNode *CmpSwap = DAG.getMachineNode(
25313 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
25314 DAG.setNodeMemRefs(CmpSwap, {MemOp});
25315
25316 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
25317 if (DAG.getDataLayout().isBigEndian())
25318 std::swap(SubReg1, SubReg2);
25319 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
25320 SDValue(CmpSwap, 0));
25321 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
25322 SDValue(CmpSwap, 0));
25323 Results.push_back(
25324 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25325 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
25326 return;
25327 }
25328
25329 unsigned Opcode;
25330 switch (MemOp->getMergedOrdering()) {
25332 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
25333 break;
25335 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
25336 break;
25338 Opcode = AArch64::CMP_SWAP_128_RELEASE;
25339 break;
25342 Opcode = AArch64::CMP_SWAP_128;
25343 break;
25344 default:
25345 llvm_unreachable("Unexpected ordering!");
25346 }
25347
25348 SDLoc DL(N);
25349 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
25350 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
25351 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
25352 New.first, New.second, N->getOperand(0)};
25353 SDNode *CmpSwap = DAG.getMachineNode(
25354 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
25355 Ops);
25356 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
25357
25358 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25359 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
25360 Results.push_back(SDValue(CmpSwap, 3));
25361}
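// Editor's sketch (illustrative, not part of the original source): source that
// reaches this path is a 128-bit compare-and-swap, e.g.
//
//   bool cas128(_Atomic __int128 *p, __int128 *expected, __int128 desired) {
//     return __c11_atomic_compare_exchange_strong(p, expected, desired,
//                                                 __ATOMIC_SEQ_CST,
//                                                 __ATOMIC_SEQ_CST);
//   }
//
// With LSE the seq_cst form selects CASPALX on an even/odd register pair;
// otherwise it becomes the CMP_SWAP_128 pseudo, which is later expanded into an
// LDAXP/STLXP loop.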
25362
25363static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
25364 AtomicOrdering Ordering) {
25365 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
25366 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
25367 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
25368 // ATOMIC_LOAD_CLR at any point.
25369 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
25370 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
25371 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
25372 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
25373
25374 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25375 // The operand will need to be XORed in a separate step.
25376 switch (Ordering) {
25378 return AArch64::LDCLRP;
25379 break;
25381 return AArch64::LDCLRPA;
25382 break;
25384 return AArch64::LDCLRPL;
25385 break;
25388 return AArch64::LDCLRPAL;
25389 break;
25390 default:
25391 llvm_unreachable("Unexpected ordering!");
25392 }
25393 }
25394
25395 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
25396 switch (Ordering) {
25398 return AArch64::LDSETP;
25399 break;
25401 return AArch64::LDSETPA;
25402 break;
25404 return AArch64::LDSETPL;
25405 break;
25408 return AArch64::LDSETPAL;
25409 break;
25410 default:
25411 llvm_unreachable("Unexpected ordering!");
25412 }
25413 }
25414
25415 if (ISDOpcode == ISD::ATOMIC_SWAP) {
25416 switch (Ordering) {
25418 return AArch64::SWPP;
25419 break;
25421 return AArch64::SWPPA;
25422 break;
25424 return AArch64::SWPPL;
25425 break;
25428 return AArch64::SWPPAL;
25429 break;
25430 default:
25431 llvm_unreachable("Unexpected ordering!");
25432 }
25433 }
25434
25435 llvm_unreachable("Unexpected ISDOpcode!");
25436}
25437
25440 SelectionDAG &DAG,
25441 const AArch64Subtarget *Subtarget) {
25442 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower them
25443 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
25444 // rather than the CASP instructions, because CASP has register classes for
25445 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
25446 // to present them as single operands. LSE128 instructions use the GPR64
25447 // register class (because the pair does not have to be sequential), like
25448 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
25449
25450 assert(N->getValueType(0) == MVT::i128 &&
25451 "AtomicLoadXXX on types less than 128 should be legal");
25452
25453 if (!Subtarget->hasLSE128())
25454 return;
25455
25456 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25457 const SDValue &Chain = N->getOperand(0);
25458 const SDValue &Ptr = N->getOperand(1);
25459 const SDValue &Val128 = N->getOperand(2);
25460 std::pair<SDValue, SDValue> Val2x64 =
25461 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
25462
25463 const unsigned ISDOpcode = N->getOpcode();
25464 const unsigned MachineOpcode =
25465 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
25466
25467 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25468 SDLoc dl(Val128);
25469 Val2x64.first =
25470 DAG.getNode(ISD::XOR, dl, MVT::i64,
25471 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
25472 Val2x64.second =
25473 DAG.getNode(ISD::XOR, dl, MVT::i64,
25474 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
25475 }
25476
25477 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
25478 if (DAG.getDataLayout().isBigEndian())
25479 std::swap(Ops[0], Ops[1]);
25480
25481 MachineSDNode *AtomicInst =
25482 DAG.getMachineNode(MachineOpcode, SDLoc(N),
25483 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
25484
25485 DAG.setNodeMemRefs(AtomicInst, {MemOp});
25486
25487 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
25488 if (DAG.getDataLayout().isBigEndian())
25489 std::swap(Lo, Hi);
25490
25491 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25492 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
25493}
25494
25495void AArch64TargetLowering::ReplaceNodeResults(
25497 switch (N->getOpcode()) {
25498 default:
25499 llvm_unreachable("Don't know how to custom expand this");
25500 case ISD::BITCAST:
25501 ReplaceBITCASTResults(N, Results, DAG);
25502 return;
25503 case ISD::VECREDUCE_ADD:
25508 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
25509 return;
25510 case ISD::ADD:
25511 case ISD::FADD:
25512 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
25513 return;
25514
25515 case ISD::CTPOP:
25516 case ISD::PARITY:
25517 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
25518 Results.push_back(Result);
25519 return;
25520 case AArch64ISD::SADDV:
25522 return;
25523 case AArch64ISD::UADDV:
25525 return;
25526 case AArch64ISD::SMINV:
25528 return;
25529 case AArch64ISD::UMINV:
25531 return;
25532 case AArch64ISD::SMAXV:
25534 return;
25535 case AArch64ISD::UMAXV:
25537 return;
25538 case ISD::MULHS:
25540 Results.push_back(
25541 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
25542 return;
25543 case ISD::MULHU:
25545 Results.push_back(
25546 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
25547 return;
25548 case ISD::FP_TO_UINT:
25549 case ISD::FP_TO_SINT:
25552 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
25553 // Let normal code take care of it by not adding anything to Results.
25554 return;
25556 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
25557 return;
25559 assert(N->getValueType(0) != MVT::i128 &&
25560 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
25561 break;
25564 case ISD::ATOMIC_SWAP: {
25565 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
25566 "Expected 128-bit atomicrmw.");
25567 // These need custom type legalisation so we go directly to instruction.
25568 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
25569 return;
25570 }
25571 case ISD::ATOMIC_LOAD:
25572 case ISD::LOAD: {
25573 MemSDNode *LoadNode = cast<MemSDNode>(N);
25574 EVT MemVT = LoadNode->getMemoryVT();
25575 // Handle lowering 256-bit non-temporal loads into LDNP for little-endian
25576 // targets.
25577 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
25578 MemVT.getSizeInBits() == 256u &&
25579 (MemVT.getScalarSizeInBits() == 8u ||
25580 MemVT.getScalarSizeInBits() == 16u ||
25581 MemVT.getScalarSizeInBits() == 32u ||
25582 MemVT.getScalarSizeInBits() == 64u)) {
25583
25586 DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25587 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25588 MVT::Other}),
25589 {LoadNode->getChain(), LoadNode->getBasePtr()},
25590 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25591
25592 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
25593 Result.getValue(0), Result.getValue(1));
25594 Results.append({Pair, Result.getValue(2) /* Chain */});
25595 return;
25596 }
25597
25598 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
25599 LoadNode->getMemoryVT() != MVT::i128) {
25600 // Non-volatile, non-atomic loads (and loads that are not 128 bits wide) are
25601 // optimized later in AArch64's load/store optimizer.
25602 return;
25603 }
25604
25605 if (SDValue(N, 0).getValueType() == MVT::i128) {
25606 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
25607 bool isLoadAcquire =
25609 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
25610
25611 if (isLoadAcquire)
25612 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
25613
25615 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25616 {LoadNode->getChain(), LoadNode->getBasePtr()},
25617 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25618
25619 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
25620
25621 SDValue Pair =
25622 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
25623 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
25624 Results.append({Pair, Result.getValue(2) /* Chain */});
25625 }
25626 return;
25627 }
25629 ReplaceExtractSubVectorResults(N, Results, DAG);
25630 return;
25633 // Custom lowering has been requested for INSERT_SUBVECTOR and
25634 // CONCAT_VECTORS -- but delegate to common code for result type
25635 // legalisation
25636 return;
25638 EVT VT = N->getValueType(0);
25639 assert((VT == MVT::i8 || VT == MVT::i16) &&
25640 "custom lowering for unexpected type");
25641
25642 Intrinsic::ID IntID =
25643 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
25644 switch (IntID) {
25645 default:
25646 return;
25647 case Intrinsic::aarch64_sve_clasta_n: {
25648 SDLoc DL(N);
25649 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25650 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
25651 N->getOperand(1), Op2, N->getOperand(3));
25652 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25653 return;
25654 }
25655 case Intrinsic::aarch64_sve_clastb_n: {
25656 SDLoc DL(N);
25657 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25658 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
25659 N->getOperand(1), Op2, N->getOperand(3));
25660 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25661 return;
25662 }
25663 case Intrinsic::aarch64_sve_lasta: {
25664 SDLoc DL(N);
25665 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
25666 N->getOperand(1), N->getOperand(2));
25667 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25668 return;
25669 }
25670 case Intrinsic::aarch64_sve_lastb: {
25671 SDLoc DL(N);
25672 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
25673 N->getOperand(1), N->getOperand(2));
25674 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25675 return;
25676 }
25677 }
25678 }
25679 case ISD::READ_REGISTER: {
25680 SDLoc DL(N);
25681 assert(N->getValueType(0) == MVT::i128 &&
25682 "READ_REGISTER custom lowering is only for 128-bit sysregs");
25683 SDValue Chain = N->getOperand(0);
25684 SDValue SysRegName = N->getOperand(1);
25685
25686 SDValue Result = DAG.getNode(
25687 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25688 Chain, SysRegName);
25689
25690 // Sysregs are not endian. Result.getValue(0) always contains the lower half
25691 // of the 128-bit System Register value.
25692 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25693 Result.getValue(0), Result.getValue(1));
25694 Results.push_back(Pair);
25695 Results.push_back(Result.getValue(2)); // Chain
25696 return;
25697 }
25698 }
25699}
25700
25702 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
25704 return true;
25705}
25706
25707unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
25708 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
25709 // reciprocal if there are three or more FDIVs.
25710 return 3;
25711}
25712
25715 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
25716 // v4i16, v2i32 instead of promoting them.
25717 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
25718 VT == MVT::v1f32)
25719 return TypeWidenVector;
25720
25722}
25723
25724// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
25725// provided the address is 16-byte aligned.
25727 if (!Subtarget->hasLSE2())
25728 return false;
25729
25730 if (auto LI = dyn_cast<LoadInst>(I))
25731 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25732 LI->getAlign() >= Align(16);
25733
25734 if (auto SI = dyn_cast<StoreInst>(I))
25735 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25736 SI->getAlign() >= Align(16);
25737
25738 return false;
25739}
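// Editor's sketch (illustrative, not part of the original source; assumes the
// +lse2 target feature): the case accepted above is a naturally 16-byte-aligned
// 128-bit atomic access, e.g.
//
//   __int128 load128(_Atomic __int128 *p) {
//     return __c11_atomic_load(p, __ATOMIC_RELAXED);   // can stay a plain LDP
//   }
//
// On targets without LSE2 the same load must instead be expanded (see
// shouldExpandAtomicLoadInIR below).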
25740
25742 if (!Subtarget->hasLSE128())
25743 return false;
25744
25745 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
25746 // will clobber the two registers.
25747 if (const auto *SI = dyn_cast<StoreInst>(I))
25748 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25749 SI->getAlign() >= Align(16) &&
25750 (SI->getOrdering() == AtomicOrdering::Release ||
25751 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
25752
25753 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
25754 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25755 RMW->getAlign() >= Align(16) &&
25756 (RMW->getOperation() == AtomicRMWInst::Xchg ||
25757 RMW->getOperation() == AtomicRMWInst::And ||
25758 RMW->getOperation() == AtomicRMWInst::Or);
25759
25760 return false;
25761}
25762
25764 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
25765 return false;
25766
25767 if (auto LI = dyn_cast<LoadInst>(I))
25768 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25769 LI->getAlign() >= Align(16) &&
25770 LI->getOrdering() == AtomicOrdering::Acquire;
25771
25772 if (auto SI = dyn_cast<StoreInst>(I))
25773 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25774 SI->getAlign() >= Align(16) &&
25775 SI->getOrdering() == AtomicOrdering::Release;
25776
25777 return false;
25778}
25779
25781 const Instruction *I) const {
25783 return false;
25785 return false;
25787 return true;
25788 return false;
25789}
25790
25792 const Instruction *I) const {
25793 // Store-Release instructions only provide seq_cst guarantees when paired with
25794 // Load-Acquire instructions. MSVC CRT does not use these instructions to
25795 // implement seq_cst loads and stores, so we need additional explicit fences
25796 // after memory writes.
25797 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
25798 return false;
25799
25800 switch (I->getOpcode()) {
25801 default:
25802 return false;
25803 case Instruction::AtomicCmpXchg:
25804 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
25806 case Instruction::AtomicRMW:
25807 return cast<AtomicRMWInst>(I)->getOrdering() ==
25809 case Instruction::Store:
25810 return cast<StoreInst>(I)->getOrdering() ==
25812 }
25813}
25814
25815 // Loads and stores less than 128 bits are already atomic; ones above that
25816// are doomed anyway, so defer to the default libcall and blame the OS when
25817// things go wrong.
25820 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
25821 if (Size != 128)
25823 if (isOpSuitableForRCPC3(SI))
25825 if (isOpSuitableForLSE128(SI))
25827 if (isOpSuitableForLDPSTP(SI))
25830}
25831
25832 // Loads and stores less than 128 bits are already atomic; ones above that
25833// are doomed anyway, so defer to the default libcall and blame the OS when
25834// things go wrong.
25837 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
25838
25839 if (Size != 128)
25841 if (isOpSuitableForRCPC3(LI))
25843 // No LSE128 loads
25844 if (isOpSuitableForLDPSTP(LI))
25846
25847 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25848 // implement atomicrmw without spilling. If the target address is also on the
25849 // stack and close enough to the spill slot, this can lead to a situation
25850 // where the monitor always gets cleared and the atomic operation can never
25851 // succeed. So at -O0 lower this operation to a CAS loop.
25852 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25854
25855 // Using CAS for an atomic load has a better chance of succeeding under high
25856 // contention. So use it if available.
25857 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
25859}
25860
25861// The "default" for integer RMW operations is to expand to an LL/SC loop.
25862// However, with the LSE instructions (or outline-atomics mode, which provides
25863 // library routines in place of the LSE instructions), we can directly emit many
25864// operations instead.
25865//
25866// Floating-point operations are always emitted to a cmpxchg loop, because they
25867// may trigger a trap which aborts an LLSC sequence.
25870 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
25871 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
25872
25873 if (AI->isFloatingPointOperation())
25875
25876 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
25880 if (CanUseLSE128)
25882
25883 // Nand is not supported in LSE.
25884 // Leave 128 bits to LLSC or CmpXChg.
25885 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
25886 if (Subtarget->hasLSE())
25888 if (Subtarget->outlineAtomics()) {
25889 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
25890 // Don't outline them unless
25891 // (1) high level <atomic> support approved:
25892 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
25893 // (2) low level libgcc and compiler-rt support implemented by:
25894 // min/max outline atomics helpers
25895 if (AI->getOperation() != AtomicRMWInst::Min &&
25900 }
25901 }
25902 }
25903
25904 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25905 // implement atomicrmw without spilling. If the target address is also on the
25906 // stack and close enough to the spill slot, this can lead to a situation
25907 // where the monitor always gets cleared and the atomic operation can never
25908 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
25909 // we have a single CAS instruction that can replace the loop.
25911 Subtarget->hasLSE())
25913
25915}
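// Editor's sketch (illustrative, not part of the original source; assumes +lse):
// when the RMW is left unexpanded above, ISel selects the LSE atomics directly,
// e.g.
//
//   int fetch_add(_Atomic int *p, int v) {
//     return __c11_atomic_fetch_add(p, v, __ATOMIC_SEQ_CST);   // -> LDADDAL
//   }
//
// Without LSE (and without outlined atomics) the same operation is expanded to an
// LL/SC loop built from the emitLoadLinked/emitStoreConditional hooks below.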
25916
25919 AtomicCmpXchgInst *AI) const {
25920 // If subtarget has LSE, leave cmpxchg intact for codegen.
25921 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
25923 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25924 // implement cmpxchg without spilling. If the address being exchanged is also
25925 // on the stack and close enough to the spill slot, this can lead to a
25926 // situation where the monitor always gets cleared and the atomic operation
25927 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
25928 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25930
25931 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
25932 // it.
25934 if (Size > 64)
25936
25938}
25939
25941 Type *ValueTy, Value *Addr,
25942 AtomicOrdering Ord) const {
25943 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25944 bool IsAcquire = isAcquireOrStronger(Ord);
25945
25946 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp/ldaxp
25947 // intrinsics must return {i64, i64} and we have to recombine them into a
25948 // single i128 here.
25949 if (ValueTy->getPrimitiveSizeInBits() == 128) {
25951 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
25953
25954 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
25955
25956 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
25957 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
25958 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
25959 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
25960 return Builder.CreateOr(
25961 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
25962 }
25963
25964 Type *Tys[] = { Addr->getType() };
25966 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
25967 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
25968
25969 const DataLayout &DL = M->getDataLayout();
25970 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
25971 CallInst *CI = Builder.CreateCall(Ldxr, Addr);
25972 CI->addParamAttr(
25973 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
25974 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
25975
25976 return Builder.CreateBitCast(Trunc, ValueTy);
25977}
25978
25980 IRBuilderBase &Builder) const {
25981 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25982 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
25983}
25984
25986 Value *Val, Value *Addr,
25987 AtomicOrdering Ord) const {
25988 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25989 bool IsRelease = isReleaseOrStronger(Ord);
25990
25991 // Since the intrinsics must have legal type, the i128 intrinsics take two
25992 // parameters: "i64, i64". We must marshal Val into the appropriate form
25993 // before the call.
25994 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
25996 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
25998 Type *Int64Ty = Type::getInt64Ty(M->getContext());
25999
26000 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
26001 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
26002 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
26003 }
26004
26006 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
26007 Type *Tys[] = { Addr->getType() };
26008 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
26009
26010 const DataLayout &DL = M->getDataLayout();
26011 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
26012 Val = Builder.CreateBitCast(Val, IntValTy);
26013
26014 CallInst *CI = Builder.CreateCall(
26015 Stxr, {Builder.CreateZExtOrBitCast(
26016 Val, Stxr->getFunctionType()->getParamType(0)),
26017 Addr});
26018 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
26019 Attribute::ElementType, Val->getType()));
26020 return CI;
26021}
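// Editor's note (illustrative, not part of the original source): for the 128-bit
// case the two hooks above emit roughly
//
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)            ; load side
//   ...
//   %fail = call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, ptr %addr)   ; store side
//
// with the i128 value recombined (zext/shl/or) after the load and split
// (trunc/lshr) before the store, as shown in the code.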
26022
26024 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
26025 const DataLayout &DL) const {
26026 if (!Ty->isArrayTy()) {
26027 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
26028 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
26029 }
26030
26031 // All non-aggregate members of the type must have the same type.
26032 SmallVector<EVT> ValueVTs;
26033 ComputeValueVTs(*this, DL, Ty, ValueVTs);
26034 return all_equal(ValueVTs);
26035}
26036
26037bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
26038 EVT) const {
26039 return false;
26040}
26041
26042static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
26043 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
26044 Function *ThreadPointerFunc =
26045 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
26046 return IRB.CreatePointerCast(
26047 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
26048 Offset),
26049 IRB.getPtrTy(0));
26050}
26051
26053 // Android provides a fixed TLS slot for the stack cookie. See the definition
26054 // of TLS_SLOT_STACK_GUARD in
26055 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
26056 if (Subtarget->isTargetAndroid())
26057 return UseTlsOffset(IRB, 0x28);
26058
26059 // Fuchsia is similar.
26060 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
26061 if (Subtarget->isTargetFuchsia())
26062 return UseTlsOffset(IRB, -0x10);
26063
26065}
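// Editor's note (illustrative, not part of the original source): on Android the
// returned pointer is TPIDR_EL0 + 0x28, so the stack-guard load typically lowers
// to something like "mrs xN, TPIDR_EL0; ldr xM, [xN, #0x28]" instead of a load
// from a global __stack_chk_guard.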
26066
26068 // MSVC CRT provides functionality for stack protection.
26069 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
26070 // MSVC CRT has a global variable holding security cookie.
26071 M.getOrInsertGlobal("__security_cookie",
26072 PointerType::getUnqual(M.getContext()));
26073
26074 // MSVC CRT has a function to validate security cookie.
26075 FunctionCallee SecurityCheckCookie =
26076 M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
26077 Type::getVoidTy(M.getContext()),
26078 PointerType::getUnqual(M.getContext()));
26079 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
26080 F->setCallingConv(CallingConv::Win64);
26081 F->addParamAttr(0, Attribute::AttrKind::InReg);
26082 }
26083 return;
26084 }
26086}
26087
26089 // MSVC CRT has a global variable holding security cookie.
26090 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26091 return M.getGlobalVariable("__security_cookie");
26093}
26094
26096 // MSVC CRT has a function to validate security cookie.
26097 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26098 return M.getFunction(Subtarget->getSecurityCheckCookieName());
26100}
26101
26102Value *
26104 // Android provides a fixed TLS slot for the SafeStack pointer. See the
26105 // definition of TLS_SLOT_SAFESTACK in
26106 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
26107 if (Subtarget->isTargetAndroid())
26108 return UseTlsOffset(IRB, 0x48);
26109
26110 // Fuchsia is similar.
26111 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
26112 if (Subtarget->isTargetFuchsia())
26113 return UseTlsOffset(IRB, -0x8);
26114
26116}
26117
26119 const Instruction &AndI) const {
26120 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
26121 // this is likely to fold the and/cmp/br into a single tbz instruction. It
26122 // may be beneficial to sink in other cases, but we would have to check that
26123 // the cmp would not get folded into the br to form a cbz for these to be
26124 // beneficial.
26125 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
26126 if (!Mask)
26127 return false;
26128 return Mask->getValue().isPowerOf2();
26129}
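// Editor's sketch (illustrative, not part of the original source): the single-bit
// case this hook favours is e.g.
//
//   if (flags & 0x40)      // and + icmp eq 0 + br folds into one TBZ/TBNZ
//     handle();
//
// For multi-bit masks the and is left in place, since the compare could instead
// fold into a CBZ/CBNZ.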
26130
26134 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
26135 SelectionDAG &DAG) const {
26136 // Does baseline recommend not to perform the fold by default?
26138 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
26139 return false;
26140 // Else, if this is a vector shift, prefer 'shl'.
26141 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
26142}
26143
26146 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
26148 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
26151 ExpansionFactor);
26152}
26153
26155 // Update IsSplitCSR in AArch64FunctionInfo.
26156 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
26157 AFI->setIsSplitCSR(true);
26158}
26159
26161 MachineBasicBlock *Entry,
26162 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
26163 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
26164 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
26165 if (!IStart)
26166 return;
26167
26168 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
26169 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
26170 MachineBasicBlock::iterator MBBI = Entry->begin();
26171 for (const MCPhysReg *I = IStart; *I; ++I) {
26172 const TargetRegisterClass *RC = nullptr;
26173 if (AArch64::GPR64RegClass.contains(*I))
26174 RC = &AArch64::GPR64RegClass;
26175 else if (AArch64::FPR64RegClass.contains(*I))
26176 RC = &AArch64::FPR64RegClass;
26177 else
26178 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
26179
26180 Register NewVR = MRI->createVirtualRegister(RC);
26181 // Create copy from CSR to a virtual register.
26182 // FIXME: this currently does not emit CFI pseudo-instructions, it works
26183 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
26184 // nounwind. If we want to generalize this later, we may need to emit
26185 // CFI pseudo-instructions.
26186 assert(Entry->getParent()->getFunction().hasFnAttribute(
26187 Attribute::NoUnwind) &&
26188 "Function should be nounwind in insertCopiesSplitCSR!");
26189 Entry->addLiveIn(*I);
26190 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
26191 .addReg(*I);
26192
26193 // Insert the copy-back instructions right before the terminator.
26194 for (auto *Exit : Exits)
26195 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
26196 TII->get(TargetOpcode::COPY), *I)
26197 .addReg(NewVR);
26198 }
26199}
26200
26202 // Integer division on AArch64 is expensive. However, when aggressively
26203 // optimizing for code size, we prefer to use a div instruction, as it is
26204 // usually smaller than the alternative sequence.
26205 // The exception to this is vector division. Since AArch64 doesn't have vector
26206 // integer division, leaving the division as-is is a loss even in terms of
26207 // size, because it will have to be scalarized, while the alternative code
26208 // sequence can be performed in vector form.
26209 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
26210 return OptSize && !VT.isVector();
26211}
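// Worked example for the hook above (illustrative): at minsize a scalar
// "sdiv i32 %x, 7" is reported as cheap and kept as a single SDIV, whereas
// "sdiv <4 x i32>" is not, so the divide-by-constant is still expanded into
// a mul/shift sequence that can stay in vector form instead of being
// scalarized.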
26212
26214 // We want inc-of-add for scalars and sub-of-not for vectors.
26215 return VT.isScalarInteger();
26216}
26217
26219 EVT VT) const {
26220 // v8f16 without fp16 needs to be extended to v8f32, which is more difficult to
26221 // legalize.
26222 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
26223 return false;
26224 if (FPVT == MVT::v8bf16)
26225 return false;
26226 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
26227}
26228
26232 const TargetInstrInfo *TII) const {
26233 assert(MBBI->isCall() && MBBI->getCFIType() &&
26234 "Invalid call instruction for a KCFI check");
26235
26236 switch (MBBI->getOpcode()) {
26237 case AArch64::BLR:
26238 case AArch64::BLRNoIP:
26239 case AArch64::TCRETURNri:
26240 case AArch64::TCRETURNrix16x17:
26241 case AArch64::TCRETURNrix17:
26242 case AArch64::TCRETURNrinotx16:
26243 break;
26244 default:
26245 llvm_unreachable("Unexpected CFI call opcode");
26246 }
26247
26248 MachineOperand &Target = MBBI->getOperand(0);
26249 assert(Target.isReg() && "Invalid target operand for an indirect call");
26250 Target.setIsRenamable(false);
26251
26252 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
26253 .addReg(Target.getReg())
26254 .addImm(MBBI->getCFIType())
26255 .getInstr();
26256}
26257
26259 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
26260}
26261
26262unsigned
26264 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
26265 return getPointerTy(DL).getSizeInBits();
26266
26267 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
26268}
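// Descriptive note on the sizes above: the non-Darwin/Windows figure matches
// the AAPCS64 va_list layout of three pointers (__stack, __gr_top, __vr_top)
// plus two 32-bit offsets (__gr_offs, __vr_offs), i.e. 3 * 64 + 2 * 32 = 256
// bits on LP64 targets; Darwin and Windows use a simple pointer-sized va_list.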
26269
26270void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
26271 MachineFrameInfo &MFI = MF.getFrameInfo();
26272 // If we have any vulnerable SVE stack objects then the stack protector
26273 // needs to be placed at the top of the SVE stack area, as the SVE locals
26274 // are placed above the other locals, so we allocate it as if it were a
26275 // scalable vector.
26276 // FIXME: It may be worthwhile having a specific interface for this rather
26277 // than doing it here in finalizeLowering.
26278 if (MFI.hasStackProtectorIndex()) {
26279 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
26285 break;
26286 }
26287 }
26288 }
26291}
26292
26293// Unlike X86, we let frame lowering assign offsets to all catch objects.
26295 return false;
26296}
26297
26298bool AArch64TargetLowering::shouldLocalize(
26299 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
26300 auto &MF = *MI.getMF();
26301 auto &MRI = MF.getRegInfo();
26302 auto maxUses = [](unsigned RematCost) {
26303 // A cost of 1 means remats are basically free.
26304 if (RematCost == 1)
26305 return std::numeric_limits<unsigned>::max();
26306 if (RematCost == 2)
26307 return 2U;
26308
26309 // Remat is too expensive, only sink if there's one user.
26310 if (RematCost > 2)
26311 return 1U;
26312 llvm_unreachable("Unexpected remat cost");
26313 };
26314
26315 unsigned Opc = MI.getOpcode();
26316 switch (Opc) {
26317 case TargetOpcode::G_GLOBAL_VALUE: {
26318 // On Darwin, TLS global vars get selected into function calls, which
26319 // we don't want localized, as they can get moved into the middle of
26320 // another call sequence.
26321 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
26322 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
26323 return false;
26324 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
26325 }
26326 case TargetOpcode::G_FCONSTANT:
26327 case TargetOpcode::G_CONSTANT: {
26328 const ConstantInt *CI;
26329 unsigned AdditionalCost = 0;
26330
26331 if (Opc == TargetOpcode::G_CONSTANT)
26332 CI = MI.getOperand(1).getCImm();
26333 else {
26334 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
26335 // We try to estimate the cost of 32/64-bit fpimms, as they'll likely be
26336 // materialized as integers.
26337 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
26338 break;
26339 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
26340 bool OptForSize =
26343 OptForSize))
26344 return true; // Constant should be cheap.
26345 CI =
26346 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
26347 // FP materialization also costs an extra move, from gpr to fpr.
26348 AdditionalCost = 1;
26349 }
26350 APInt Imm = CI->getValue();
26353 assert(Cost.isValid() && "Expected a valid imm cost");
26354
26355 unsigned RematCost = *Cost.getValue();
26356 RematCost += AdditionalCost;
26357 Register Reg = MI.getOperand(0).getReg();
26358 unsigned MaxUses = maxUses(RematCost);
26359 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
26360 if (MaxUses == std::numeric_limits<unsigned>::max())
26361 --MaxUses;
26362 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
26363 }
26364 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
26365 // localizable.
26366 case AArch64::ADRP:
26367 case AArch64::G_ADD_LOW:
26368 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
26369 case TargetOpcode::G_PTR_ADD:
26370 return true;
26371 default:
26372 break;
26373 }
26375}
26376
26378 if (Inst.getType()->isScalableTy())
26379 return true;
26380
26381 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
26382 if (Inst.getOperand(i)->getType()->isScalableTy())
26383 return true;
26384
26385 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
26386 if (AI->getAllocatedType()->isScalableTy())
26387 return true;
26388 }
26389
26390 // Checks to allow the use of SME instructions
26391 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
26392 auto CallerAttrs = SMEAttrs(*Inst.getFunction());
26393 auto CalleeAttrs = SMEAttrs(*Base);
26394 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
26395 CallerAttrs.requiresLazySave(CalleeAttrs) ||
26396 CallerAttrs.requiresPreservingZT0(CalleeAttrs))
26397 return true;
26398 }
26399 return false;
26400}
26401
26402// Return the largest legal scalable vector type that matches VT's element type.
26406 "Expected legal fixed length vector!");
26407 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26408 default:
26409 llvm_unreachable("unexpected element type for SVE container");
26410 case MVT::i8:
26411 return EVT(MVT::nxv16i8);
26412 case MVT::i16:
26413 return EVT(MVT::nxv8i16);
26414 case MVT::i32:
26415 return EVT(MVT::nxv4i32);
26416 case MVT::i64:
26417 return EVT(MVT::nxv2i64);
26418 case MVT::bf16:
26419 return EVT(MVT::nxv8bf16);
26420 case MVT::f16:
26421 return EVT(MVT::nxv8f16);
26422 case MVT::f32:
26423 return EVT(MVT::nxv4f32);
26424 case MVT::f64:
26425 return EVT(MVT::nxv2f64);
26426 }
26427}
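// Illustrative mapping for the switch above (not exhaustive): a fixed-length
// v4i32 or v8i32 maps to the packed container nxv4i32, and v8f16 maps to
// nxv8f16; the container depends only on the element type, with the
// fixed-length data occupying the low lanes of the scalable register.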
26428
26429// Return a PTRUE with active lanes corresponding to the extent of VT.
26431 EVT VT) {
26434 "Expected legal fixed length vector!");
26435
26436 std::optional<unsigned> PgPattern =
26438 assert(PgPattern && "Unexpected element count for SVE predicate");
26439
26440 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
26441 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
26442 // variants of instructions when available.
26443 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
26444 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
26445 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
26446 if (MaxSVESize && MinSVESize == MaxSVESize &&
26447 MaxSVESize == VT.getSizeInBits())
26448 PgPattern = AArch64SVEPredPattern::all;
26449
26450 MVT MaskVT;
26451 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26452 default:
26453 llvm_unreachable("unexpected element type for SVE predicate");
26454 case MVT::i8:
26455 MaskVT = MVT::nxv16i1;
26456 break;
26457 case MVT::i16:
26458 case MVT::f16:
26459 case MVT::bf16:
26460 MaskVT = MVT::nxv8i1;
26461 break;
26462 case MVT::i32:
26463 case MVT::f32:
26464 MaskVT = MVT::nxv4i1;
26465 break;
26466 case MVT::i64:
26467 case MVT::f64:
26468 MaskVT = MVT::nxv2i1;
26469 break;
26470 }
26471
26472 return getPTrue(DAG, DL, MaskVT, *PgPattern);
26473}
26474
26476 EVT VT) {
26478 "Expected legal scalable vector!");
26479 auto PredTy = VT.changeVectorElementType(MVT::i1);
26480 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
26481}
26482
26484 if (VT.isFixedLengthVector())
26485 return getPredicateForFixedLengthVector(DAG, DL, VT);
26486
26487 return getPredicateForScalableVector(DAG, DL, VT);
26488}
26489
26490// Grow V to consume an entire SVE register.
26492 assert(VT.isScalableVector() &&
26493 "Expected to convert into a scalable vector!");
26494 assert(V.getValueType().isFixedLengthVector() &&
26495 "Expected a fixed length vector operand!");
26496 SDLoc DL(V);
26497 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26498 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
26499}
26500
26501// Shrink V so it's just big enough to maintain a VT's worth of data.
26504 "Expected to convert into a fixed length vector!");
26505 assert(V.getValueType().isScalableVector() &&
26506 "Expected a scalable vector operand!");
26507 SDLoc DL(V);
26508 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26509 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
26510}
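// Descriptive note: taken together, the two helpers above are just
// INSERT_SUBVECTOR into an undef scalable vector at index 0 and
// EXTRACT_SUBVECTOR from index 0, so the fixed-length data always lives in
// the low lanes of the SVE register.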
26511
26512// Convert all fixed length vector loads larger than NEON to masked_loads.
26513SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
26514 SDValue Op, SelectionDAG &DAG) const {
26515 auto Load = cast<LoadSDNode>(Op);
26516
26517 SDLoc DL(Op);
26518 EVT VT = Op.getValueType();
26519 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26520 EVT LoadVT = ContainerVT;
26521 EVT MemVT = Load->getMemoryVT();
26522
26523 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26524
26525 if (VT.isFloatingPoint()) {
26526 LoadVT = ContainerVT.changeTypeToInteger();
26527 MemVT = MemVT.changeTypeToInteger();
26528 }
26529
26530 SDValue NewLoad = DAG.getMaskedLoad(
26531 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
26532 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
26533 Load->getAddressingMode(), Load->getExtensionType());
26534
26535 SDValue Result = NewLoad;
26536 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
26537 EVT ExtendVT = ContainerVT.changeVectorElementType(
26538 Load->getMemoryVT().getVectorElementType());
26539
26540 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
26542 Pg, Result, DAG.getUNDEF(ContainerVT));
26543 } else if (VT.isFloatingPoint()) {
26544 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
26545 }
26546
26547 Result = convertFromScalableVector(DAG, VT, Result);
26548 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
26549 return DAG.getMergeValues(MergedValues, DL);
26550}
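// Rough sketch of the transformation above (illustrative, assuming a 256-bit
// SVE register): "load <8 x i32>, ptr %p" becomes a predicated masked load of
// the nxv4i32 container whose governing predicate covers 8 lanes, with the
// result shrunk back to <8 x i32> via EXTRACT_SUBVECTOR.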
26551
26553 SelectionDAG &DAG) {
26554 SDLoc DL(Mask);
26555 EVT InVT = Mask.getValueType();
26556 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26557
26558 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
26559
26560 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26561 return Pg;
26562
26563 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
26564 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
26565
26567 {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
26568}
26569
26570 // Convert all fixed length vector masked loads larger than NEON to SVE masked loads.
26571SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
26572 SDValue Op, SelectionDAG &DAG) const {
26573 auto Load = cast<MaskedLoadSDNode>(Op);
26574
26575 SDLoc DL(Op);
26576 EVT VT = Op.getValueType();
26577 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26578
26579 SDValue Mask = Load->getMask();
26580 // If this is an extending load and the mask type is not the same as the
26581 // load's type then we have to extend the mask type.
26582 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
26583 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
26584 "Incorrect mask type");
26585 Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask);
26586 }
26588
26589 SDValue PassThru;
26590 bool IsPassThruZeroOrUndef = false;
26591
26592 if (Load->getPassThru()->isUndef()) {
26593 PassThru = DAG.getUNDEF(ContainerVT);
26594 IsPassThruZeroOrUndef = true;
26595 } else {
26596 if (ContainerVT.isInteger())
26597 PassThru = DAG.getConstant(0, DL, ContainerVT);
26598 else
26599 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
26600 if (isZerosVector(Load->getPassThru().getNode()))
26601 IsPassThruZeroOrUndef = true;
26602 }
26603
26604 SDValue NewLoad = DAG.getMaskedLoad(
26605 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
26606 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
26607 Load->getAddressingMode(), Load->getExtensionType());
26608
26609 SDValue Result = NewLoad;
26610 if (!IsPassThruZeroOrUndef) {
26611 SDValue OldPassThru =
26612 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
26613 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
26614 }
26615
26616 Result = convertFromScalableVector(DAG, VT, Result);
26617 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
26618 return DAG.getMergeValues(MergedValues, DL);
26619}
26620
26621// Convert all fixed length vector stores larger than NEON to masked_stores.
26622SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
26623 SDValue Op, SelectionDAG &DAG) const {
26624 auto Store = cast<StoreSDNode>(Op);
26625
26626 SDLoc DL(Op);
26627 EVT VT = Store->getValue().getValueType();
26628 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26629 EVT MemVT = Store->getMemoryVT();
26630
26631 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26632 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
26633
26634 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
26635 EVT TruncVT = ContainerVT.changeVectorElementType(
26636 Store->getMemoryVT().getVectorElementType());
26637 MemVT = MemVT.changeTypeToInteger();
26638 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
26639 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
26640 DAG.getUNDEF(TruncVT));
26641 NewValue =
26642 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
26643 } else if (VT.isFloatingPoint()) {
26644 MemVT = MemVT.changeTypeToInteger();
26645 NewValue =
26646 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
26647 }
26648
26649 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
26650 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
26651 Store->getMemOperand(), Store->getAddressingMode(),
26652 Store->isTruncatingStore());
26653}
26654
26655SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
26656 SDValue Op, SelectionDAG &DAG) const {
26657 auto *Store = cast<MaskedStoreSDNode>(Op);
26658
26659 SDLoc DL(Op);
26660 EVT VT = Store->getValue().getValueType();
26661 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26662
26663 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
26665
26666 return DAG.getMaskedStore(
26667 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
26668 Mask, Store->getMemoryVT(), Store->getMemOperand(),
26669 Store->getAddressingMode(), Store->isTruncatingStore());
26670}
26671
26672SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
26673 SDValue Op, SelectionDAG &DAG) const {
26674 SDLoc dl(Op);
26675 EVT VT = Op.getValueType();
26676 EVT EltVT = VT.getVectorElementType();
26677
26678 bool Signed = Op.getOpcode() == ISD::SDIV;
26679 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
26680
26681 bool Negated;
26682 uint64_t SplatVal;
26683 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
26684 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26685 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
26686 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
26687
26688 SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
26689 SDValue Res =
26690 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
26691 if (Negated)
26692 Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
26693 DAG.getConstant(0, dl, ContainerVT), Res);
26694
26695 return convertFromScalableVector(DAG, VT, Res);
26696 }
26697
26698 // Scalable vector i32/i64 DIV is supported.
26699 if (EltVT == MVT::i32 || EltVT == MVT::i64)
26700 return LowerToPredicatedOp(Op, DAG, PredOpcode);
26701
26702 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
26703 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
26704 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
26705 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
26706
26707 // If the wider type is legal: extend, op, and truncate.
26708 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
26709 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
26710 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
26711 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
26712 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
26713 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
26714 }
26715
26716 auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
26717 &ExtendOpcode](SDValue Op) {
26718 SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
26719 SDValue IdxHalf =
26720 DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
26721 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
26722 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
26723 return std::pair<SDValue, SDValue>(
26724 {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
26725 DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
26726 };
26727
26728 // If the wider type is not legal: split, extend, op, truncate and concat.
26729 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
26730 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
26731 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
26732 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
26733 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
26734 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
26735 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
26736}
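// Worked example for the promotion paths above (illustrative): a fixed-length
// "sdiv <8 x i16>" has its operands sign-extended to <8 x i32> when that type
// is legal, divided via the predicated 32-bit SDIV, and truncated back;
// otherwise the vector is split into halves, widened, divided and
// re-concatenated.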
26737
26738SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
26739 SDValue Op, SelectionDAG &DAG) const {
26740 EVT VT = Op.getValueType();
26741 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26742
26743 SDLoc DL(Op);
26744 SDValue Val = Op.getOperand(0);
26745 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
26746 Val = convertToScalableVector(DAG, ContainerVT, Val);
26747
26748 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
26749 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
26750
26751 // Repeatedly unpack Val until the result is of the desired element type.
26752 switch (ContainerVT.getSimpleVT().SimpleTy) {
26753 default:
26754 llvm_unreachable("unimplemented container type");
26755 case MVT::nxv16i8:
26756 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
26757 if (VT.getVectorElementType() == MVT::i16)
26758 break;
26759 [[fallthrough]];
26760 case MVT::nxv8i16:
26761 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
26762 if (VT.getVectorElementType() == MVT::i32)
26763 break;
26764 [[fallthrough]];
26765 case MVT::nxv4i32:
26766 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
26767 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
26768 break;
26769 }
26770
26771 return convertFromScalableVector(DAG, VT, Val);
26772}
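// e.g. (illustrative) for the unpacking above: a sign_extend from v4i16 to
// v4i64 starts in the nxv8i16 container and is unpacked twice via SUNPKLO
// (nxv8i16 -> nxv4i32 -> nxv2i64) before being shrunk back to the
// fixed-length result type.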
26773
26774SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
26775 SDValue Op, SelectionDAG &DAG) const {
26776 EVT VT = Op.getValueType();
26777 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26778
26779 SDLoc DL(Op);
26780 SDValue Val = Op.getOperand(0);
26781 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
26782 Val = convertToScalableVector(DAG, ContainerVT, Val);
26783
26784 // Repeatedly truncate Val until the result is of the desired element type.
26785 switch (ContainerVT.getSimpleVT().SimpleTy) {
26786 default:
26787 llvm_unreachable("unimplemented container type");
26788 case MVT::nxv2i64:
26789 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
26790 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
26791 if (VT.getVectorElementType() == MVT::i32)
26792 break;
26793 [[fallthrough]];
26794 case MVT::nxv4i32:
26795 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
26796 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
26797 if (VT.getVectorElementType() == MVT::i16)
26798 break;
26799 [[fallthrough]];
26800 case MVT::nxv8i16:
26801 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
26802 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
26803 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
26804 break;
26805 }
26806
26807 return convertFromScalableVector(DAG, VT, Val);
26808}
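// e.g. (illustrative) for the truncation above: going from 64-bit to 16-bit
// elements performs two bitcast+UZP1 steps (nxv2i64 -> nxv4i32 -> nxv8i16),
// each UZP1 selecting the even-indexed narrow elements, which carry the
// truncated values.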
26809
26810SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
26811 SDValue Op, SelectionDAG &DAG) const {
26812 EVT VT = Op.getValueType();
26813 EVT InVT = Op.getOperand(0).getValueType();
26814 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
26815
26816 SDLoc DL(Op);
26817 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26818 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
26819
26820 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
26821}
26822
26823SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
26824 SDValue Op, SelectionDAG &DAG) const {
26825 EVT VT = Op.getValueType();
26826 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26827
26828 SDLoc DL(Op);
26829 EVT InVT = Op.getOperand(0).getValueType();
26830 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26831 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
26832
26833 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
26834 Op.getOperand(1), Op.getOperand(2));
26835
26836 return convertFromScalableVector(DAG, VT, ScalableRes);
26837}
26838
26839// Convert vector operation 'Op' to an equivalent predicated operation whereby
26840// the original operation's type is used to construct a suitable predicate.
26841// NOTE: The results for inactive lanes are undefined.
26842SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
26843 SelectionDAG &DAG,
26844 unsigned NewOp) const {
26845 EVT VT = Op.getValueType();
26846 SDLoc DL(Op);
26847 auto Pg = getPredicateForVector(DAG, DL, VT);
26848
26849 if (VT.isFixedLengthVector()) {
26850 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
26851 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26852
26853 // Create list of operands by converting existing ones to scalable types.
26855 for (const SDValue &V : Op->op_values()) {
26856 if (isa<CondCodeSDNode>(V)) {
26857 Operands.push_back(V);
26858 continue;
26859 }
26860
26861 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
26862 EVT VTArg = VTNode->getVT().getVectorElementType();
26863 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
26864 Operands.push_back(DAG.getValueType(NewVTArg));
26865 continue;
26866 }
26867
26868 assert(isTypeLegal(V.getValueType()) &&
26869 "Expected only legal fixed-width types");
26870 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
26871 }
26872
26873 if (isMergePassthruOpcode(NewOp))
26874 Operands.push_back(DAG.getUNDEF(ContainerVT));
26875
26876 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
26877 return convertFromScalableVector(DAG, VT, ScalableRes);
26878 }
26879
26880 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
26881
26883 for (const SDValue &V : Op->op_values()) {
26884 assert((!V.getValueType().isVector() ||
26885 V.getValueType().isScalableVector()) &&
26886 "Only scalable vectors are supported!");
26887 Operands.push_back(V);
26888 }
26889
26890 if (isMergePassthruOpcode(NewOp))
26891 Operands.push_back(DAG.getUNDEF(VT));
26892
26893 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
26894}
26895
26896// If a fixed length vector operation has no side effects when applied to
26897// undefined elements, we can safely use scalable vectors to perform the same
26898// operation without needing to worry about predication.
26899SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
26900 SelectionDAG &DAG) const {
26901 EVT VT = Op.getValueType();
26903 "Only expected to lower fixed length vector operation!");
26904 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26905
26906 // Create list of operands by converting existing ones to scalable types.
26908 for (const SDValue &V : Op->op_values()) {
26909 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
26910
26911 // Pass through non-vector operands.
26912 if (!V.getValueType().isVector()) {
26913 Ops.push_back(V);
26914 continue;
26915 }
26916
26917 // "cast" fixed length vector to a scalable vector.
26918 assert(V.getValueType().isFixedLengthVector() &&
26919 isTypeLegal(V.getValueType()) &&
26920 "Only fixed length vectors are supported!");
26921 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
26922 }
26923
26924 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
26925 return convertFromScalableVector(DAG, VT, ScalableRes);
26926}
26927
26928SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
26929 SelectionDAG &DAG) const {
26930 SDLoc DL(ScalarOp);
26931 SDValue AccOp = ScalarOp.getOperand(0);
26932 SDValue VecOp = ScalarOp.getOperand(1);
26933 EVT SrcVT = VecOp.getValueType();
26934 EVT ResVT = SrcVT.getVectorElementType();
26935
26936 EVT ContainerVT = SrcVT;
26937 if (SrcVT.isFixedLengthVector()) {
26938 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
26939 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
26940 }
26941
26942 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
26943 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26944
26945 // Convert operands to Scalable.
26946 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
26947 DAG.getUNDEF(ContainerVT), AccOp, Zero);
26948
26949 // Perform reduction.
26950 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
26951 Pg, AccOp, VecOp);
26952
26953 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
26954}
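// Descriptive note on the sequence above: the strictly-ordered FP reduction
// maps onto FADDA. The scalar accumulator is placed in lane 0 of an otherwise
// undef vector, the predicated FADDA accumulates the active vector lanes in
// order, and the scalar result is read back from lane 0.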
26955
26956SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
26957 SelectionDAG &DAG) const {
26958 SDLoc DL(ReduceOp);
26959 SDValue Op = ReduceOp.getOperand(0);
26960 EVT OpVT = Op.getValueType();
26961 EVT VT = ReduceOp.getValueType();
26962
26963 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
26964 return SDValue();
26965
26966 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
26967
26968 switch (ReduceOp.getOpcode()) {
26969 default:
26970 return SDValue();
26971 case ISD::VECREDUCE_OR:
26972 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
26973 // The predicate can be 'Op' because
26974 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
26975 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
26976 else
26977 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
26978 case ISD::VECREDUCE_AND: {
26979 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
26980 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
26981 }
26982 case ISD::VECREDUCE_XOR: {
26983 SDValue ID =
26984 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
26985 if (OpVT == MVT::nxv1i1) {
26986 // Emulate a CNTP on .Q using .D and a different governing predicate.
26987 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
26988 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
26989 }
26990 SDValue Cntp =
26991 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
26992 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
26993 }
26994 }
26995
26996 return SDValue();
26997}
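// Descriptive note on the cases above: i1 OR-reductions become a PTEST for
// "any lane active", AND-reductions invert the operand (XOR with an all-true
// predicate) and test for "no lane active", and XOR-reductions count the
// active lanes with CNTP, the truncation to an i1 result keeping only the
// parity of that count.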
26998
26999SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
27000 SDValue ScalarOp,
27001 SelectionDAG &DAG) const {
27002 SDLoc DL(ScalarOp);
27003 SDValue VecOp = ScalarOp.getOperand(0);
27004 EVT SrcVT = VecOp.getValueType();
27005
27007 SrcVT,
27008 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
27009 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
27010 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
27011 }
27012
27013 // UADDV always returns an i64 result.
27014 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
27015 SrcVT.getVectorElementType();
27016 EVT RdxVT = SrcVT;
27017 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
27018 RdxVT = getPackedSVEVectorVT(ResVT);
27019
27020 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
27021 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
27022 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
27023 Rdx, DAG.getConstant(0, DL, MVT::i64));
27024
27025 // The VEC_REDUCE nodes expect an element-sized result.
27026 if (ResVT != ScalarOp.getValueType())
27027 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
27028
27029 return Res;
27030}
27031
27032SDValue
27033AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
27034 SelectionDAG &DAG) const {
27035 EVT VT = Op.getValueType();
27036 SDLoc DL(Op);
27037
27038 EVT InVT = Op.getOperand(1).getValueType();
27039 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27040 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
27041 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
27042
27043 // Convert the mask to a predicate (NOTE: We don't need to worry about
27044 // inactive lanes since VSELECT is safe when given undefined elements).
27045 EVT MaskVT = Op.getOperand(0).getValueType();
27046 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
27047 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
27049 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
27050
27051 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
27052 Mask, Op1, Op2);
27053
27054 return convertFromScalableVector(DAG, VT, ScalableRes);
27055}
27056
27057SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
27058 SDValue Op, SelectionDAG &DAG) const {
27059 SDLoc DL(Op);
27060 EVT InVT = Op.getOperand(0).getValueType();
27061 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27062
27063 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
27064 "Only expected to lower fixed length vector operation!");
27065 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
27066 "Expected integer result of the same bit length as the inputs!");
27067
27068 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
27069 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
27070 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
27071
27072 EVT CmpVT = Pg.getValueType();
27073 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
27074 {Pg, Op1, Op2, Op.getOperand(2)});
27075
27076 EVT PromoteVT = ContainerVT.changeTypeToInteger();
27077 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
27078 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
27079}
27080
27081SDValue
27082AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
27083 SelectionDAG &DAG) const {
27084 SDLoc DL(Op);
27085 auto SrcOp = Op.getOperand(0);
27086 EVT VT = Op.getValueType();
27087 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27088 EVT ContainerSrcVT =
27089 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
27090
27091 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
27092 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
27093 return convertFromScalableVector(DAG, VT, Op);
27094}
27095
27096SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
27097 SDValue Op, SelectionDAG &DAG) const {
27098 SDLoc DL(Op);
27099 unsigned NumOperands = Op->getNumOperands();
27100
27101 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
27102 "Unexpected number of operands in CONCAT_VECTORS");
27103
27104 auto SrcOp1 = Op.getOperand(0);
27105 auto SrcOp2 = Op.getOperand(1);
27106 EVT VT = Op.getValueType();
27107 EVT SrcVT = SrcOp1.getValueType();
27108
27109 if (NumOperands > 2) {
27111 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
27112 for (unsigned I = 0; I < NumOperands; I += 2)
27113 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
27114 Op->getOperand(I), Op->getOperand(I + 1)));
27115
27116 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
27117 }
27118
27119 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27120
27122 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
27123 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
27124
27125 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
27126
27127 return convertFromScalableVector(DAG, VT, Op);
27128}
27129
27130SDValue
27131AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
27132 SelectionDAG &DAG) const {
27133 EVT VT = Op.getValueType();
27134 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27135
27136 SDLoc DL(Op);
27137 SDValue Val = Op.getOperand(0);
27138 SDValue Pg = getPredicateForVector(DAG, DL, VT);
27139 EVT SrcVT = Val.getValueType();
27140 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27141 EVT ExtendVT = ContainerVT.changeVectorElementType(
27142 SrcVT.getVectorElementType());
27143
27144 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27145 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
27146
27147 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
27148 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
27149 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
27150 Pg, Val, DAG.getUNDEF(ContainerVT));
27151
27152 return convertFromScalableVector(DAG, VT, Val);
27153}
27154
27155SDValue
27156AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
27157 SelectionDAG &DAG) const {
27158 EVT VT = Op.getValueType();
27159 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27160
27161 SDLoc DL(Op);
27162 SDValue Val = Op.getOperand(0);
27163 EVT SrcVT = Val.getValueType();
27164 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27165 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
27167 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
27168
27169 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27170 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
27171 Op.getOperand(1), DAG.getUNDEF(RoundVT));
27172 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
27173 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27174
27175 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27176 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27177}
27178
27179SDValue
27180AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
27181 SelectionDAG &DAG) const {
27182 EVT VT = Op.getValueType();
27183 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27184
27185 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
27186 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
27188
27189 SDLoc DL(Op);
27190 SDValue Val = Op.getOperand(0);
27191 EVT SrcVT = Val.getValueType();
27192 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27193 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27194
27195 if (VT.bitsGE(SrcVT)) {
27197
27198 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
27199 VT.changeTypeToInteger(), Val);
27200
27201 // Safe to use a larger than specified operand because by promoting the
27202 // value nothing has changed from an arithmetic point of view.
27203 Val =
27204 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
27205 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27206 DAG.getUNDEF(ContainerDstVT));
27207 return convertFromScalableVector(DAG, VT, Val);
27208 } else {
27209 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
27210 ContainerDstVT.getVectorElementType());
27212
27213 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27214 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27215 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
27216 Val = convertFromScalableVector(DAG, SrcVT, Val);
27217
27218 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27219 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27220 }
27221}
27222
27223SDValue
27224AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
27225 SelectionDAG &DAG) const {
27226 SDLoc DL(Op);
27227 EVT OpVT = Op.getValueType();
27228 assert(OpVT.isScalableVector() &&
27229 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
27230 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
27231 Op.getOperand(1));
27232 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
27233 Op.getOperand(1));
27234 return DAG.getMergeValues({Even, Odd}, DL);
27235}
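// e.g. (illustrative) for the deinterleave above: the data {a0,b0,a1,b1,...}
// held across the two operands yields UZP1 for the even elements {a0,a1,...}
// and UZP2 for the odd elements {b0,b1,...}.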
27236
27237SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
27238 SelectionDAG &DAG) const {
27239 SDLoc DL(Op);
27240 EVT OpVT = Op.getValueType();
27241 assert(OpVT.isScalableVector() &&
27242 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
27243
27244 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
27245 Op.getOperand(1));
27246 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
27247 Op.getOperand(1));
27248 return DAG.getMergeValues({Lo, Hi}, DL);
27249}
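// e.g. (illustrative) for the interleave above: {a0,a1,...} and {b0,b1,...}
// use ZIP1 for the low half {a0,b0,a1,b1,...} and ZIP2 for the high half.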
27250
27251SDValue
27252AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
27253 SelectionDAG &DAG) const {
27254 EVT VT = Op.getValueType();
27255 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27256
27257 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
27258 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
27260
27261 SDLoc DL(Op);
27262 SDValue Val = Op.getOperand(0);
27263 EVT SrcVT = Val.getValueType();
27264 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27265 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27266
27267 if (VT.bitsGT(SrcVT)) {
27268 EVT CvtVT = ContainerDstVT.changeVectorElementType(
27269 ContainerSrcVT.getVectorElementType());
27271
27272 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27273 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
27274
27275 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
27276 Val = getSVESafeBitCast(CvtVT, Val, DAG);
27277 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27278 DAG.getUNDEF(ContainerDstVT));
27279 return convertFromScalableVector(DAG, VT, Val);
27280 } else {
27281 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
27283
27284 // Safe to use a larger than specified result since an fp_to_int where the
27285 // result doesn't fit into the destination is undefined.
27286 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27287 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27288 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27289
27290 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
27291 }
27292}
27293
27295 ArrayRef<int> ShuffleMask, EVT VT,
27296 EVT ContainerVT, SelectionDAG &DAG) {
27297 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
27298 SDLoc DL(Op);
27299 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
27300 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
27301 bool IsSingleOp =
27302 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
27303
27304 if (!Subtarget.isNeonAvailable() && !MinSVESize)
27305 MinSVESize = 128;
27306
27307 // Bail out on the two-operand case if SVE2 is unavailable or not all index
27308 // values can be represented.
27309 if (!IsSingleOp && !Subtarget.hasSVE2())
27310 return SDValue();
27311
27312 EVT VTOp1 = Op.getOperand(0).getValueType();
27313 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
27314 unsigned IndexLen = MinSVESize / BitsPerElt;
27315 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
27316 uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
27317 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
27318 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
27319 bool MinMaxEqual = (MinSVESize == MaxSVESize);
27320 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
27321 "Incorrectly legalised shuffle operation");
27322
27324 // If MinSVESize is not equal to MaxSVESize then we need to know which
27325 // TBL mask element needs adjustment.
27326 SmallVector<SDValue, 8> AddRuntimeVLMask;
27327
27328 // Bail out for 8-bit element types, because with a 2048-bit SVE register
27329 // size, 8 bits are only sufficient to index into the first source vector.
27330 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
27331 return SDValue();
27332
27333 for (int Index : ShuffleMask) {
27334 // Handle poison index values.
27335 if (Index < 0)
27336 Index = 0;
27337 // If the mask refers to elements in the second operand, then we have to
27338 // offset the index by the number of elements in a vector. If this number
27339 // is not known at compile-time, we need to maintain a mask with 'VL' values
27340 // to add at runtime.
27341 if ((unsigned)Index >= ElementsPerVectorReg) {
27342 if (MinMaxEqual) {
27343 Index += IndexLen - ElementsPerVectorReg;
27344 } else {
27345 Index = Index - ElementsPerVectorReg;
27346 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
27347 }
27348 } else if (!MinMaxEqual)
27349 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27350 // For 8-bit elements with 1024-bit SVE registers, where MaxOffset equals
27351 // 255, this might point to the last element in the second operand
27352 // of the shufflevector, so we reject this transform.
27353 if ((unsigned)Index >= MaxOffset)
27354 return SDValue();
27355 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
27356 }
27357
27358 // Choosing an out-of-range index leads to the lane being zeroed vs zero
27359 // value where it would perform first lane duplication for out-of-range
27360 // elements. For i8 elements an out-of-range index could still be valid
27361 // for a 2048-bit vector register size.
27362 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
27363 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
27364 if (!MinMaxEqual)
27365 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27366 }
27367
27368 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
27369 SDValue VecMask =
27370 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27371 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
27372
27373 SDValue Shuffle;
27374 if (IsSingleOp)
27375 Shuffle =
27376 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27377 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
27378 Op1, SVEMask);
27379 else if (Subtarget.hasSVE2()) {
27380 if (!MinMaxEqual) {
27381 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
27382 SDValue VScale = (BitsPerElt == 64)
27383 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
27384 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
27385 SDValue VecMask =
27386 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27387 SDValue MulByMask = DAG.getNode(
27388 ISD::MUL, DL, MaskType,
27389 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
27390 DAG.getBuildVector(MaskType, DL,
27391 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
27392 SDValue UpdatedVecMask =
27393 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
27394 SVEMask = convertToScalableVector(
27395 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
27396 }
27397 Shuffle =
27398 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27399 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
27400 Op1, Op2, SVEMask);
27401 }
27402 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
27403 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
27404}
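// Summary of the routine above (descriptive note): single-source shuffles are
// emitted as one aarch64_sve_tbl; two-source shuffles need SVE2's
// aarch64_sve_tbl2, with indices into the second operand offset by the
// register's element count, which is folded into the mask at compile time
// when MinSVESize equals MaxSVESize and otherwise added at runtime via a
// VSCALE-based mask.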
27405
27406SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
27407 SDValue Op, SelectionDAG &DAG) const {
27408 EVT VT = Op.getValueType();
27409 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27410
27411 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
27412 auto ShuffleMask = SVN->getMask();
27413
27414 SDLoc DL(Op);
27415 SDValue Op1 = Op.getOperand(0);
27416 SDValue Op2 = Op.getOperand(1);
27417
27418 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27419 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
27420 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
27421
27422 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
27423 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
27424 return MVT::i32;
27425 return ScalarTy;
27426 };
27427
27428 if (SVN->isSplat()) {
27429 unsigned Lane = std::max(0, SVN->getSplatIndex());
27430 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27431 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27432 DAG.getConstant(Lane, DL, MVT::i64));
27433 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
27434 return convertFromScalableVector(DAG, VT, Op);
27435 }
27436
27437 bool ReverseEXT = false;
27438 unsigned Imm;
27439 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
27440 Imm == VT.getVectorNumElements() - 1) {
27441 if (ReverseEXT)
27442 std::swap(Op1, Op2);
27443 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27444 SDValue Scalar = DAG.getNode(
27445 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27446 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
27447 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
27448 return convertFromScalableVector(DAG, VT, Op);
27449 }
27450
27451 for (unsigned LaneSize : {64U, 32U, 16U}) {
27452 if (isREVMask(ShuffleMask, VT, LaneSize)) {
27453 EVT NewVT =
27455 unsigned RevOp;
27456 unsigned EltSz = VT.getScalarSizeInBits();
27457 if (EltSz == 8)
27459 else if (EltSz == 16)
27461 else
27463
27464 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27465 Op = LowerToPredicatedOp(Op, DAG, RevOp);
27466 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27467 return convertFromScalableVector(DAG, VT, Op);
27468 }
27469 }
27470
27471 if (Subtarget->hasSVE2p1() && VT.getScalarSizeInBits() == 64 &&
27472 isREVMask(ShuffleMask, VT, 128)) {
27473 if (!VT.isFloatingPoint())
27474 return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27475
27477 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27478 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27479 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27480 return convertFromScalableVector(DAG, VT, Op);
27481 }
27482
27483 unsigned WhichResult;
27484 if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
27486 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
27487
27488 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
27489 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27491 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
27492 }
27493
27494 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
27496 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
27497
27498 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
27499 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27501 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
27502 }
27503
27504 // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
27505 // represents the same logical operation as performed by a ZIP instruction. In
27506 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
27507 // equivalent to an AArch64 instruction. There's the extra component of
27508 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
27509 // only operated on 64/128bit vector types that have a direct mapping to a
27510 // target register and so an exact mapping is implied.
27511 // However, when using SVE for fixed length vectors, most legal vector types
27512 // are actually sub-vectors of a larger SVE register. When mapping
27513 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
27514 // how the mask's indices translate. Specifically, when the mapping requires
27515 // an exact meaning for a specific vector index (e.g. Index X is the last
27516 // vector element in the register) then such mappings are often only safe when
27517 // the exact SVE register size is known. The main exception to this is when
27518 // indices are logically relative to the first element of either
27519 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
27520 // when converting from fixed-length to scalable vector types (i.e. the start
27521 // of a fixed length vector is always the start of a scalable vector).
27522 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
27523 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
27524 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
27525 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
27526 Op2.isUndef()) {
27527 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
27528 return convertFromScalableVector(DAG, VT, Op);
27529 }
27530
27531 if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
27533 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
27534
27535 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
27536 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27538 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
27539 }
27540
27541 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
27543 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
27544
27545 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
27546 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27548 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
27549 }
27550 }
27551
27552 // Avoid producing a TBL instruction if we don't know the minimum SVE register
27553 // size, unless NEON is not available and we can assume the minimum SVE
27554 // register size is 128 bits.
27555 if (MinSVESize || !Subtarget->isNeonAvailable())
27556 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
27557 DAG);
27558
27559 return SDValue();
27560}
27561
27562SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
27563 SelectionDAG &DAG) const {
27564 SDLoc DL(Op);
27565 EVT InVT = Op.getValueType();
27566
27567 assert(VT.isScalableVector() && isTypeLegal(VT) &&
27568 InVT.isScalableVector() && isTypeLegal(InVT) &&
27569 "Only expect to cast between legal scalable vector types!");
27570 assert(VT.getVectorElementType() != MVT::i1 &&
27571 InVT.getVectorElementType() != MVT::i1 &&
27572 "For predicate bitcasts, use getSVEPredicateBitCast");
27573
27574 if (InVT == VT)
27575 return Op;
27576
27578 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
27579
27580 // Safe bitcasting between unpacked vector types of different element counts
27581 // is currently unsupported because the following is missing the necessary
27582 // work to ensure the result's elements live where they're supposed to within
27583 // an SVE register.
27584 // 01234567
27585 // e.g. nxv2i32 = XX??XX??
27586 // nxv4f16 = X?X?X?X?
27588 VT == PackedVT || InVT == PackedInVT) &&
27589 "Unexpected bitcast!");
27590
27591 // Pack input if required.
27592 if (InVT != PackedInVT)
27593 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
27594
27595 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
27596
27597 // Unpack result if required.
27598 if (VT != PackedVT)
27600
27601 return Op;
27602}
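// Descriptive note on the cast above: an unpacked input is first repacked
// with REINTERPRET_CAST, the bitcast itself is performed between packed
// types, and the result is unpacked again if the requested type is unpacked,
// so element data stays in the lanes the unpacked layout expects.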
27603
27605 SDValue N) const {
27606 return ::isAllActivePredicate(DAG, N);
27607}
27608
27610 return ::getPromotedVTForPredicate(VT);
27611}
27612
27613bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
27614 SDValue Op, const APInt &OriginalDemandedBits,
27615 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
27616 unsigned Depth) const {
27617
27618 unsigned Opc = Op.getOpcode();
27619 switch (Opc) {
27620 case AArch64ISD::VSHL: {
27621 // Match (VSHL (VLSHR Val X) X)
27622 SDValue ShiftL = Op;
27623 SDValue ShiftR = Op->getOperand(0);
27624 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
27625 return false;
27626
27627 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
27628 return false;
27629
27630 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
27631 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
27632
27633 // Other cases can be handled as well, but this is not
27634 // implemented.
27635 if (ShiftRBits != ShiftLBits)
27636 return false;
27637
27638 unsigned ScalarSize = Op.getScalarValueSizeInBits();
27639 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
27640
27641 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
27642 APInt UnusedBits = ~OriginalDemandedBits;
27643
27644 if ((ZeroBits & UnusedBits) != ZeroBits)
27645 return false;
27646
27647 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
27648 // used - simplify to just Val.
27649 return TLO.CombineTo(Op, ShiftR->getOperand(0));
27650 }
27651 case AArch64ISD::BICi: {
27652 // Fold BICi if all destination bits are already known to be zeroed.
27653 SDValue Op0 = Op.getOperand(0);
27654 KnownBits KnownOp0 =
27655 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
27656 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
27657 uint64_t BitsToClear = Op->getConstantOperandVal(1)
27658 << Op->getConstantOperandVal(2);
27659 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
27660 if (APInt(Known.getBitWidth(), BitsToClear)
27661 .isSubsetOf(AlreadyZeroedBitsToClear))
27662 return TLO.CombineTo(Op, Op0);
27663
27664 Known = KnownOp0 &
27665 KnownBits::makeConstant(APInt(Known.getBitWidth(), ~BitsToClear));
27666
27667 return false;
27668 }
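// Annotation (not from the upstream source): the BICi case above clears the
// bits (Imm1 << Imm2) in its first operand. If those bits are already known to
// be zero, the node folds to Op0; otherwise the result's known bits are
// computed as those of Op0 ANDed with the complement of the cleared mask.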
27669 case ISD::INTRINSIC_WO_CHAIN: {
27670 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
27671 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
27672 if (!MaxSVEVectorSizeInBits)
27673 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
27674 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
27675 // The SVE count intrinsics don't support the multiplier immediate so we
27676 // don't have to account for that here. The value returned may be slightly
27677 // over the true required bits, as this is based on the "ALL" pattern. The
27678 // other patterns are also exposed by these intrinsics, but they all
27679 // return a value that's strictly less than "ALL".
27680 unsigned RequiredBits = llvm::bit_width(MaxElements);
27681 unsigned BitWidth = Known.Zero.getBitWidth();
27682 if (RequiredBits < BitWidth)
27683 Known.Zero.setHighBits(BitWidth - RequiredBits);
27684 return false;
27685 }
27686 }
27687 }
27688
27689 return TargetLowering::SimplifyDemandedBitsForTargetNode(
27690 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
27691}
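// Annotation (not from the upstream source): for the SVE element-count
// intrinsics handled just above, the result is bounded by the maximum SVE
// vector width divided by the element size, so all bits above
// bit_width(MaxElements) are known zero. As a worked example (assuming a
// 2048-bit maximum vector and 8-bit elements): MaxElements = 256,
// bit_width(256) = 9, so bits 9..63 of an i64 result are known to be zero.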
27692
27693bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
27694 return Op.getOpcode() == AArch64ISD::DUP ||
27695 Op.getOpcode() == AArch64ISD::MOVI ||
27696 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
27697 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
27698 TargetLowering::isTargetCanonicalConstantNode(Op);
27699}
27700
27701bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
27702 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
27703 Subtarget->hasComplxNum();
27704}
27705
27706bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
27707 ComplexDeinterleavingOperation Operation, Type *Ty) const {
27708 auto *VTy = dyn_cast<VectorType>(Ty);
27709 if (!VTy)
27710 return false;
27711
27712 // If the vector is scalable, SVE is enabled, implying support for complex
27713 // numbers. Otherwise, we need to ensure complex number support is available
27714 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
27715 return false;
27716
27717 auto *ScalarTy = VTy->getScalarType();
27718 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
27719
27720 // We can only process vectors that have a bit size of 128 or higher (with an
27721 // additional 64 bits for Neon). Additionally, these vectors must have a
27722 // power-of-2 size, as we later split them into the smallest supported size
27723 // and merge them back together after applying the complex operation.
27724 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
27725 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
27726 !llvm::isPowerOf2_32(VTyWidth))
27727 return false;
27728
27729 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
27730 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
27731 return 8 <= ScalarWidth && ScalarWidth <= 64;
27732 }
27733
27734 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
27735 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
27736}
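// Annotation (not from the upstream source): in summary, complex
// deinterleaving is supported here for vectors of at least 128 bits (or
// exactly 64 bits on NEON) with a power-of-2 total width; integer elements of
// 8-64 bits additionally require SVE2 and a scalable type, while
// floating-point elements may be f16 (with +fullfp16), f32 or f64.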
27737
27738Value *AArch64TargetLowering::createComplexDeinterleavingIR(
27739 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
27740 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
27741 Value *Accumulator) const {
27742 VectorType *Ty = cast<VectorType>(InputA->getType());
27743 bool IsScalable = Ty->isScalableTy();
27744 bool IsInt = Ty->getElementType()->isIntegerTy();
27745
27746 unsigned TyWidth =
27747 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
27748
27749 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
27750 "Vector type must be either 64 or a power of 2 that is at least 128");
27751
27752 if (TyWidth > 128) {
27753 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
27754 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
27755 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
27756 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
27757 auto *UpperSplitA =
27758 B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
27759 auto *UpperSplitB =
27760 B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
27761 Value *LowerSplitAcc = nullptr;
27762 Value *UpperSplitAcc = nullptr;
27763 if (Accumulator) {
27764 LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
27765 UpperSplitAcc =
27766 B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
27767 }
27768 auto *LowerSplitInt = createComplexDeinterleavingIR(
27769 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
27770 auto *UpperSplitInt = createComplexDeinterleavingIR(
27771 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
27772
27773 auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
27774 B.getInt64(0));
27775 return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
27776 }
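// Annotation (not from the upstream source): the block above handles vectors
// wider than 128 bits by splitting each operand (and the optional accumulator)
// into a low and a high half, recursing on each half, and reassembling the two
// partial results with insertvector.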
27777
27778 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
27779 if (Accumulator == nullptr)
27780 Accumulator = Constant::getNullValue(Ty);
27781
27782 if (IsScalable) {
27783 if (IsInt)
27784 return B.CreateIntrinsic(
27785 Intrinsic::aarch64_sve_cmla_x, Ty,
27786 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27787
27788 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
27789 return B.CreateIntrinsic(
27790 Intrinsic::aarch64_sve_fcmla, Ty,
27791 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27792 }
27793
27794 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
27795 Intrinsic::aarch64_neon_vcmla_rot90,
27796 Intrinsic::aarch64_neon_vcmla_rot180,
27797 Intrinsic::aarch64_neon_vcmla_rot270};
27798
27799
27800 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
27801 {Accumulator, InputA, InputB});
27802 }
27803
27804 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
27805 if (IsScalable) {
27806 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
27807 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
27808 if (IsInt)
27809 return B.CreateIntrinsic(
27810 Intrinsic::aarch64_sve_cadd_x, Ty,
27811 {InputA, InputB, B.getInt32((int)Rotation * 90)});
27812
27813 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
27814 return B.CreateIntrinsic(
27815 Intrinsic::aarch64_sve_fcadd, Ty,
27816 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
27817 }
27818 return nullptr;
27819 }
27820
27821 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
27822 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
27823 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
27824 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
27825 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
27826
27827 if (IntId == Intrinsic::not_intrinsic)
27828 return nullptr;
27829
27830 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
27831 }
27832
27833 return nullptr;
27834}
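// Annotation (not from the upstream source): Rotation is encoded as an enum
// value 0-3, and (int)Rotation * 90 converts it into the 0/90/180/270 degree
// immediate expected by the SVE cmla_x/fcmla/cadd_x/fcadd intrinsics; the NEON
// path instead selects a dedicated vcmla_rot*/vcadd_rot* intrinsic per
// rotation. A partial complex multiply with no accumulator appears to start
// from a zero accumulator, and unsupported combinations return nullptr,
// presumably leaving the pattern to the default handling.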
27835
27836bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
27837 unsigned Opc = N->getOpcode();
27838 if (ISD::isExtOpcode(Opc)) {
27839 if (any_of(N->uses(),
27840 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
27841 return false;
27842 }
27843 return true;
27844}
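// Annotation (not from the upstream source): preferScalarizeSplat returns
// false for a splatted extend that feeds a multiply, presumably so the extend
// stays in vector form and can combine into a widening multiply; for all other
// nodes, scalarizing the splatted operation is preferred.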
27845
27846unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
27847 return Subtarget->getMinimumJumpTableEntries();
27848}
27849
27850MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
27851 CallingConv::ID CC,
27852 EVT VT) const {
27853 bool NonUnitFixedLengthVector =
27854 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
27855 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27856 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
27857
27858 EVT VT1;
27859 MVT RegisterVT;
27860 unsigned NumIntermediates;
27861 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
27862 RegisterVT);
27863 return RegisterVT;
27864}
27865
27866unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
27867 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
27868 bool NonUnitFixedLengthVector =
27869 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
27870 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27871 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
27872
27873 EVT VT1;
27874 MVT VT2;
27875 unsigned NumIntermediates;
27876 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
27877 NumIntermediates, VT2);
27878}
27879
27880unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
27881 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
27882 unsigned &NumIntermediates, MVT &RegisterVT) const {
27883 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
27884 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
27885 if (!RegisterVT.isFixedLengthVector() ||
27886 RegisterVT.getFixedSizeInBits() <= 128)
27887 return NumRegs;
27888
27889 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
27890 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
27891 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
27892
27893 // A size mismatch here implies either type promotion or widening and would
27894 // have resulted in scalarisation if larger vectors had not been available.
27895 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
27896 EVT EltTy = VT.getVectorElementType();
27897 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
27898 if (!isTypeLegal(NewVT))
27899 NewVT = EltTy;
27900
27901 IntermediateVT = NewVT;
27902 NumIntermediates = VT.getVectorNumElements();
27903 RegisterVT = getRegisterType(Context, NewVT);
27904 return NumIntermediates;
27905 }
27906
27907 // SVE VLS support does not introduce a new ABI so we should use NEON sized
27908 // types for vector arguments and returns.
27909
27910 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
27911 NumIntermediates *= NumSubRegs;
27912 NumRegs *= NumSubRegs;
27913
27914 switch (RegisterVT.getVectorElementType().SimpleTy) {
27915 default:
27916 llvm_unreachable("unexpected element type for vector");
27917 case MVT::i8:
27918 IntermediateVT = RegisterVT = MVT::v16i8;
27919 break;
27920 case MVT::i16:
27921 IntermediateVT = RegisterVT = MVT::v8i16;
27922 break;
27923 case MVT::i32:
27924 IntermediateVT = RegisterVT = MVT::v4i32;
27925 break;
27926 case MVT::i64:
27927 IntermediateVT = RegisterVT = MVT::v2i64;
27928 break;
27929 case MVT::f16:
27930 IntermediateVT = RegisterVT = MVT::v8f16;
27931 break;
27932 case MVT::f32:
27933 IntermediateVT = RegisterVT = MVT::v4f32;
27934 break;
27935 case MVT::f64:
27936 IntermediateVT = RegisterVT = MVT::v2f64;
27937 break;
27938 case MVT::bf16:
27939 IntermediateVT = RegisterVT = MVT::v8bf16;
27940 break;
27941 }
27942
27943 return NumRegs;
27944}
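// Annotation (not from the upstream source): when SVE is used for fixed-length
// vectors, the generic breakdown above can return a single register type wider
// than 128 bits; the code then re-splits it into NEON-sized pieces so the
// calling convention is unchanged. As an illustrative (hypothetical) case, a
// v16i32 argument that the generic code would place in one 512-bit register is
// instead described as four v4i32 registers.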
27945
27946bool AArch64TargetLowering::hasInlineStackProbe(
27947 const MachineFunction &MF) const {
27948 return !Subtarget->isTargetWindows() &&
27949 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
27950}
27951
27952#ifndef NDEBUG
27953void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
27954 switch (N->getOpcode()) {
27955 default:
27956 break;
27957 case AArch64ISD::SUNPKLO:
27958 case AArch64ISD::SUNPKHI:
27959 case AArch64ISD::UUNPKLO:
27960 case AArch64ISD::UUNPKHI: {
27961 assert(N->getNumValues() == 1 && "Expected one result!");
27962 assert(N->getNumOperands() == 1 && "Expected one operand!");
27963 EVT VT = N->getValueType(0);
27964 EVT OpVT = N->getOperand(0).getValueType();
27965 assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
27966 VT.isInteger() && "Expected integer vectors!");
27967 assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
27968 "Expected vectors of equal size!");
27969 // TODO: Enable assert once bogus creations have been fixed.
27970 // assert(OpVT.getVectorElementCount() == VT.getVectorElementCount()*2 &&
27971 // "Expected result vector with half the lanes of its input!");
27972 break;
27973 }
27974 case AArch64ISD::TRN1:
27975 case AArch64ISD::TRN2:
27976 case AArch64ISD::UZP1:
27977 case AArch64ISD::UZP2:
27978 case AArch64ISD::ZIP1:
27979 case AArch64ISD::ZIP2: {
27980 assert(N->getNumValues() == 1 && "Expected one result!");
27981 assert(N->getNumOperands() == 2 && "Expected two operands!");
27982 EVT VT = N->getValueType(0);
27983 EVT Op0VT = N->getOperand(0).getValueType();
27984 EVT Op1VT = N->getOperand(1).getValueType();
27985 assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
27986 "Expected vectors!");
27987 // TODO: Enable assert once bogus creations have been fixed.
27988 // assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
27989 break;
27990 }
27991 }
27992}
27993#endif
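// Annotation (not from the upstream source): verifyTargetSDNode is only built
// with assertions enabled; it sanity-checks the unpack nodes (one integer
// vector operand and result of equal total size) and the permute nodes
// TRN/UZP/ZIP (one result, two vector operands). Stricter element-count and
// type checks remain commented out above until known bogus node creations are
// fixed.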
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI, LoadInst *LI) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Value * getIRStackGuard(IRBuilderBase &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached to strict FP calls.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II, StoreInst *SI) const override
Lower an interleave intrinsic to a target specific store intrinsic.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, SDValue PStateSM=SDValue()) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
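The entries above are overrides of generic TargetLowering hooks. As a minimal, hypothetical sketch of what implementing one such hook can look like (the class name and the 12-bit immediate range are illustrative assumptions, not the actual AArch64 policy):

#include "llvm/CodeGen/TargetLowering.h"

// Hypothetical target: report which immediates its integer compare
// instructions can encode directly, so the immediate does not have to be
// materialized into a register first.
class IllustrativeTargetLowering : public llvm::TargetLowering {
public:
  using llvm::TargetLowering::TargetLowering;

  bool isLegalICmpImmediate(int64_t Immed) const override {
    // Assume a 12-bit unsigned immediate field, purely for illustration.
    return Immed >= 0 && Immed <= 0xfff;
  }
};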
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:427
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:207
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
static void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition: APInt.cpp:1860
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:349
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:187
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:307
APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1898
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1144
APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1905
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1589
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:197
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1703
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
bool isMask(unsigned numBits) const
Definition: APInt.h:466
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:312
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1235
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:319
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1215
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:367
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
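A minimal sketch of how the APInt helpers listed above compose; only llvm/ADT/APInt.h is assumed, and the concrete values are chosen purely for illustration:

#include "llvm/ADT/APInt.h"
#include <cassert>

void apintSketch() {
  using llvm::APInt;
  // Low 8 bits set in a 32-bit value: 0x000000ff.
  APInt Mask = APInt::getLowBitsSet(/*numBits=*/32, /*loBitsSet=*/8);
  assert(Mask.isMask(8) && !Mask.isPowerOf2());
  // Zero-extend to 64 bits; the numeric value is unchanged.
  APInt Wide = Mask.zext(64);
  assert(Wide.getBitWidth() == 64 && Wide.getZExtValue() == 0xff);
  // The sign mask is a single (negative) power of two.
  APInt SignBit = APInt::getSignMask(32);
  assert(SignBit.isSignBitSet() && SignBit.isNegative() && SignBit.isPowerOf2());
  // Every bit of Mask is also set in the all-ones value.
  assert(Mask.isSubsetOf(APInt::getAllOnes(32)));
}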
an instruction to allocate memory on the stack
Definition: Instructions.h:59
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:539
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ And
*p = old & v
Definition: Instructions.h:768
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
bool isFloatingPointOperation() const
Definition: Instructions.h:922
BinOp getOperation() const
Definition: Instructions.h:845
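The BinOp values above are what atomic-expansion hooks typically switch over; a hedged sketch of that pattern (the classification itself is illustrative, not the AArch64 expansion rule):

#include "llvm/IR/Instructions.h"

// Illustrative helper: is this RMW one of the integer min/max forms?
static bool isIntegerMinMaxRMW(const llvm::AtomicRMWInst &RMW) {
  if (RMW.isFloatingPointOperation())
    return false;
  switch (RMW.getOperation()) {
  case llvm::AtomicRMWInst::Min:
  case llvm::AtomicRMWInst::Max:
  case llvm::AtomicRMWInst::UMin:
  case llvm::AtomicRMWInst::UMax:
    return true;
  default:
    return false;
  }
}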
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:93
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
A "pseudo-class" with methods for operating on BUILD_VECTORs.
ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power or 2,...
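A short sketch of querying a BUILD_VECTOR node with isConstantSplat as declared above (the helper name and the 64-bit limit are illustrative assumptions; the surrounding DAG context is not shown):

#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"

// Returns true and fills SplatBits if BV splats a constant whose smallest
// repeating element is at most 64 bits wide.
static bool getSplatBits(const llvm::BuildVectorSDNode *BV,
                         llvm::APInt &SplatBits) {
  llvm::APInt SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  return BV->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                             HasAnyUndefs) &&
         SplatBitSize <= 64;
}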
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
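CCState fills a list of CCValAssign records that lowering code then walks; a minimal sketch of that loop shape, assuming ArgLocs was already populated by an Analyze*-style call (the counting itself is purely illustrative):

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"

// Count how many values were assigned to registers versus stack slots.
static void countArgLocations(
    const llvm::SmallVectorImpl<llvm::CCValAssign> &ArgLocs,
    unsigned &NumRegs, unsigned &NumMemSlots) {
  NumRegs = NumMemSlots = 0;
  for (const llvm::CCValAssign &VA : ArgLocs) {
    if (VA.isRegLoc())
      ++NumRegs;        // value lives in VA.getLocReg()
    else if (VA.isMemLoc())
      ++NumMemSlots;    // value lives at VA.getLocMemOffset() on the stack
  }
}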
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
unsigned arg_size() const
Definition: InstrTypes.h:1685
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1871
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:205
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:145
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:238
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:299
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:307
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:168
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:685
bool empty() const
Definition: Function.h:809
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:202
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:682
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:264
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1907
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:340
arg_iterator arg_end()
Definition: Function.h:827
arg_iterator arg_begin()
Definition: Function.h:818
size_t size() const
Definition: Function.h:808
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:356
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:675
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
Definition: GlobalValue.h:263
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:529
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2137
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1037
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2472
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1881
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2523
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1045
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:539
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2170
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Definition: IRBuilder.cpp:1212
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2516
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:466
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2067
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2122
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1437
ConstantInt * getInt8(uint8_t C)
Get a constant 8-bit value.
Definition: IRBuilder.h:476
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2081
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:491
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2127
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1416
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2021
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2494
LLVMContext & getContext() const
Definition: IRBuilder.h:176
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2117
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2007
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1497
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:569
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2412
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:516
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
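A brief sketch composing a few of the IRBuilderBase helpers above (the function, its operand types — X as i32 and B as i8 — and the insertion point are assumptions made for illustration):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Value.h"

// Build (X << 8) | zext(B) at the end of BB and return the result.
static llvm::Value *combineBytes(llvm::BasicBlock *BB, llvm::Value *X,
                                 llvm::Value *B) {
  llvm::IRBuilder<> Builder(BB);                 // insert at end of BB
  llvm::Value *Hi = Builder.CreateShl(X, Builder.getInt32(8), "hi");
  llvm::Value *Lo = Builder.CreateZExt(B, Builder.getInt32Ty(), "lo");
  return Builder.CreateOr(Hi, Lo, "combined");
}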
This instruction inserts a single (scalar) element into a VectorType value.
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:83
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:87
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e. the number of whole bytes needed to represent the size in bits.
Definition: LowLevelType.h:203
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:184
Value * getPointerOperand()
Definition: Instructions.h:280
Type * getPointerOperandType() const
Definition: Instructions.h:283
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
@ INVALID_SIMPLE_VALUE_TYPE
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:585
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
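A minimal sketch of the MVT queries listed above (the header path below is the one used by recent LLVM trees and is an assumption; the values are for illustration):

#include "llvm/CodeGenTypes/MachineValueType.h"
#include <cassert>

void mvtSketch() {
  using llvm::MVT;
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);      // fixed-length 4 x i32
  assert(V4I32.isVector() && V4I32.is128BitVector());
  assert(V4I32.getVectorElementType() == MVT::i32);
  assert(V4I32.getScalarSizeInBits() == 32);
  assert(!V4I32.isScalableVector() && V4I32.isFixedLengthVector());
  assert(V4I32.getStoreSize().getFixedValue() == 16);  // bytes per store
}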
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
int getStackProtectorIndex() const
Return the index for the stack protector object.
int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
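The MachineInstrBuilder helpers above are normally reached through BuildMI; a hedged sketch of that pattern (the opcode and registers are caller-supplied placeholders, not real AArch64 encodings):

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

// Append "DestReg = <Opcode> SrcReg, Imm" at the end of MBB.
static llvm::MachineInstr *emitRegRegImm(llvm::MachineBasicBlock &MBB,
                                         const llvm::TargetInstrInfo &TII,
                                         const llvm::DebugLoc &DL,
                                         unsigned Opcode, llvm::Register DestReg,
                                         llvm::Register SrcReg, int64_t Imm) {
  return llvm::BuildMI(&MBB, DL, TII.get(Opcode), DestReg)
      .addReg(SrcReg)
      .addImm(Imm)
      .getInstr();
}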
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition: MapVector.h:60
This class is used to represent an MGATHER node.
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getIndex() const
const SDValue & getScale() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition: Module.cpp:690
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
void dump() const
Definition: Pass.cpp:136
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1827
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
void dump() const
Dump this node, for debugging.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Return true if this node is an UNDEF value.
void setFlags(SDNodeFlags NewFlags)
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasZAState() const
Class to represent scalable SIMD vectors.
Definition: DerivedTypes.h:586
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:713
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:722
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:474
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
SDNode * isConstantIntBuildVectorOrConstantInt(SDValue N) const
Test whether the given value is a constant int or similar node.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:732
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:480
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:659
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:773
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:845
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:739
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
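A minimal sketch of how the SelectionDAG builders above combine inside a lowering or combine routine (DAG, DL, and the operands are assumed to come from the surrounding context; the i32 type and the shift-or pattern are illustrative):

#include "llvm/CodeGen/SelectionDAG.h"

// Build (or (shl X, 8), Y) as i32 nodes and return the resulting SDValue.
static llvm::SDValue buildShlOr(llvm::SelectionDAG &DAG, const llvm::SDLoc &DL,
                                llvm::SDValue X, llvm::SDValue Y) {
  llvm::EVT VT = llvm::MVT::i32;
  llvm::SDValue Amt = DAG.getShiftAmountConstant(8, VT, DL);
  llvm::SDValue Shl = DAG.getNode(llvm::ISD::SHL, DL, VT, X, Amt);
  return DAG.getNode(llvm::ISD::OR, DL, VT, Shl, Y);
}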
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void push_back(const T &Elt)
Definition: SmallVector.h:426
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
An instruction for storing to memory.
Definition: Instructions.h:317
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:462
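A sketch using slice and getAsInteger together; parseRegisterIndex and the "w7"-style token are illustrative assumptions:
  #include "llvm/ADT/StringRef.h"
  // Parse the numeric part of a token such as "w7"; returns false on failure.
  bool parseRegisterIndex(llvm::StringRef Tok, unsigned &Index) {
    // getAsInteger returns true on a parse error, so invert the result.
    return !Tok.slice(1, Tok.size()).getAsInteger(/*Radix=*/10, Index);
  }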
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition: StringRef.h:676
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
StringRef save(const char *S)
Definition: StringSaver.h:30
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
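The Case/Default members above are normally chained into one expression. A minimal sketch, with a hypothetical classifyName helper:
  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/StringSwitch.h"
  // Map a feature name to a small integer id; -1 when the name is unknown.
  int classifyName(llvm::StringRef Name) {
    return llvm::StringSwitch<int>(Name)
        .Case("neon", 0)
        .Case("sve", 1)
        .Case("sve2", 2)
        .Default(-1);
  }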
Class to represent struct types.
Definition: DerivedTypes.h:216
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
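These legalization hooks are normally invoked from a target's TargetLowering constructor. The fragment below is a hedged sketch for an imaginary MyTargetLowering (not the actual AArch64 configuration), showing the typical calling pattern of setOperationAction, setLoadExtAction and setTruncStoreAction:
  // Inside a hypothetical subclass of TargetLowering.
  MyTargetLowering::MyTargetLowering(const TargetMachine &TM)
      : TargetLowering(TM) {
    // No native 64-bit integer division: expand it to a libcall/sequence.
    setOperationAction(ISD::SDIV, MVT::i64, Expand);
    // Pick target-specific shuffles during custom lowering.
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom);
    // Sign-extending i8 loads into i32 are not directly supported.
    setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i8, Expand);
    // f64 -> f32 truncating stores must be expanded.
    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  }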
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
virtual bool useLoadStackGuardNode() const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:662
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition: Triple.h:629
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:330
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static Type * getHalfTy(LLVMContext &C)
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:252
static Type * getBFloatTy(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
@ FloatTyID
32-bit floating point type
Definition: Type.h:58
@ DoubleTyID
64-bit floating point type
Definition: Type.h:59
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
static IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isScalableTy() const
Return true if this is a type whose size is a known multiple of vscale.
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition: Type.h:377
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:182
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:507
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:454
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:676
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
Definition: DerivedTypes.h:472
Type * getElementType() const
Definition: DerivedTypes.h:436
This class represents zero extension of integer types.
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:239
self_iterator getIterator()
Definition: ilist_node.h:109
#define UINT64_MAX
Definition: DataTypes.h:77
#define INT64_MAX
Definition: DataTypes.h:71
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ NVCAST
Natural vector cast.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
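A sketch of how these two helpers pair up, assuming the target-private header MCTargetDesc/AArch64AddressingModes.h is on the include path (as it is for this file); tryEncodeBitmask is a hypothetical helper:
  #include "MCTargetDesc/AArch64AddressingModes.h"
  #include <cstdint>
  // Encode Imm as a 64-bit logical (bitmask) immediate if representable,
  // e.g. 0x00ff00ff00ff00ff (a repeating pattern) encodes, 0x1234 does not.
  bool tryEncodeBitmask(uint64_t Imm, uint64_t &Encoded) {
    if (!llvm::AArch64_AM::isLogicalImmediate(Imm, /*regSize=*/64))
      return false;
    Encoded = llvm::AArch64_AM::encodeLogicalImmediate(Imm, /*regSize=*/64);
    return true;
  }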
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
const uint64_t ReservedFPControlBits
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
Definition: CallingConv.h:265
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
Definition: CallingConv.h:221
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
Definition: CallingConv.h:224
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition: CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:241
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:238
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
Definition: CallingConv.h:159
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
Definition: CallingConv.h:255
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
Definition: CallingConv.h:260
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:751
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:237
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1133
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1129
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:477
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
Definition: ISDOpcodes.h:1346
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1377
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:251
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:560
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:715
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1162
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1248
@ STRICT_FCEIL
Definition: ISDOpcodes.h:427
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:240
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1038
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1028
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:784
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:484
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:437
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:791
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:544
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1362
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:391
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
Definition: ISDOpcodes.h:1366
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:689
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
Definition: ISDOpcodes.h:1032
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:256
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1376
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:478
@ STRICT_FLOG2
Definition: ISDOpcodes.h:422
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1274
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:904
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:230
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1275
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:940
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:412
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1407
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:886
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:775
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:663
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:451
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:621
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:101
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1359
@ WRITE_REGISTER
Definition: ISDOpcodes.h:119
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1228
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1363
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:995
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:759
@ STRICT_LROUND
Definition: ISDOpcodes.h:432
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:931
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1084
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:328
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1059
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1063
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the same...
Definition: ISDOpcodes.h:587
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition: ISDOpcodes.h:647
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:350
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:728
@ STRICT_FPOWI
Definition: ISDOpcodes.h:414
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1244
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:212
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1378
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:628
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1158
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:324
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:431
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1371
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:881
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:652
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1023
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:706
@ ATOMIC_LOAD_CLR
Definition: ISDOpcodes.h:1273
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:601
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1272
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:574
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:436
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:425
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:118
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:536
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:781
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1218
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:857
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:426
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:743
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
Definition: ISDOpcodes.h:1336
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1255
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:972
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1222
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:332
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1048
@ STRICT_LRINT
Definition: ISDOpcodes.h:434
@ ConstantPool
Definition: ISDOpcodes.h:82
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:799
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:675
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition: ISDOpcodes.h:592
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:889
@ STRICT_FROUND
Definition: ISDOpcodes.h:429
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:737
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:304
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:450
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1379
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:428
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:430
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:94
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1270
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:444
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:466
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:443
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:991
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1271
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:837
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1189
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:158
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:471
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:681
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1215
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:658
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:401
@ STRICT_FLOG10
Definition: ISDOpcodes.h:421
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:525
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_LLRINT
Definition: ISDOpcodes.h:435
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition: ISDOpcodes.h:613
@ STRICT_FEXP2
Definition: ISDOpcodes.h:419
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1269
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:870
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:106
@ STRICT_LLROUND
Definition: ISDOpcodes.h:433
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:832
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:424
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:856
@ VECREDUCE_FMINIMUM
Definition: ISDOpcodes.h:1367
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:787
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1153
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1077
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:764
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:494
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:341
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:423
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the sa...
Definition: ISDOpcodes.h:581
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:314
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:516
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1601
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
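For illustration, the two condition-code helpers compose as below (a standalone sketch; condCodeExamples is hypothetical, and the declarations are assumed to live in llvm/CodeGen/ISDOpcodes.h):
  #include "llvm/CodeGen/ISDOpcodes.h"
  #include "llvm/CodeGen/ValueTypes.h"
  void condCodeExamples() {
    // (X > Y) rewritten with swapped operands becomes (Y < X).
    llvm::ISD::CondCode Swapped =
        llvm::ISD::getSetCCSwappedOperands(llvm::ISD::SETGT); // SETLT
    // !(X == Y) for an i32 comparison is (X != Y).
    llvm::ISD::CondCode Inverted =
        llvm::ISD::getSetCCInverse(llvm::ISD::SETEQ, llvm::MVT::i32); // SETNE
    (void)Swapped;
    (void)Inverted;
  }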
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
Definition: ISDOpcodes.h:1492
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1479
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1530
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1510
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1481
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1469
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:777
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:836
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
VScaleVal_match m_VScale()
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
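A small sketch of the matcher combinators above; isWideningMul is a hypothetical helper recognising (zext X) * (zext Y):
  #include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Value.h"
  bool isWideningMul(llvm::Value *V, llvm::Value *&X, llvm::Value *&Y) {
    using namespace llvm::PatternMatch;
    // Binds X and Y when V is a multiply of two zero-extended values.
    return match(V, m_Mul(m_ZExt(m_Value(X)), m_ZExt(m_Value(Y))));
  }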
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
@ Define
Register definition.
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition: ObjCARCUtil.h:43
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition: ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool isPackedVectorType(EVT SomeVT)
Definition: VECustomDAG.cpp:22
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
@ Offset
Definition: DWP.cpp:456
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:853
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool isUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:295
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:239
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:343
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:280
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1509
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:330
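A sketch combining isPowerOf2_64 and Log2_64; mulToShiftAmount is a hypothetical helper:
  #include "llvm/Support/MathExtras.h"
  #include <cstdint>
  #include <optional>
  // If Multiplier is a power of two, return the equivalent left-shift amount.
  std::optional<unsigned> mulToShiftAmount(uint64_t Multiplier) {
    if (llvm::isPowerOf2_64(Multiplier))
      return llvm::Log2_64(Multiplier); // e.g. 8 -> 3
    return std::nullopt;
  }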
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:269
unsigned M1(unsigned Val)
Definition: VE.h:376
bool isReleaseOrStronger(AtomicOrdering AO)
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:257
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
static unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:244
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
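For example, rounding a byte offset up to a 16-byte boundary (roundUpOffset is a hypothetical helper):
  #include "llvm/Support/Alignment.h"
  #include <cstdint>
  uint64_t roundUpOffset(uint64_t Offset) {
    // alignTo(20, Align(16)) == 32; exact multiples of 16 are returned unchanged.
    return llvm::alignTo(Offset, llvm::Align(16));
  }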
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
gep_type_iterator gep_type_begin(const User *GEP)
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent to: C.erase(remove_if(C, pred), C.end());
Definition: STLExtras.h:2051
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
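find_if, erase_if, and is_contained above are the range wrappers from llvm/ADT/STLExtras.h. A minimal sketch of their use; the container contents and predicates are illustrative:

// Illustrative only: range-based helpers over a SmallVector.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

int main() {
  llvm::SmallVector<int, 8> V = {1, 2, 3, 4, 5};
  auto It = llvm::find_if(V, [](int X) { return X > 3; });  // first element > 3
  assert(It != V.end() && *It == 4);
  llvm::erase_if(V, [](int X) { return X % 2 == 0; });      // drop the even elements
  assert(V.size() == 3);
  assert(llvm::is_contained(V, 5) && !llvm::is_contained(V, 2));
  return 0;
}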
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2039
static const MachineMemOperand::Flags MOStridedAccess
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Default
The result values are uniform if and only if all operands are uniform.
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
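createSequentialMask, listed above, is the VectorUtils helper for building sequential shuffle masks. A minimal sketch of the mask it produces; the start/count values are illustrative:

// Illustrative only: a sequential mask {2, 3, 4, 5} padded with two undef (-1) lanes.
#include "llvm/Analysis/VectorUtils.h"
#include <cassert>

int main() {
  llvm::SmallVector<int, 16> Mask =
      llvm::createSequentialMask(/*Start=*/2, /*NumInts=*/4, /*NumUndefs=*/2);
  assert(Mask.size() == 6 && Mask[0] == 2 && Mask[3] == 5 && Mask[5] == -1);
  return 0;
}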
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
@ Enable
Enable colors.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
Helper structure to be able to read SetCC information.
static unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:292
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
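Align (above) wraps a non-zero power-of-two alignment; together with commonAlignment and alignTo from the same header it covers the alignment arithmetic used throughout the lowering code. A minimal sketch with illustrative values:

// Illustrative only: basic Align arithmetic from llvm/Support/Alignment.h.
#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  llvm::Align A(16);
  assert(A.value() == 16);                                // raw byte count of the alignment
  assert(llvm::commonAlignment(A, 8) == llvm::Align(8));  // alignment implied by a 16-byte base plus offset 8
  assert(llvm::alignTo(20, A) == 32);                     // round 20 bytes up to the next 16-byte boundary
  return 0;
}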
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
uint64_t getScalarStoreSize() const
Definition: ValueTypes.h:387
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:274
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:340
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that is chosen by the caller.
Definition: ValueTypes.h:112
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:349
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:415
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:203
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition: ValueTypes.h:429
bool isScalableVT() const
Return true if the type is a scalable type.
Definition: ValueTypes.h:183
bool isFixedLengthVector() const
Definition: ValueTypes.h:177
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:282
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:208
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:173
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type that is chosen by the caller.
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:438
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:198
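The EVT queries above (llvm/CodeGen/ValueTypes.h) are the main vocabulary the lowering code uses to inspect and rewrite value types. A minimal sketch, assuming a standalone LLVMContext and an illustrative v4f32 type:

// Illustrative only: building and transforming an EVT.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

int main() {
  llvm::LLVMContext Ctx;
  llvm::EVT VT = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4);  // v4f32
  assert(VT.isVector() && VT.isFloatingPoint() && !VT.isScalableVector());
  assert(VT.getVectorNumElements() == 4 && VT.getScalarSizeInBits() == 32);
  llvm::EVT IntVT = VT.changeVectorElementTypeToInteger();        // v4f32 -> v4i32
  assert(IntVT.getVectorElementType() == llvm::MVT::i32);
  llvm::EVT HalfVT = IntVT.getHalfNumVectorElementsVT(Ctx);       // v4i32 -> v2i32
  assert(HalfVT.getVectorNumElements() == 2);
  return 0;
}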
Describes a register that needs to be forwarded from the prologue to a musttail call.
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing formal return value.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:297
static KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
Definition: KnownBits.cpp:434
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:157
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
static KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
Definition: KnownBits.cpp:376
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition: KnownBits.h:292
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:307
static KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
Definition: KnownBits.cpp:291
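The KnownBits operations above (llvm/Support/KnownBits.h) model partial knowledge of a value's bits during DAG combines. A minimal sketch using fully known constants, which keeps every step exact; the 8-bit values are illustrative:

// Illustrative only: KnownBits arithmetic on fully known values.
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>

int main() {
  llvm::KnownBits K = llvm::KnownBits::makeConstant(llvm::APInt(8, 12));  // 0b00001100
  assert(K.getBitWidth() == 8 && K.isConstant());
  llvm::KnownBits Two = llvm::KnownBits::makeConstant(llvm::APInt(8, 2));
  llvm::KnownBits Sh = llvm::KnownBits::lshr(K, Two);                     // 12 >> 2 == 3
  assert(Sh.isConstant() && Sh.getConstant() == 3);
  assert(Sh.trunc(4).getConstant() == 3);   // truncation keeps the low bits
  assert(Sh.countMaxActiveBits() == 2);     // 3 needs at most two bits
  return 0;
}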
Structure used to represent pair of argument number after call lowering and register used to transfer that argument.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint applies to.
Definition: PredicateInfo.h:74
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::OutputArg, 32 > Outs
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to its clients that want to combine.
bool CombineTo(SDValue O, SDValue N)
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64