1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in the future once both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// XOR, OR and CMP all use ALU ports, and the data dependency becomes the
143// bottleneck after this transform on high-end CPUs. This maximum leaf node
144// limit guards that the cmp+ccmp transform remains profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148/// Value type used for condition codes.
149static const MVT MVT_CC = MVT::i32;
150
151static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
152 AArch64::X3, AArch64::X4, AArch64::X5,
153 AArch64::X6, AArch64::X7};
154static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
155 AArch64::Q3, AArch64::Q4, AArch64::Q5,
156 AArch64::Q6, AArch64::Q7};
157
159
161
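/// Returns the packed (i.e. fully utilised) scalable vector type whose
/// element type is the given scalar type \p VT.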
162static inline EVT getPackedSVEVectorVT(EVT VT) {
163 switch (VT.getSimpleVT().SimpleTy) {
164 default:
165 llvm_unreachable("unexpected element type for vector");
166 case MVT::i8:
167 return MVT::nxv16i8;
168 case MVT::i16:
169 return MVT::nxv8i16;
170 case MVT::i32:
171 return MVT::nxv4i32;
172 case MVT::i64:
173 return MVT::nxv2i64;
174 case MVT::f16:
175 return MVT::nxv8f16;
176 case MVT::f32:
177 return MVT::nxv4f32;
178 case MVT::f64:
179 return MVT::nxv2f64;
180 case MVT::bf16:
181 return MVT::nxv8bf16;
182 }
183}
184
185// NOTE: Currently there's only a need to return integer vector types. If this
186// changes then just add an extra "type" parameter.
188 switch (EC.getKnownMinValue()) {
189 default:
190 llvm_unreachable("unexpected element count for vector");
191 case 16:
192 return MVT::nxv16i8;
193 case 8:
194 return MVT::nxv8i16;
195 case 4:
196 return MVT::nxv4i32;
197 case 2:
198 return MVT::nxv2i64;
199 }
200}
201
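/// Returns the SVE integer data vector type with the same element count as
/// the given scalable predicate type.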
203 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
204 "Expected scalable predicate vector type!");
205 switch (VT.getVectorMinNumElements()) {
206 default:
207 llvm_unreachable("unexpected element count for vector");
208 case 2:
209 return MVT::nxv2i64;
210 case 4:
211 return MVT::nxv4i32;
212 case 8:
213 return MVT::nxv8i16;
214 case 16:
215 return MVT::nxv16i8;
216 }
217}
218
219/// Returns true if VT's elements occupy the lowest bit positions of its
220/// associated register class without any intervening space.
221///
222/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
223/// same register class, but only nxv8f16 can be treated as a packed vector.
224static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
226 "Expected legal vector type!");
227 return VT.isFixedLengthVector() ||
229}
230
231// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
232// predicate and end with a passthru value matching the result type.
233static bool isMergePassthruOpcode(unsigned Opc) {
234 switch (Opc) {
235 default:
236 return false;
266 return true;
267 }
268}
269
270// Returns true if inactive lanes are known to be zeroed by construction.
272 switch (Op.getOpcode()) {
273 default:
274 return false;
275 // We guarantee i1 splat_vectors to zero the other lanes
279 return true;
281 switch (Op.getConstantOperandVal(0)) {
282 default:
283 return false;
284 case Intrinsic::aarch64_sve_ptrue:
285 case Intrinsic::aarch64_sve_pnext:
286 case Intrinsic::aarch64_sve_cmpeq:
287 case Intrinsic::aarch64_sve_cmpne:
288 case Intrinsic::aarch64_sve_cmpge:
289 case Intrinsic::aarch64_sve_cmpgt:
290 case Intrinsic::aarch64_sve_cmphs:
291 case Intrinsic::aarch64_sve_cmphi:
292 case Intrinsic::aarch64_sve_cmpeq_wide:
293 case Intrinsic::aarch64_sve_cmpne_wide:
294 case Intrinsic::aarch64_sve_cmpge_wide:
295 case Intrinsic::aarch64_sve_cmpgt_wide:
296 case Intrinsic::aarch64_sve_cmplt_wide:
297 case Intrinsic::aarch64_sve_cmple_wide:
298 case Intrinsic::aarch64_sve_cmphs_wide:
299 case Intrinsic::aarch64_sve_cmphi_wide:
300 case Intrinsic::aarch64_sve_cmplo_wide:
301 case Intrinsic::aarch64_sve_cmpls_wide:
302 case Intrinsic::aarch64_sve_fcmpeq:
303 case Intrinsic::aarch64_sve_fcmpne:
304 case Intrinsic::aarch64_sve_fcmpge:
305 case Intrinsic::aarch64_sve_fcmpgt:
306 case Intrinsic::aarch64_sve_fcmpuo:
307 case Intrinsic::aarch64_sve_facgt:
308 case Intrinsic::aarch64_sve_facge:
309 case Intrinsic::aarch64_sve_whilege:
310 case Intrinsic::aarch64_sve_whilegt:
311 case Intrinsic::aarch64_sve_whilehi:
312 case Intrinsic::aarch64_sve_whilehs:
313 case Intrinsic::aarch64_sve_whilele:
314 case Intrinsic::aarch64_sve_whilelo:
315 case Intrinsic::aarch64_sve_whilels:
316 case Intrinsic::aarch64_sve_whilelt:
317 case Intrinsic::aarch64_sve_match:
318 case Intrinsic::aarch64_sve_nmatch:
319 case Intrinsic::aarch64_sve_whilege_x2:
320 case Intrinsic::aarch64_sve_whilegt_x2:
321 case Intrinsic::aarch64_sve_whilehi_x2:
322 case Intrinsic::aarch64_sve_whilehs_x2:
323 case Intrinsic::aarch64_sve_whilele_x2:
324 case Intrinsic::aarch64_sve_whilelo_x2:
325 case Intrinsic::aarch64_sve_whilels_x2:
326 case Intrinsic::aarch64_sve_whilelt_x2:
327 return true;
328 }
329 }
330}
331
333 const AArch64Subtarget &STI)
334 : TargetLowering(TM), Subtarget(&STI) {
335 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
336 // we have to make something up. Arbitrarily, choose ZeroOrOne.
338 // When comparing vectors the result sets the different elements in the
339 // vector to all-one or all-zero.
341
342 // Set up the register classes.
343 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
344 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
345
346 if (Subtarget->hasLS64()) {
347 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
348 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
350 }
351
352 if (Subtarget->hasFPARMv8()) {
353 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
354 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
355 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
356 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
357 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
358 }
359
360 if (Subtarget->hasNEON()) {
361 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
362 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
363 // Someone set us up the NEON.
364 addDRTypeForNEON(MVT::v2f32);
365 addDRTypeForNEON(MVT::v8i8);
366 addDRTypeForNEON(MVT::v4i16);
367 addDRTypeForNEON(MVT::v2i32);
368 addDRTypeForNEON(MVT::v1i64);
369 addDRTypeForNEON(MVT::v1f64);
370 addDRTypeForNEON(MVT::v4f16);
371 addDRTypeForNEON(MVT::v4bf16);
372
373 addQRTypeForNEON(MVT::v4f32);
374 addQRTypeForNEON(MVT::v2f64);
375 addQRTypeForNEON(MVT::v16i8);
376 addQRTypeForNEON(MVT::v8i16);
377 addQRTypeForNEON(MVT::v4i32);
378 addQRTypeForNEON(MVT::v2i64);
379 addQRTypeForNEON(MVT::v8f16);
380 addQRTypeForNEON(MVT::v8bf16);
381 }
382
383 if (Subtarget->hasSVEorSME()) {
384 // Add legal sve predicate types
385 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
386 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
387 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
388 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
389 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
390
391 // Add legal sve data types
392 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
393 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
394 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
395 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
396
397 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
398 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
399 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
400 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
401 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
402 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
403
404 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
405 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
406 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
407
408 if (Subtarget->useSVEForFixedLengthVectors()) {
411 addRegisterClass(VT, &AArch64::ZPRRegClass);
412
415 addRegisterClass(VT, &AArch64::ZPRRegClass);
416 }
417 }
418
419 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
420 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
421 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
422 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
423
424 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
425 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
426 }
427
428 // Compute derived properties from the register classes
430
431 // Provide all sorts of operation actions
470
474
478
480
481 // Custom lowering hooks are needed for XOR
482 // to fold it into CSINC/CSINV.
485
486 // Virtually no operation on f128 is legal, but LLVM can't expand them when
487 // there's a valid register class, so we need custom operations in most cases.
511 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
512 // aren't handled.
513
514 // Lowering for many of the conversions is actually specified by the non-f128
515 // type. The LowerXXX function will be trivial when f128 isn't involved.
540 if (Subtarget->hasFPARMv8()) {
543 }
546 if (Subtarget->hasFPARMv8()) {
549 }
552
557
558 // Variable arguments.
563
564 // Variable-sized objects.
567
568 // Lowering Funnel Shifts to EXTR
573
575
576 // Constant pool entries
578
579 // BlockAddress
581
582 // AArch64 lacks both left-rotate and popcount instructions.
588 }
589
590 // AArch64 doesn't have i32 MULH{S|U}.
593
594 // AArch64 doesn't have {U|S}MUL_LOHI.
599
600 if (Subtarget->hasCSSC()) {
604
606
610
613
618
623 } else {
627
630
633 }
634
640 }
647
648 // Custom lower Add/Sub/Mul with overflow.
661
670
679 if (Subtarget->hasFullFP16()) {
682 } else {
685 }
686
687 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
695 setOperationAction(Op, MVT::f16, Promote);
696 setOperationAction(Op, MVT::v4f16, Expand);
697 setOperationAction(Op, MVT::v8f16, Expand);
698 setOperationAction(Op, MVT::bf16, Promote);
699 setOperationAction(Op, MVT::v4bf16, Expand);
700 setOperationAction(Op, MVT::v8bf16, Expand);
701 }
702
703 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
704 for (auto Op : {
708 ISD::FADD,
709 ISD::FSUB,
710 ISD::FMUL,
711 ISD::FDIV,
712 ISD::FMA,
742 })
743 setOperationAction(Op, ScalarVT, Promote);
744
745 for (auto Op : {ISD::FNEG, ISD::FABS})
746 setOperationAction(Op, ScalarVT, Legal);
747
748 // Round-to-integer operations need custom lowering for fp16, as Promote
749 // doesn't work because the result type is integer.
753 setOperationAction(Op, ScalarVT, Custom);
754
755 // promote v4f16 to v4f32 when that is known to be safe.
756 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
757 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
758 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
759 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
760 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
761 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
762 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
763 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
764 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
765 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
766 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
767 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
768
778
779 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
801 };
802
803 if (!Subtarget->hasFullFP16()) {
804 LegalizeNarrowFP(MVT::f16);
805 }
806 LegalizeNarrowFP(MVT::bf16);
809
810 // AArch64 has implementations of a lot of rounding-like FP operations.
811 for (auto Op :
822 for (MVT Ty : {MVT::f32, MVT::f64})
824 if (Subtarget->hasFullFP16())
825 setOperationAction(Op, MVT::f16, Legal);
826 }
827
828 // Basic strict FP operations are legal
831 for (MVT Ty : {MVT::f32, MVT::f64})
833 if (Subtarget->hasFullFP16())
834 setOperationAction(Op, MVT::f16, Legal);
835 }
836
837 // Strict conversion to a larger type is legal
838 for (auto VT : {MVT::f32, MVT::f64})
840
842
848
850 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
853 } else {
856 }
859
860 // Generate outline atomics library calls only if LSE was not specified for
861 // subtarget
862 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
888#define LCALLNAMES(A, B, N) \
889 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
890 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
891 setLibcallName(A##N##_REL, #B #N "_rel"); \
892 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
893#define LCALLNAME4(A, B) \
894 LCALLNAMES(A, B, 1) \
895 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
896#define LCALLNAME5(A, B) \
897 LCALLNAMES(A, B, 1) \
898 LCALLNAMES(A, B, 2) \
899 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
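    // For example, LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
    // registers the 1/2/4/8/16-byte CAS helpers with the _relax/_acq/_rel/
    // _acq_rel suffixes, e.g. "__aarch64_cas4_acq".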
900 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
901 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
902 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
903 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
904 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
905 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
906#undef LCALLNAMES
907#undef LCALLNAME4
908#undef LCALLNAME5
909 }
910
911 if (Subtarget->hasLSE128()) {
912 // Custom lowering because i128 is not legal. Must be replaced by 2x64
913 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
917 }
918
919 // 128-bit loads and stores can be done without expanding
922
923 // Aligned 128-bit loads and stores are single-copy atomic according to the
924 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
925 if (Subtarget->hasLSE2()) {
928 }
929
930 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
931 // custom lowering, as there are no un-paired non-temporal stores and
932 // legalization will break up 256 bit inputs.
934 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
935 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
936 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
941
942 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
943 // custom lowering, as there are no un-paired non-temporal loads and
944 // legalization will break up 256 bit inputs.
945 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
946 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
947 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
948 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
949 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
950 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
951 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
952 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
953
954 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
956
957 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
958 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
959 // Issue __sincos_stret if available.
962 } else {
965 }
966
967 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
968 // MSVCRT doesn't have powi; fall back to pow
969 setLibcallName(RTLIB::POWI_F32, nullptr);
970 setLibcallName(RTLIB::POWI_F64, nullptr);
971 }
972
973 // Make floating-point constants legal for the large code model, so they don't
974 // become loads from the constant pool.
975 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
978 }
979
980 // AArch64 does not have floating-point extending loads, i1 sign-extending
981 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
982 for (MVT VT : MVT::fp_valuetypes()) {
983 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
984 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
985 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
986 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
987 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
988 }
989 for (MVT VT : MVT::integer_valuetypes())
991
992 for (MVT WideVT : MVT::fp_valuetypes()) {
993 for (MVT NarrowVT : MVT::fp_valuetypes()) {
994 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
995 setTruncStoreAction(WideVT, NarrowVT, Expand);
996 }
997 }
998 }
999
1000 if (Subtarget->hasFPARMv8()) {
1004 }
1005
1006 // Indexed loads and stores are supported.
1007 for (unsigned im = (unsigned)ISD::PRE_INC;
1009 setIndexedLoadAction(im, MVT::i8, Legal);
1010 setIndexedLoadAction(im, MVT::i16, Legal);
1011 setIndexedLoadAction(im, MVT::i32, Legal);
1012 setIndexedLoadAction(im, MVT::i64, Legal);
1013 setIndexedLoadAction(im, MVT::f64, Legal);
1014 setIndexedLoadAction(im, MVT::f32, Legal);
1015 setIndexedLoadAction(im, MVT::f16, Legal);
1016 setIndexedLoadAction(im, MVT::bf16, Legal);
1017 setIndexedStoreAction(im, MVT::i8, Legal);
1018 setIndexedStoreAction(im, MVT::i16, Legal);
1019 setIndexedStoreAction(im, MVT::i32, Legal);
1020 setIndexedStoreAction(im, MVT::i64, Legal);
1021 setIndexedStoreAction(im, MVT::f64, Legal);
1022 setIndexedStoreAction(im, MVT::f32, Legal);
1023 setIndexedStoreAction(im, MVT::f16, Legal);
1024 setIndexedStoreAction(im, MVT::bf16, Legal);
1025 }
1026
1027 // Trap.
1028 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1031
1032 // We combine OR nodes for bitfield operations.
1034 // Try to create BICs for vector ANDs.
1036
1037 // Vector add and sub nodes may conceal a high-half opportunity.
1038 // Also, try to fold ADD into CSINC/CSINV..
1041
1044
1045 // Try and combine setcc with csel
1047
1049
1056
1058
1060
1062
1066
1068
1070
1072
1074
1078
1080
1081 // In case of strict alignment, avoid an excessive number of byte wide stores.
1084 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1085
1089 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1090
1093
1096 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1097
1099
1101
1102 EnableExtLdPromotion = true;
1103
1104 // Set required alignment.
1106 // Set preferred alignments.
1107
1108 // Don't align loops on Windows. The SEH unwind info generation needs to
1109 // know the exact length of functions before the alignments have been
1110 // expanded.
1111 if (!Subtarget->isTargetWindows())
1115
1116 // Only change the limit for entries in a jump table if specified by
1117 // the subtarget, but not at the command line.
1118 unsigned MaxJT = STI.getMaximumJumpTableSize();
1119 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1121
1123
1125
1127
1128 if (Subtarget->hasNEON()) {
1129 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1130 // silliness like this:
1131 for (auto Op :
1149 setOperationAction(Op, MVT::v1f64, Expand);
1150
1151 for (auto Op :
1156 setOperationAction(Op, MVT::v1i64, Expand);
1157
1158 // AArch64 doesn't have a direct vector ->f32 conversion instructions for
1159 // elements smaller than i32, so promote the input to i32 first.
1160 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1161 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1162
1163 // Similarly, there is no direct i32 -> f64 vector conversion instruction,
1164 // nor a direct i32 -> f16 vector conversion. Set it to Custom, so the
1165 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
1168 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1170
1171 if (Subtarget->hasFullFP16()) {
1174
1183 } else {
1184 // When AArch64 doesn't have fullfp16 support, promote the input
1185 // to i32 first.
1186 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1187 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1188 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1189 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1190 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1191 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1192 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1193 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1194 }
1195
1196 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1197 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1204 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1209 }
1210
1211 // Custom handling for some quad-vector types to detect MULL.
1212 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1213 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1214 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1215 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1216 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1217 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1218
1219 // Saturates
1220 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1221 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1226 }
1227
1228 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1229 MVT::v4i32}) {
1236 }
1237
1238 // Vector reductions
1239 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1240 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1241 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1246
1248 }
1249 }
1250 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1251 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1260 }
1265
1267 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1268 // Likewise, narrowing and extending vector loads/stores aren't handled
1269 // directly.
1272
1273 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1276 } else {
1279 }
1282
1285
1286 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1287 setTruncStoreAction(VT, InnerVT, Expand);
1288 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1289 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1290 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1291 }
1292 }
1293
1294 // AArch64 has implementations of a lot of rounding-like FP operations.
1295 for (auto Op :
1300 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1302 if (Subtarget->hasFullFP16())
1303 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1305 }
1306
1307 // LRINT and LLRINT.
1308 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1309 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1311 if (Subtarget->hasFullFP16())
1312 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1314 }
1315
1316 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1317
1322
1326
1327 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1328 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1329 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1330 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1331 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1332 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1333
1334 // ADDP custom lowering
1335 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1337 // FADDP custom lowering
1338 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1340 }
1341
1342 if (Subtarget->hasSME()) {
1344 }
1345
1346 // FIXME: Move lowering for more nodes here if those are common between
1347 // SVE and SME.
1348 if (Subtarget->hasSVEorSME()) {
1349 for (auto VT :
1350 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1355 }
1356 }
1357
1358 if (Subtarget->hasSVEorSME()) {
1359 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1402
1408
1417
1422
1423 if (!Subtarget->isLittleEndian())
1425
1426 if (Subtarget->hasSVE2orSME())
1427 // For SLI/SRI.
1429 }
1430
1431 // Illegal unpacked integer vector types.
1432 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1435 }
1436
1437 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1438 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1439 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1441
1442 for (auto VT :
1443 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1444 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1446
1447 for (auto VT :
1448 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1456
1460
1461 // There are no legal MVT::nxv16f## based types.
1462 if (VT != MVT::nxv16i1) {
1465 }
1466 }
1467
1468 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1469 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1470 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1471 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1476 }
1477
1478 // First, exclude all scalable vector extending loads/truncating stores,
1479 // covering both integer and floating-point scalable vectors.
1481 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1482 setTruncStoreAction(VT, InnerVT, Expand);
1483 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1484 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1485 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1486 }
1487 }
1488
1489 // Then, selectively enable those which we directly support.
1490 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1491 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1492 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1493 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1494 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1495 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1496 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1497 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1498 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1499 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1500 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1501 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1502 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1503 }
1504
1505 // SVE supports truncating stores of 64 and 128-bit vectors
1506 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1507 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1508 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1509 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1510 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1511
1512 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1513 MVT::nxv4f32, MVT::nxv2f64}) {
1551 if (Subtarget->isSVEAvailable())
1556
1570
1582
1583 if (!Subtarget->isLittleEndian())
1585 }
1586
1587 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1595
1596 if (!Subtarget->isLittleEndian())
1598 }
1599
1602
1603 // NEON doesn't support integer divides, but SVE does
1604 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1605 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1608 }
1609
1610 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1611 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1612 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1613
1614 if (Subtarget->isSVEAvailable()) {
1615 // NEON doesn't support across-vector reductions, but SVE does.
1616 for (auto VT :
1617 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1619 }
1620
1621 // Histcnt is SVE2 only
1622 if (Subtarget->hasSVE2() && Subtarget->isSVEAvailable())
1624 Custom);
1625
1626 // NOTE: Currently this has to happen after computeRegisterProperties rather
1627 // than the preferred option of combining it with the addRegisterClass call.
1628 if (Subtarget->useSVEForFixedLengthVectors()) {
1631 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1632 addTypeForFixedLengthSVE(VT);
1633 }
1636 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1637 addTypeForFixedLengthSVE(VT);
1638 }
1639
1640 // 64bit results can mean a bigger than NEON input.
1641 for (auto VT : {MVT::v8i8, MVT::v4i16})
1644
1645 // 128bit results imply a bigger than NEON input.
1646 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1648 for (auto VT : {MVT::v8f16, MVT::v4f32})
1650
1651 // These operations are not supported on NEON but SVE can do them.
1653 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1654 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1655 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1656 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1657 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1658 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1659 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1660 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1661 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1662 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1663 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1664 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1665 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1666 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1667 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1672
1673 // Int operations with no NEON support.
1674 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1675 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1683 }
1684
1685 // Use SVE for vectors with more than 2 elements.
1686 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1688 }
1689
1690 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1691 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1692 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1693 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1694
1696
1697 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1699 }
1700
1701 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1702 // Only required for llvm.aarch64.mops.memset.tag
1704 }
1705
1707
1708 if (Subtarget->hasSVE()) {
1713 }
1714
1715 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1716
1717 IsStrictFPEnabled = true;
1719
1720 if (Subtarget->isWindowsArm64EC()) {
1721 // FIXME: are there intrinsics we need to exclude from this?
1722 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1723 auto code = static_cast<RTLIB::Libcall>(i);
1724 auto libcallName = getLibcallName(code);
1725 if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1726 setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1727 }
1728 }
1729 }
1730}
1731
1732void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1733 assert(VT.isVector() && "VT should be a vector type");
1734
1735 if (VT.isFloatingPoint()) {
1737 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1738 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1739 }
1740
1741 // Mark vector float intrinsics as expand.
1742 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1752 }
1753
1754 // But we do support custom-lowering for FCOPYSIGN.
1755 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1756 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1757 VT == MVT::v8f16) &&
1758 Subtarget->hasFullFP16()))
1760
1773
1777 for (MVT InnerVT : MVT::all_valuetypes())
1778 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1779
1780 // CNT supports only B element sizes; use UADDLP afterwards to widen.
1781 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1783
1789
1790 for (unsigned Opcode :
1793 setOperationAction(Opcode, VT, Custom);
1794
1795 if (!VT.isFloatingPoint())
1797
1798 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1799 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1800 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1801 setOperationAction(Opcode, VT, Legal);
1802
1803 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1804 // NEON types.
1805 if (VT.isFloatingPoint() &&
1806 VT.getVectorElementType() != MVT::bf16 &&
1807 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1808 for (unsigned Opcode :
1814 setOperationAction(Opcode, VT, Legal);
1815
1816 // Strict fp extend and trunc are legal
1817 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1819 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1821
1822 // FIXME: We could potentially make use of the vector comparison instructions
1823 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1824 // complications:
1825 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1826 // so we would need to expand when the condition code doesn't match the
1827 // kind of comparison.
1828 // * Some kinds of comparison require more than one FCMXY instruction so
1829 // would need to be expanded instead.
1830 // * The lowering of the non-strict versions involves target-specific ISD
1831 // nodes so we would likely need to add strict versions of all of them and
1832 // handle them appropriately.
1835
1836 if (Subtarget->isLittleEndian()) {
1837 for (unsigned im = (unsigned)ISD::PRE_INC;
1841 }
1842 }
1843
1844 if (Subtarget->hasD128()) {
1847 }
1848}
1849
1851 EVT OpVT) const {
1852 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1853 if (!Subtarget->hasSVE())
1854 return true;
1855
1856 // We can only support legal predicate result types. We can use the SVE
1857 // whilelo instruction for generating fixed-width predicates too.
1858 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1859 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1860 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1861 return true;
1862
1863 // The whilelo instruction only works with i32 or i64 scalar inputs.
1864 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1865 return true;
1866
1867 return false;
1868}
1869
1871 if (!Subtarget->hasSVEorSME())
1872 return true;
1873
1874 // We can only use the BRKB + CNTP sequence with legal predicate types.
1875 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
1876 VT != MVT::nxv2i1;
1877}
1878
1879void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1880 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1881
1882 // By default everything must be expanded.
1883 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1885
1886 if (VT.isFloatingPoint()) {
1896 }
1897
1899 VT == MVT::v1f64 ? Expand : Custom;
1900
1901 // Mark integer truncating stores/extending loads as having custom lowering
1902 if (VT.isInteger()) {
1903 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1904 while (InnerVT != VT) {
1905 setTruncStoreAction(VT, InnerVT, Default);
1906 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
1907 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
1908 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
1909 InnerVT = InnerVT.changeVectorElementType(
1910 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1911 }
1912 }
1913
1914 // Mark floating-point truncating stores/extending loads as having custom
1915 // lowering
1916 if (VT.isFloatingPoint()) {
1917 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1918 while (InnerVT != VT) {
1919 setTruncStoreAction(VT, InnerVT, Custom);
1920 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
1921 InnerVT = InnerVT.changeVectorElementType(
1923 }
1924 }
1925
1926 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
1927 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
1928
1929 // Lower fixed length vector operations to scalable equivalents.
1934 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
1971 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
1972 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
1974 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
1993 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2019}
2020
2021void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
2022 addRegisterClass(VT, &AArch64::FPR64RegClass);
2023 addTypeForNEON(VT);
2024}
2025
2026void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
2027 addRegisterClass(VT, &AArch64::FPR128RegClass);
2028 addTypeForNEON(VT);
2029}
2030
2032 LLVMContext &C, EVT VT) const {
2033 if (!VT.isVector())
2034 return MVT::i32;
2035 if (VT.isScalableVector())
2036 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2038}
2039
2040// isIntImmediate - This method tests to see if the node is a constant
2041// operand. If so, Imm will receive the value.
2042static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2043 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2044 Imm = C->getZExtValue();
2045 return true;
2046 }
2047 return false;
2048}
2049
2050// isOpcWithIntImmediate - This method tests to see if the node is a specific
2051// opcode and that it has an immediate integer right operand.
2052// If so, Imm will receive the value.
2053static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2054 uint64_t &Imm) {
2055 return N->getOpcode() == Opc &&
2056 isIntImmediate(N->getOperand(1).getNode(), Imm);
2057}
2058
2059static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2060 const APInt &Demanded,
2062 unsigned NewOpc) {
2063 uint64_t OldImm = Imm, NewImm, Enc;
2064 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2065
2066 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2067 // bimm64.
2068 if (Imm == 0 || Imm == Mask ||
2070 return false;
2071
2072 unsigned EltSize = Size;
2073 uint64_t DemandedBits = Demanded.getZExtValue();
2074
2075 // Clear bits that are not demanded.
2076 Imm &= DemandedBits;
2077
2078 while (true) {
2079 // The goal here is to set the non-demanded bits in a way that minimizes
2080 // the number of switching between 0 and 1. In order to achieve this goal,
2081 // we set the non-demanded bits to the value of the preceding demanded bits.
2082 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2083 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2084 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2085 // The final result is 0b11000011.
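    // For instance, a 32-bit AND immediate of 0x41 where only the bits in 0x65
    // are demanded settles on NewImm = 0xFFFFFFC3 on the first pass; its
    // 32-bit complement 0x3C is a contiguous run of ones, i.e. a valid bimm32
    // encoding.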
2086 uint64_t NonDemandedBits = ~DemandedBits;
2087 uint64_t InvertedImm = ~Imm & DemandedBits;
2088 uint64_t RotatedImm =
2089 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2090 NonDemandedBits;
2091 uint64_t Sum = RotatedImm + NonDemandedBits;
2092 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2093 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2094 NewImm = (Imm | Ones) & Mask;
2095
2096 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2097 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2098 // we halve the element size and continue the search.
2099 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2100 break;
2101
2102 // We cannot shrink the element size any further if it is 2-bits.
2103 if (EltSize == 2)
2104 return false;
2105
2106 EltSize /= 2;
2107 Mask >>= EltSize;
2108 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2109
2110 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2111 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2112 return false;
2113
2114 // Merge the upper and lower halves of Imm and DemandedBits.
2115 Imm |= Hi;
2116 DemandedBits |= DemandedBitsHi;
2117 }
2118
2119 ++NumOptimizedImms;
2120
2121 // Replicate the element across the register width.
2122 while (EltSize < Size) {
2123 NewImm |= NewImm << EltSize;
2124 EltSize *= 2;
2125 }
2126
2127 (void)OldImm;
2128 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2129 "demanded bits should never be altered");
2130 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2131
2132 // Create the new constant immediate node.
2133 EVT VT = Op.getValueType();
2134 SDLoc DL(Op);
2135 SDValue New;
2136
2137 // If the new constant immediate is all-zeros or all-ones, let the target
2138 // independent DAG combine optimize this node.
2139 if (NewImm == 0 || NewImm == OrigMask) {
2140 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2141 TLO.DAG.getConstant(NewImm, DL, VT));
2142 // Otherwise, create a machine node so that target independent DAG combine
2143 // doesn't undo this optimization.
2144 } else {
2146 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2147 New = SDValue(
2148 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2149 }
2150
2151 return TLO.CombineTo(Op, New);
2152}
2153
2155 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2156 TargetLoweringOpt &TLO) const {
2157 // Delay this optimization to as late as possible.
2158 if (!TLO.LegalOps)
2159 return false;
2160
2162 return false;
2163
2164 EVT VT = Op.getValueType();
2165 if (VT.isVector())
2166 return false;
2167
2168 unsigned Size = VT.getSizeInBits();
2169 assert((Size == 32 || Size == 64) &&
2170 "i32 or i64 is expected after legalization.");
2171
2172 // Exit early if we demand all bits.
2173 if (DemandedBits.popcount() == Size)
2174 return false;
2175
2176 unsigned NewOpc;
2177 switch (Op.getOpcode()) {
2178 default:
2179 return false;
2180 case ISD::AND:
2181 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2182 break;
2183 case ISD::OR:
2184 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2185 break;
2186 case ISD::XOR:
2187 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2188 break;
2189 }
2190 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2191 if (!C)
2192 return false;
2193 uint64_t Imm = C->getZExtValue();
2194 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2195}
2196
2197/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2198/// Mask are known to be either zero or one and return them in Known.
2200 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2201 const SelectionDAG &DAG, unsigned Depth) const {
2202 switch (Op.getOpcode()) {
2203 default:
2204 break;
2205 case AArch64ISD::DUP: {
2206 SDValue SrcOp = Op.getOperand(0);
2207 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2208 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2209 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2210 "Expected DUP implicit truncation");
2211 Known = Known.trunc(Op.getScalarValueSizeInBits());
2212 }
2213 break;
2214 }
2215 case AArch64ISD::CSEL: {
2216 KnownBits Known2;
2217 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2218 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2219 Known = Known.intersectWith(Known2);
2220 break;
2221 }
2222 case AArch64ISD::BICi: {
2223 // Compute the bit cleared value.
2224 uint64_t Mask =
2225 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
2226 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2227 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2228 break;
2229 }
2230 case AArch64ISD::VLSHR: {
2231 KnownBits Known2;
2232 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2233 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2234 Known = KnownBits::lshr(Known, Known2);
2235 break;
2236 }
2237 case AArch64ISD::VASHR: {
2238 KnownBits Known2;
2239 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2240 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2241 Known = KnownBits::ashr(Known, Known2);
2242 break;
2243 }
2244 case AArch64ISD::VSHL: {
2245 KnownBits Known2;
2246 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2247 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2248 Known = KnownBits::shl(Known, Known2);
2249 break;
2250 }
2251 case AArch64ISD::MOVI: {
2253 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2254 break;
2255 }
2257 case AArch64ISD::ADDlow: {
2258 if (!Subtarget->isTargetILP32())
2259 break;
2260 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2261 Known.Zero = APInt::getHighBitsSet(64, 32);
2262 break;
2263 }
2265 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2266 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2267 break;
2268 }
2270 Intrinsic::ID IntID =
2271 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2272 switch (IntID) {
2273 default: return;
2274 case Intrinsic::aarch64_ldaxr:
2275 case Intrinsic::aarch64_ldxr: {
2276 unsigned BitWidth = Known.getBitWidth();
2277 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2278 unsigned MemBits = VT.getScalarSizeInBits();
2279 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2280 return;
2281 }
2282 }
2283 break;
2284 }
2286 case ISD::INTRINSIC_VOID: {
2287 unsigned IntNo = Op.getConstantOperandVal(0);
2288 switch (IntNo) {
2289 default:
2290 break;
2291 case Intrinsic::aarch64_neon_uaddlv: {
2292 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2293 unsigned BitWidth = Known.getBitWidth();
2294 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2295 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2296 assert(BitWidth >= Bound && "Unexpected width!");
2298 Known.Zero |= Mask;
2299 }
2300 break;
2301 }
2302 case Intrinsic::aarch64_neon_umaxv:
2303 case Intrinsic::aarch64_neon_uminv: {
2304 // Figure out the datatype of the vector operand. The UMINV instruction
2305 // will zero extend the result, so we can mark as known zero all the
2306 // bits larger than the element datatype. 32-bit or larger doesn't need
2307 // this as those are legal types and will be handled by isel directly.
2308 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2309 unsigned BitWidth = Known.getBitWidth();
2310 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2311 assert(BitWidth >= 8 && "Unexpected width!");
2313 Known.Zero |= Mask;
2314 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2315 assert(BitWidth >= 16 && "Unexpected width!");
2317 Known.Zero |= Mask;
2318 }
2319 break;
2320 } break;
2321 }
2322 }
2323 }
2324}
2325
2327 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2328 unsigned Depth) const {
2329 EVT VT = Op.getValueType();
2330 unsigned VTBits = VT.getScalarSizeInBits();
2331 unsigned Opcode = Op.getOpcode();
2332 switch (Opcode) {
2333 case AArch64ISD::CMEQ:
2334 case AArch64ISD::CMGE:
2335 case AArch64ISD::CMGT:
2336 case AArch64ISD::CMHI:
2337 case AArch64ISD::CMHS:
2338 case AArch64ISD::FCMEQ:
2339 case AArch64ISD::FCMGE:
2340 case AArch64ISD::FCMGT:
2341 case AArch64ISD::CMEQz:
2342 case AArch64ISD::CMGEz:
2343 case AArch64ISD::CMGTz:
2344 case AArch64ISD::CMLEz:
2345 case AArch64ISD::CMLTz:
2346 case AArch64ISD::FCMEQz:
2347 case AArch64ISD::FCMGEz:
2348 case AArch64ISD::FCMGTz:
2349 case AArch64ISD::FCMLEz:
2350 case AArch64ISD::FCMLTz:
2351 // Compares return either 0 or all-ones
2352 return VTBits;
2353 }
2354
2355 return 1;
2356}
2357
2359 EVT) const {
2360 return MVT::i64;
2361}
2362
2364 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2365 unsigned *Fast) const {
2366 if (Subtarget->requiresStrictAlign())
2367 return false;
2368
2369 if (Fast) {
2370 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2371 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2372 // See comments in performSTORECombine() for more details about
2373 // these conditions.
2374
2375 // Code that uses clang vector extensions can mark that it
2376 // wants unaligned accesses to be treated as fast by
2377 // underspecifying alignment to be 1 or 2.
2378 Alignment <= 2 ||
2379
2380 // Disregard v2i64. Memcpy lowering produces those and splitting
2381 // them regresses performance on micro-benchmarks and olden/bh.
2382 VT == MVT::v2i64;
2383 }
2384 return true;
2385}
2386
2387// Same as above but handling LLTs instead.
2389 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2390 unsigned *Fast) const {
2391 if (Subtarget->requiresStrictAlign())
2392 return false;
2393
2394 if (Fast) {
2395 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2396 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2397 Ty.getSizeInBytes() != 16 ||
2398 // See comments in performSTORECombine() for more details about
2399 // these conditions.
2400
2401 // Code that uses clang vector extensions can mark that it
2402 // wants unaligned accesses to be treated as fast by
2403 // underspecifying alignment to be 1 or 2.
2404 Alignment <= 2 ||
2405
2406 // Disregard v2i64. Memcpy lowering produces those and splitting
2407 // them regresses performance on micro-benchmarks and olden/bh.
2408 Ty == LLT::fixed_vector(2, 64);
2409 }
2410 return true;
2411}
2412
2413FastISel *
2415 const TargetLibraryInfo *libInfo) const {
2416 return AArch64::createFastISel(funcInfo, libInfo);
2417}
2418
2419const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2420#define MAKE_CASE(V) \
2421 case V: \
2422 return #V;
2423 switch ((AArch64ISD::NodeType)Opcode) {
2425 break;
2742 }
2743#undef MAKE_CASE
2744 return nullptr;
2745}
2746
2749 MachineBasicBlock *MBB) const {
2750 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2751 // phi node:
2752
2753 // OrigBB:
2754 // [... previous instrs leading to comparison ...]
2755 // b.ne TrueBB
2756 // b EndBB
2757 // TrueBB:
2758 // ; Fallthrough
2759 // EndBB:
2760 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2761
2762 MachineFunction *MF = MBB->getParent();
2763 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2764 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2765 DebugLoc DL = MI.getDebugLoc();
2767
2768 Register DestReg = MI.getOperand(0).getReg();
2769 Register IfTrueReg = MI.getOperand(1).getReg();
2770 Register IfFalseReg = MI.getOperand(2).getReg();
2771 unsigned CondCode = MI.getOperand(3).getImm();
2772 bool NZCVKilled = MI.getOperand(4).isKill();
2773
2774 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2775 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2776 MF->insert(It, TrueBB);
2777 MF->insert(It, EndBB);
2778
2779 // Transfer rest of current basic-block to EndBB
2780 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2781 MBB->end());
2783
2784 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2785 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2786 MBB->addSuccessor(TrueBB);
2787 MBB->addSuccessor(EndBB);
2788
2789 // TrueBB falls through to the end.
2790 TrueBB->addSuccessor(EndBB);
2791
2792 if (!NZCVKilled) {
2793 TrueBB->addLiveIn(AArch64::NZCV);
2794 EndBB->addLiveIn(AArch64::NZCV);
2795 }
2796
2797 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2798 .addReg(IfTrueReg)
2799 .addMBB(TrueBB)
2800 .addReg(IfFalseReg)
2801 .addMBB(MBB);
2802
2803 MI.eraseFromParent();
2804 return EndBB;
2805}
2806
2808 MachineInstr &MI, MachineBasicBlock *BB) const {
2810 BB->getParent()->getFunction().getPersonalityFn())) &&
2811 "SEH does not use catchret!");
2812 return BB;
2813}
2814
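// Expands a dynamic stack-probing pseudo: emit a probing loop that walks the
// stack pointer down to the target value held in the pseudo's register
// operand, then continue with the instruction following the loop.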
2817 MachineBasicBlock *MBB) const {
2818 MachineFunction &MF = *MBB->getParent();
2819 MachineBasicBlock::iterator MBBI = MI.getIterator();
2821 const AArch64InstrInfo &TII =
2822 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2823 Register TargetReg = MI.getOperand(0).getReg();
2825 TII.probedStackAlloc(MBBI, TargetReg, false);
2826
2827 MI.eraseFromParent();
2828 return NextInst->getParent();
2829}
2830
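/// Lower an SME tile load pseudo into the real \p Opc instruction, using the
/// pseudo's first operand (the tile index) to select the ZA tile register
/// relative to \p BaseReg and forwarding the slice, predicate and address
/// operands.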
2832AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2834 MachineBasicBlock *BB) const {
2835 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2836 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2837
2838 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2839 MIB.add(MI.getOperand(1)); // slice index register
2840 MIB.add(MI.getOperand(2)); // slice index offset
2841 MIB.add(MI.getOperand(3)); // pg
2842 MIB.add(MI.getOperand(4)); // base
2843 MIB.add(MI.getOperand(5)); // offset
2844
2845 MI.eraseFromParent(); // The pseudo is gone now.
2846 return BB;
2847}
2848
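// Expands the SME fill pseudo into LDR_ZA: ZA is defined, and the vector
// select register/offset plus the base pointer are forwarded, with the
// immediate offset mirroring the vector select offset.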
2851 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2853 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2854
2855 MIB.addReg(AArch64::ZA, RegState::Define);
2856 MIB.add(MI.getOperand(0)); // Vector select register
2857 MIB.add(MI.getOperand(1)); // Vector select offset
2858 MIB.add(MI.getOperand(2)); // Base
2859 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2860
2861 MI.eraseFromParent(); // The pseudo is gone now.
2862 return BB;
2863}
2864
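// Re-emits the pseudo as \p Opcode, making the first operand a definition
// when \p Op0IsDef is set and forwarding the remaining operands unchanged.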
2867 unsigned Opcode,
2868 bool Op0IsDef) const {
2869 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2871
2872 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
2873 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
2874 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2875 MIB.add(MI.getOperand(I));
2876
2877 MI.eraseFromParent(); // The pseudo is gone now.
2878 return BB;
2879}
2880
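/// Lower an SME ZA pseudo into \p Opc. When \p HasTile is set, the first
/// operand selects a tile relative to \p BaseReg and is added as a tied
/// def/use pair; otherwise the whole ZA base register is tied instead.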
2882AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2884 MachineBasicBlock *BB, bool HasTile) const {
2885 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2886 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2887 unsigned StartIdx = 0;
2888
2889 if (HasTile) {
2890 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2891 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2892 StartIdx = 1;
2893 } else
2894 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
2895
2896 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2897 MIB.add(MI.getOperand(I));
2898
2899 MI.eraseFromParent(); // The pseudo is gone now.
2900 return BB;
2901}
2902
2903MachineBasicBlock *AArch64TargetLowering::EmitZero(MachineInstr &MI,
2904 MachineBasicBlock *BB) const {
2905 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2906 MachineInstrBuilder MIB =
2907 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2908 MIB.add(MI.getOperand(0)); // Mask
2909
2910 unsigned Mask = MI.getOperand(0).getImm();
2911 for (unsigned I = 0; I < 8; I++) {
2912 if (Mask & (1 << I))
2913 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2914 }
2915
2916 MI.eraseFromParent(); // The pseudo is gone now.
2917 return BB;
2918}
2919
2920MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2921 MachineInstr &MI, MachineBasicBlock *BB) const {
2922
2923 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
2924 if (SMEOrigInstr != -1) {
2925 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2926 uint64_t SMEMatrixType =
2927 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
2928 switch (SMEMatrixType) {
2929 case (AArch64::SMEMatrixArray):
2930 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
2931 case (AArch64::SMEMatrixTileB):
2932 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
2933 case (AArch64::SMEMatrixTileH):
2934 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
2935 case (AArch64::SMEMatrixTileS):
2936 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
2937 case (AArch64::SMEMatrixTileD):
2938 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
2939 case (AArch64::SMEMatrixTileQ):
2940 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
2941 }
2942 }
2943
2944 switch (MI.getOpcode()) {
2945 default:
2946#ifndef NDEBUG
2947 MI.dump();
2948#endif
2949 llvm_unreachable("Unexpected instruction for custom inserter!");
2950
2951 case AArch64::F128CSEL:
2952 return EmitF128CSEL(MI, BB);
2953 case TargetOpcode::STATEPOINT:
2954 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
2955 // while bl call instruction (where statepoint will be lowered at the end)
2956 // has implicit def. This def is early-clobber as it will be set at
2957 // the moment of the call and earlier than any use is read.
2958 // Add this implicit dead def here as a workaround.
2959 MI.addOperand(*MI.getMF(),
2960 MachineOperand::CreateReg(
2961 AArch64::LR, /*isDef*/ true,
2962 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2963 /*isUndef*/ false, /*isEarlyClobber*/ true));
2964 [[fallthrough]];
2965 case TargetOpcode::STACKMAP:
2966 case TargetOpcode::PATCHPOINT:
2967 return emitPatchPoint(MI, BB);
2968
2969 case TargetOpcode::PATCHABLE_EVENT_CALL:
2970 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
2971 return BB;
2972
2973 case AArch64::CATCHRET:
2974 return EmitLoweredCatchRet(MI, BB);
2975
2976 case AArch64::PROBED_STACKALLOC_DYN:
2977 return EmitDynamicProbedAlloc(MI, BB);
2978
2979 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2980 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2981 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2982 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2983 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2984 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2985 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2986 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2987 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2988 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2989 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2990 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2991 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2992 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2993 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2994 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2995 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2996 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2997 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
2998 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
2999 case AArch64::LDR_ZA_PSEUDO:
3000 return EmitFill(MI, BB);
3001 case AArch64::LDR_TX_PSEUDO:
3002 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3003 case AArch64::STR_TX_PSEUDO:
3004 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3005 case AArch64::ZERO_M_PSEUDO:
3006 return EmitZero(MI, BB);
3007 case AArch64::ZERO_T_PSEUDO:
3008 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3009 }
3010}
3011
3012//===----------------------------------------------------------------------===//
3013// AArch64 Lowering private implementation.
3014//===----------------------------------------------------------------------===//
3015
3016//===----------------------------------------------------------------------===//
3017// Lowering Code
3018//===----------------------------------------------------------------------===//
3019
3020// Forward declarations of SVE fixed length lowering helpers
3021static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
3022static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
3023static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
3024static SDValue convertFixedMaskToScalableVector(SDValue Mask,
3025 SelectionDAG &DAG);
3028 EVT VT);
3029
3030/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3031static bool isZerosVector(const SDNode *N) {
3032 // Look through a bit convert.
3033 while (N->getOpcode() == ISD::BITCAST)
3034 N = N->getOperand(0).getNode();
3035
3036 if (ISD::isConstantSplatVectorAllZeros(N))
3037 return true;
3038
3039 if (N->getOpcode() != AArch64ISD::DUP)
3040 return false;
3041
3042 auto Opnd0 = N->getOperand(0);
3043 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3044}
3045
3046/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3047/// CC
3048static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3049 switch (CC) {
3050 default:
3051 llvm_unreachable("Unknown condition code!");
3052 case ISD::SETNE:
3053 return AArch64CC::NE;
3054 case ISD::SETEQ:
3055 return AArch64CC::EQ;
3056 case ISD::SETGT:
3057 return AArch64CC::GT;
3058 case ISD::SETGE:
3059 return AArch64CC::GE;
3060 case ISD::SETLT:
3061 return AArch64CC::LT;
3062 case ISD::SETLE:
3063 return AArch64CC::LE;
3064 case ISD::SETUGT:
3065 return AArch64CC::HI;
3066 case ISD::SETUGE:
3067 return AArch64CC::HS;
3068 case ISD::SETULT:
3069 return AArch64CC::LO;
3070 case ISD::SETULE:
3071 return AArch64CC::LS;
3072 }
3073}
3074
3075/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3076static void changeFPCCToAArch64CC(ISD::CondCode CC,
3077 AArch64CC::CondCode &CondCode,
3078 AArch64CC::CondCode &CondCode2) {
3079 CondCode2 = AArch64CC::AL;
3080 switch (CC) {
3081 default:
3082 llvm_unreachable("Unknown FP condition!");
3083 case ISD::SETEQ:
3084 case ISD::SETOEQ:
3085 CondCode = AArch64CC::EQ;
3086 break;
3087 case ISD::SETGT:
3088 case ISD::SETOGT:
3089 CondCode = AArch64CC::GT;
3090 break;
3091 case ISD::SETGE:
3092 case ISD::SETOGE:
3093 CondCode = AArch64CC::GE;
3094 break;
3095 case ISD::SETOLT:
3096 CondCode = AArch64CC::MI;
3097 break;
3098 case ISD::SETOLE:
3099 CondCode = AArch64CC::LS;
3100 break;
3101 case ISD::SETONE:
3102 CondCode = AArch64CC::MI;
3103 CondCode2 = AArch64CC::GT;
3104 break;
3105 case ISD::SETO:
3106 CondCode = AArch64CC::VC;
3107 break;
3108 case ISD::SETUO:
3109 CondCode = AArch64CC::VS;
3110 break;
3111 case ISD::SETUEQ:
3112 CondCode = AArch64CC::EQ;
3113 CondCode2 = AArch64CC::VS;
3114 break;
3115 case ISD::SETUGT:
3116 CondCode = AArch64CC::HI;
3117 break;
3118 case ISD::SETUGE:
3119 CondCode = AArch64CC::PL;
3120 break;
3121 case ISD::SETLT:
3122 case ISD::SETULT:
3123 CondCode = AArch64CC::LT;
3124 break;
3125 case ISD::SETLE:
3126 case ISD::SETULE:
3127 CondCode = AArch64CC::LE;
3128 break;
3129 case ISD::SETNE:
3130 case ISD::SETUNE:
3131 CondCode = AArch64CC::NE;
3132 break;
3133 }
3134}
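// Illustrative note (a sketch): some unordered conditions need both returned
// codes, e.g. SETUEQ maps to {EQ, VS} above, so a branch on "a ueq b" is only
// correct if it is taken when either EQ or VS holds (two conditional branches,
// or a pair of CSELs when materializing a value).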
3135
3136/// Convert a DAG fp condition code to an AArch64 CC.
3137/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3138/// should be AND'ed instead of OR'ed.
3139static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3140 AArch64CC::CondCode &CondCode,
3141 AArch64CC::CondCode &CondCode2) {
3142 CondCode2 = AArch64CC::AL;
3143 switch (CC) {
3144 default:
3145 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3146 assert(CondCode2 == AArch64CC::AL);
3147 break;
3148 case ISD::SETONE:
3149 // (a one b)
3150 // == ((a olt b) || (a ogt b))
3151 // == ((a ord b) && (a une b))
3152 CondCode = AArch64CC::VC;
3153 CondCode2 = AArch64CC::NE;
3154 break;
3155 case ISD::SETUEQ:
3156 // (a ueq b)
3157 // == ((a uno b) || (a oeq b))
3158 // == ((a ule b) && (a uge b))
3159 CondCode = AArch64CC::PL;
3160 CondCode2 = AArch64CC::LE;
3161 break;
3162 }
3163}
3164
3165/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3166/// CC usable with the vector instructions. Fewer operations are available
3167/// without a real NZCV register, so we have to use less efficient combinations
3168/// to get the same effect.
3169static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3170 AArch64CC::CondCode &CondCode,
3171 AArch64CC::CondCode &CondCode2,
3172 bool &Invert) {
3173 Invert = false;
3174 switch (CC) {
3175 default:
3176 // Mostly the scalar mappings work fine.
3177 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3178 break;
3179 case ISD::SETUO:
3180 Invert = true;
3181 [[fallthrough]];
3182 case ISD::SETO:
3183 CondCode = AArch64CC::MI;
3184 CondCode2 = AArch64CC::GE;
3185 break;
3186 case ISD::SETUEQ:
3187 case ISD::SETULT:
3188 case ISD::SETULE:
3189 case ISD::SETUGT:
3190 case ISD::SETUGE:
3191 // All of the compare-mask comparisons are ordered, but we can switch
3192 // between the two by a double inversion. E.g. ULE == !OGT.
3193 Invert = true;
3194 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3195 CondCode, CondCode2);
3196 break;
3197 }
3198}
3199
3200static bool isLegalArithImmed(uint64_t C) {
3201 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3202 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3203 LLVM_DEBUG(dbgs() << "Is imm " << C
3204 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3205 return IsLegal;
3206}
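// Illustrative sketch: a legal arithmetic immediate is a 12-bit value,
// optionally shifted left by 12 bits. So C = 4095 (0xfff) and C = 0x456000
// are legal, while C = 0x1001 is not and would have to be materialized into a
// register first.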
3207
3208// Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
3209// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3210// can be set differently by this operation. It comes down to whether
3211// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3212// everything is fine. If not then the optimization is wrong. Thus general
3213// comparisons are only valid if op2 != 0.
3214//
3215// So, finally, the only LLVM-native comparisons that don't mention C and V
3216// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
3217// the absence of information about op2.
3218static bool isCMN(SDValue Op, ISD::CondCode CC) {
3219 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3220 (CC == ISD::SETEQ || CC == ISD::SETNE);
3221}
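// Illustrative sketch of the op2 == 0 hazard described above: CMP a, #0
// computes a + ~0 + 1 and therefore always sets C, while CMN a, #0 computes
// a + 0 and never sets C, so unsigned conditions such as HS/LO would flip.
// Z (and hence EQ/NE) is identical in both cases, which is why only SETEQ and
// SETNE are accepted here.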
3222
3223static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3224 SelectionDAG &DAG, SDValue Chain,
3225 bool IsSignaling) {
3226 EVT VT = LHS.getValueType();
3227 assert(VT != MVT::f128);
3228
3229 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3230
3231 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3232 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3233 {Chain, LHS});
3234 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3235 {LHS.getValue(1), RHS});
3236 Chain = RHS.getValue(1);
3237 VT = MVT::f32;
3238 }
3239 unsigned Opcode =
3240 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3241 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
3242}
3243
3244static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3245 const SDLoc &dl, SelectionDAG &DAG) {
3246 EVT VT = LHS.getValueType();
3247 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3248
3249 if (VT.isFloatingPoint()) {
3250 assert(VT != MVT::f128);
3251 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3252 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3253 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3254 VT = MVT::f32;
3255 }
3256 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
3257 }
3258
3259 // The CMP instruction is just an alias for SUBS, and representing it as
3260 // SUBS means that it's possible to get CSE with subtract operations.
3261 // A later phase can perform the optimization of setting the destination
3262 // register to WZR/XZR if it ends up being unused.
3263 unsigned Opcode = AArch64ISD::SUBS;
3264
3265 if (isCMN(RHS, CC)) {
3266 // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
3267 Opcode = AArch64ISD::ADDS;
3268 RHS = RHS.getOperand(1);
3269 } else if (isCMN(LHS, CC)) {
3270 // As we are looking for EQ/NE compares, the operands can be commuted; can
3271 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
3272 Opcode = AArch64ISD::ADDS;
3273 LHS = LHS.getOperand(1);
3274 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3275 if (LHS.getOpcode() == ISD::AND) {
3276 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3277 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3278 // of the signed comparisons.
3279 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3280 DAG.getVTList(VT, MVT_CC),
3281 LHS.getOperand(0),
3282 LHS.getOperand(1));
3283 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3284 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3285 return ANDSNode.getValue(1);
3286 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3287 // Use result of ANDS
3288 return LHS.getValue(1);
3289 }
3290 }
3291
3292 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3293 .getValue(1);
3294}
3295
3296/// \defgroup AArch64CCMP CMP;CCMP matching
3297///
3298/// These functions deal with the formation of CMP;CCMP;... sequences.
3299/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3300/// a comparison. They set the NZCV flags to a predefined value if their
3301/// predicate is false. This allows us to express arbitrary conjunctions, for
3302/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3303/// expressed as:
3304/// cmp A
3305/// ccmp B, inv(CB), CA
3306/// check for CB flags
3307///
3308/// This naturally lets us implement chains of AND operations with SETCC
3309/// operands. And we can even implement some other situations by transforming
3310/// them:
3311/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3312/// negating the flags used in a CCMP/FCCMP operations.
3313/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3314/// by negating the flags we test for afterwards. i.e.
3315/// NEG (CMP CCMP CCCMP ...) can be implemented.
3316/// - Note that we can only ever negate all previously processed results.
3317/// What we can not implement by flipping the flags to test is a negation
3318/// of two sub-trees (because the negation affects all sub-trees emitted so
3319/// far, so the 2nd sub-tree we emit would also affect the first).
3320/// With those tools we can implement some OR operations:
3321/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3322/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3323/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3324/// elimination rules from earlier to implement the whole thing as a
3325/// CCMP/FCCMP chain.
3326///
3327/// As complete example:
3328/// or (or (setCA (cmp A)) (setCB (cmp B)))
3329/// (and (setCC (cmp C)) (setCD (cmp D)))"
3330/// can be reassociated to:
3331/// or (and (setCC (cmp C)) setCD (cmp D))
3332/// (or (setCA (cmp A)) (setCB (cmp B)))
3333/// can be transformed to:
3334/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3335/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3336/// which can be implemented as:
3337/// cmp C
3338/// ccmp D, inv(CD), CC
3339/// ccmp A, CA, inv(CD)
3340/// ccmp B, CB, inv(CA)
3341/// check for CB flags
3342///
3343/// A counterexample is "or (and A B) (and C D)" which translates to
3344/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
3345/// can only implement one of the inner (not) operations, but not both!
3346/// @{
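/// As a concrete sketch (registers and constants picked arbitrarily), a
/// condition such as "a == 0 && b > 17" can be emitted as:
///   cmp  w0, #0
///   ccmp w1, #17, #4, eq
///   b.gt <taken>
/// where the #4 immediate (Z set) makes the final GT test fail whenever the
/// first compare already failed.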
3347
3348/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3349static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3350 ISD::CondCode CC, SDValue CCOp,
3351 AArch64CC::CondCode Predicate,
3352 AArch64CC::CondCode OutCC,
3353 const SDLoc &DL, SelectionDAG &DAG) {
3354 unsigned Opcode = 0;
3355 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3356
3357 if (LHS.getValueType().isFloatingPoint()) {
3358 assert(LHS.getValueType() != MVT::f128);
3359 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3360 LHS.getValueType() == MVT::bf16) {
3361 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3362 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3363 }
3364 Opcode = AArch64ISD::FCCMP;
3365 } else if (RHS.getOpcode() == ISD::SUB) {
3366 SDValue SubOp0 = RHS.getOperand(0);
3367 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3368 // See emitComparison() on why we can only do this for SETEQ and SETNE.
3369 Opcode = AArch64ISD::CCMN;
3370 RHS = RHS.getOperand(1);
3371 }
3372 }
3373 if (Opcode == 0)
3374 Opcode = AArch64ISD::CCMP;
3375
3376 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3377 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3378 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3379 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3380 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3381}
3382
3383/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3384/// expressed as a conjunction. See \ref AArch64CCMP.
3385/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3386/// changing the conditions on the SETCC tests.
3387/// (this means we can call emitConjunctionRec() with
3388/// Negate==true on this sub-tree)
3389/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3390/// cannot do the negation naturally. We are required to
3391/// emit the subtree first in this case.
3392/// \param WillNegate Is true if we are called when the result of this
3393/// subexpression must be negated. This happens when the
3394/// outer expression is an OR. We can use this fact to know
3395/// that we have a double negation (or (or ...) ...) that
3396/// can be implemented for free.
3397static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3398 bool &MustBeFirst, bool WillNegate,
3399 unsigned Depth = 0) {
3400 if (!Val.hasOneUse())
3401 return false;
3402 unsigned Opcode = Val->getOpcode();
3403 if (Opcode == ISD::SETCC) {
3404 if (Val->getOperand(0).getValueType() == MVT::f128)
3405 return false;
3406 CanNegate = true;
3407 MustBeFirst = false;
3408 return true;
3409 }
3410 // Protect against exponential runtime and stack overflow.
3411 if (Depth > 6)
3412 return false;
3413 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3414 bool IsOR = Opcode == ISD::OR;
3415 SDValue O0 = Val->getOperand(0);
3416 SDValue O1 = Val->getOperand(1);
3417 bool CanNegateL;
3418 bool MustBeFirstL;
3419 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3420 return false;
3421 bool CanNegateR;
3422 bool MustBeFirstR;
3423 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3424 return false;
3425
3426 if (MustBeFirstL && MustBeFirstR)
3427 return false;
3428
3429 if (IsOR) {
3430 // For an OR expression we need to be able to naturally negate at least
3431 // one side or we cannot do the transformation at all.
3432 if (!CanNegateL && !CanNegateR)
3433 return false;
3434 // If the result of the OR will be negated and we can naturally negate
3435 // the leaves, then this sub-tree as a whole negates naturally.
3436 CanNegate = WillNegate && CanNegateL && CanNegateR;
3437 // If we cannot naturally negate the whole sub-tree, then this must be
3438 // emitted first.
3439 MustBeFirst = !CanNegate;
3440 } else {
3441 assert(Opcode == ISD::AND && "Must be OR or AND");
3442 // We cannot naturally negate an AND operation.
3443 CanNegate = false;
3444 MustBeFirst = MustBeFirstL || MustBeFirstR;
3445 }
3446 return true;
3447 }
3448 return false;
3449}
3450
3451/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3452/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
3453/// Tries to transform the given i1 producing node @p Val to a series of compare
3454/// and conditional compare operations. @returns an NZCV flags producing node
3455/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
3456/// the transformation was not possible.
3457/// \p Negate is true if we want this sub-tree being negated just by changing
3458/// SETCC conditions.
3459static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3460 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3461 AArch64CC::CondCode Predicate) {
3462 // We're at a tree leaf, produce a conditional comparison operation.
3463 unsigned Opcode = Val->getOpcode();
3464 if (Opcode == ISD::SETCC) {
3465 SDValue LHS = Val->getOperand(0);
3466 SDValue RHS = Val->getOperand(1);
3467 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3468 bool isInteger = LHS.getValueType().isInteger();
3469 if (Negate)
3470 CC = getSetCCInverse(CC, LHS.getValueType());
3471 SDLoc DL(Val);
3472 // Determine OutCC and handle FP special case.
3473 if (isInteger) {
3474 OutCC = changeIntCCToAArch64CC(CC);
3475 } else {
3476 assert(LHS.getValueType().isFloatingPoint());
3477 AArch64CC::CondCode ExtraCC;
3478 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3479 // Some floating point conditions can't be tested with a single condition
3480 // code. Construct an additional comparison in this case.
3481 if (ExtraCC != AArch64CC::AL) {
3482 SDValue ExtraCmp;
3483 if (!CCOp.getNode())
3484 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3485 else
3486 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3487 ExtraCC, DL, DAG);
3488 CCOp = ExtraCmp;
3489 Predicate = ExtraCC;
3490 }
3491 }
3492
3493 // Produce a normal comparison if we are first in the chain
3494 if (!CCOp)
3495 return emitComparison(LHS, RHS, CC, DL, DAG);
3496 // Otherwise produce a ccmp.
3497 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3498 DAG);
3499 }
3500 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3501
3502 bool IsOR = Opcode == ISD::OR;
3503
3504 SDValue LHS = Val->getOperand(0);
3505 bool CanNegateL;
3506 bool MustBeFirstL;
3507 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3508 assert(ValidL && "Valid conjunction/disjunction tree");
3509 (void)ValidL;
3510
3511 SDValue RHS = Val->getOperand(1);
3512 bool CanNegateR;
3513 bool MustBeFirstR;
3514 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3515 assert(ValidR && "Valid conjunction/disjunction tree");
3516 (void)ValidR;
3517
3518 // Swap sub-tree that must come first to the right side.
3519 if (MustBeFirstL) {
3520 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3521 std::swap(LHS, RHS);
3522 std::swap(CanNegateL, CanNegateR);
3523 std::swap(MustBeFirstL, MustBeFirstR);
3524 }
3525
3526 bool NegateR;
3527 bool NegateAfterR;
3528 bool NegateL;
3529 bool NegateAfterAll;
3530 if (Opcode == ISD::OR) {
3531 // Swap the sub-tree that we can negate naturally to the left.
3532 if (!CanNegateL) {
3533 assert(CanNegateR && "at least one side must be negatable");
3534 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3535 assert(!Negate);
3536 std::swap(LHS, RHS);
3537 NegateR = false;
3538 NegateAfterR = true;
3539 } else {
3540 // Negate the left sub-tree if possible, otherwise negate the result.
3541 NegateR = CanNegateR;
3542 NegateAfterR = !CanNegateR;
3543 }
3544 NegateL = true;
3545 NegateAfterAll = !Negate;
3546 } else {
3547 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3548 assert(!Negate && "Valid conjunction/disjunction tree");
3549
3550 NegateL = false;
3551 NegateR = false;
3552 NegateAfterR = false;
3553 NegateAfterAll = false;
3554 }
3555
3556 // Emit sub-trees.
3557 AArch64CC::CondCode RHSCC;
3558 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3559 if (NegateAfterR)
3560 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3561 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3562 if (NegateAfterAll)
3563 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3564 return CmpL;
3565}
3566
3567/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
3568/// In some cases this is even possible with OR operations in the expression.
3569/// See \ref AArch64CCMP.
3570/// \see emitConjunctionRec().
3571static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3572 AArch64CC::CondCode &OutCC) {
3573 bool DummyCanNegate;
3574 bool DummyMustBeFirst;
3575 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3576 return SDValue();
3577
3578 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3579}
3580
3581/// @}
3582
3583/// Returns how profitable it is to fold a comparison's operand's shift and/or
3584/// extension operations.
3586 auto isSupportedExtend = [&](SDValue V) {
3587 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3588 return true;
3589
3590 if (V.getOpcode() == ISD::AND)
3591 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3592 uint64_t Mask = MaskCst->getZExtValue();
3593 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3594 }
3595
3596 return false;
3597 };
3598
3599 if (!Op.hasOneUse())
3600 return 0;
3601
3602 if (isSupportedExtend(Op))
3603 return 1;
3604
3605 unsigned Opc = Op.getOpcode();
3606 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3607 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3608 uint64_t Shift = ShiftCst->getZExtValue();
3609 if (isSupportedExtend(Op.getOperand(0)))
3610 return (Shift <= 4) ? 2 : 1;
3611 EVT VT = Op.getValueType();
3612 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3613 return 1;
3614 }
3615
3616 return 0;
3617}
3618
3619static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3620 SDValue &AArch64cc, SelectionDAG &DAG,
3621 const SDLoc &dl) {
3622 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3623 EVT VT = RHS.getValueType();
3624 uint64_t C = RHSC->getZExtValue();
3625 if (!isLegalArithImmed(C)) {
3626 // Constant does not fit, try adjusting it by one?
3627 switch (CC) {
3628 default:
3629 break;
3630 case ISD::SETLT:
3631 case ISD::SETGE:
3632 if ((VT == MVT::i32 && C != 0x80000000 &&
3633 isLegalArithImmed((uint32_t)(C - 1))) ||
3634 (VT == MVT::i64 && C != 0x80000000ULL &&
3635 isLegalArithImmed(C - 1ULL))) {
3636 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3637 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3638 RHS = DAG.getConstant(C, dl, VT);
3639 }
3640 break;
3641 case ISD::SETULT:
3642 case ISD::SETUGE:
3643 if ((VT == MVT::i32 && C != 0 &&
3644 isLegalArithImmed((uint32_t)(C - 1))) ||
3645 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3646 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3647 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3648 RHS = DAG.getConstant(C, dl, VT);
3649 }
3650 break;
3651 case ISD::SETLE:
3652 case ISD::SETGT:
3653 if ((VT == MVT::i32 && C != INT32_MAX &&
3654 isLegalArithImmed((uint32_t)(C + 1))) ||
3655 (VT == MVT::i64 && C != INT64_MAX &&
3656 isLegalArithImmed(C + 1ULL))) {
3657 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3658 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3659 RHS = DAG.getConstant(C, dl, VT);
3660 }
3661 break;
3662 case ISD::SETULE:
3663 case ISD::SETUGT:
3664 if ((VT == MVT::i32 && C != UINT32_MAX &&
3665 isLegalArithImmed((uint32_t)(C + 1))) ||
3666 (VT == MVT::i64 && C != UINT64_MAX &&
3667 isLegalArithImmed(C + 1ULL))) {
3668 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3669 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3670 RHS = DAG.getConstant(C, dl, VT);
3671 }
3672 break;
3673 }
3674 }
3675 }
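  // Illustrative sketch of the adjustment above: "x < 0x1001" uses an
  // immediate no ADDS/SUBS encoding accepts, but rewriting it as
  // "x <= 0x1000" (SETLT -> SETLE, C - 1) gives a legal immediate
  // (#1, lsl #12), so a single cmp still suffices.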
3676
3677 // Comparisons are canonicalized so that the RHS operand is simpler than the
3678 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3679 // can fold some shift+extend operations on the RHS operand, so swap the
3680 // operands if that can be done.
3681 //
3682 // For example:
3683 // lsl w13, w11, #1
3684 // cmp w13, w12
3685 // can be turned into:
3686 // cmp w12, w11, lsl #1
3687 if (!isa<ConstantSDNode>(RHS) || !isLegalArithImmed(RHS->getAsZExtVal())) {
3688 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3689
3690 if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
3691 std::swap(LHS, RHS);
3692 CC = ISD::getSetCCSwappedOperands(CC);
3693 }
3694 }
3695
3696 SDValue Cmp;
3697 AArch64CC::CondCode AArch64CC;
3698 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3699 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3700
3701 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3702 // For the i8 operand, the largest immediate is 255, so this can be easily
3703 // encoded in the compare instruction. For the i16 operand, however, the
3704 // largest immediate cannot be encoded in the compare.
3705 // Therefore, use a sign extending load and cmn to avoid materializing the
3706 // -1 constant. For example,
3707 // movz w1, #65535
3708 // ldrh w0, [x0, #0]
3709 // cmp w0, w1
3710 // >
3711 // ldrsh w0, [x0, #0]
3712 // cmn w0, #1
3713 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3714 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3715 // ensure both the LHS and RHS are truly zero extended and to make sure the
3716 // transformation is profitable.
3717 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3718 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3719 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3720 LHS.getNode()->hasNUsesOfValue(1, 0)) {
3721 int16_t ValueofRHS = RHS->getAsZExtVal();
3722 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3723 SDValue SExt =
3724 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3725 DAG.getValueType(MVT::i16));
3726 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3727 RHS.getValueType()),
3728 CC, dl, DAG);
3729 AArch64CC = changeIntCCToAArch64CC(CC);
3730 }
3731 }
3732
3733 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3734 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3735 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3736 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3737 }
3738 }
3739 }
3740
3741 if (!Cmp) {
3742 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3743 AArch64CC = changeIntCCToAArch64CC(CC);
3744 }
3745 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3746 return Cmp;
3747}
3748
3749static std::pair<SDValue, SDValue>
3750getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3751 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3752 "Unsupported value type");
3753 SDValue Value, Overflow;
3754 SDLoc DL(Op);
3755 SDValue LHS = Op.getOperand(0);
3756 SDValue RHS = Op.getOperand(1);
3757 unsigned Opc = 0;
3758 switch (Op.getOpcode()) {
3759 default:
3760 llvm_unreachable("Unknown overflow instruction!");
3761 case ISD::SADDO:
3762 Opc = AArch64ISD::ADDS;
3763 CC = AArch64CC::VS;
3764 break;
3765 case ISD::UADDO:
3766 Opc = AArch64ISD::ADDS;
3767 CC = AArch64CC::HS;
3768 break;
3769 case ISD::SSUBO:
3770 Opc = AArch64ISD::SUBS;
3771 CC = AArch64CC::VS;
3772 break;
3773 case ISD::USUBO:
3774 Opc = AArch64ISD::SUBS;
3775 CC = AArch64CC::LO;
3776 break;
3777 // Multiply needs a little bit extra work.
3778 case ISD::SMULO:
3779 case ISD::UMULO: {
3780 CC = AArch64CC::NE;
3781 bool IsSigned = Op.getOpcode() == ISD::SMULO;
3782 if (Op.getValueType() == MVT::i32) {
3783 // Extend to 64-bits, then perform a 64-bit multiply.
3784 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3785 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3786 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3787 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3788 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3789
3790 // Check that the result fits into a 32-bit integer.
3791 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3792 if (IsSigned) {
3793 // cmp xreg, wreg, sxtw
3794 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3795 Overflow =
3796 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3797 } else {
3798 // tst xreg, #0xffffffff00000000
3799 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3800 Overflow =
3801 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3802 }
3803 break;
3804 }
3805 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3806 // For the 64 bit multiply
3807 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3808 if (IsSigned) {
3809 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3810 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3811 DAG.getConstant(63, DL, MVT::i64));
3812 // It is important that LowerBits is last, otherwise the arithmetic
3813 // shift will not be folded into the compare (SUBS).
3814 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3815 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3816 .getValue(1);
3817 } else {
3818 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3819 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3820 Overflow =
3821 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3822 DAG.getConstant(0, DL, MVT::i64),
3823 UpperBits).getValue(1);
3824 }
3825 break;
3826 }
3827 } // switch (...)
3828
3829 if (Opc) {
3830 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3831
3832 // Emit the AArch64 operation with overflow check.
3833 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3834 Overflow = Value.getValue(1);
3835 }
3836 return std::make_pair(Value, Overflow);
3837}
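// Illustrative sketch of the i32 SMULO path above (registers chosen only as an
// example): the product is computed in 64 bits and compared against the
// sign-extension of its low half, e.g.
//   smull x8, w0, w1
//   cmp   x8, w8, sxtw
// so the NE condition chosen above is true exactly when the multiply overflowed.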
3838
3839SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3840 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
3841 !Subtarget->isNeonAvailable()))
3842 return LowerToScalableOp(Op, DAG);
3843
3844 SDValue Sel = Op.getOperand(0);
3845 SDValue Other = Op.getOperand(1);
3846 SDLoc dl(Sel);
3847
3848 // If the operand is an overflow checking operation, invert the condition
3849 // code and kill the Not operation. I.e., transform:
3850 // (xor (overflow_op_bool, 1))
3851 // -->
3852 // (csel 1, 0, invert(cc), overflow_op_bool)
3853 // ... which later gets transformed to just a cset instruction with an
3854 // inverted condition code, rather than a cset + eor sequence.
3855 if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
3856 // Only lower legal XALUO ops.
3857 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3858 return SDValue();
3859
3860 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3861 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3862 AArch64CC::CondCode CC;
3863 SDValue Value, Overflow;
3864 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3865 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3866 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3867 CCVal, Overflow);
3868 }
3869 // If neither operand is a SELECT_CC, give up.
3870 if (Sel.getOpcode() != ISD::SELECT_CC)
3871 std::swap(Sel, Other);
3872 if (Sel.getOpcode() != ISD::SELECT_CC)
3873 return Op;
3874
3875 // The folding we want to perform is:
3876 // (xor x, (select_cc a, b, cc, 0, -1) )
3877 // -->
3878 // (csel x, (xor x, -1), cc ...)
3879 //
3880 // The latter will get matched to a CSINV instruction.
3881
3882 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3883 SDValue LHS = Sel.getOperand(0);
3884 SDValue RHS = Sel.getOperand(1);
3885 SDValue TVal = Sel.getOperand(2);
3886 SDValue FVal = Sel.getOperand(3);
3887
3888 // FIXME: This could be generalized to non-integer comparisons.
3889 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3890 return Op;
3891
3892 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3893 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3894
3895 // The values aren't constants, this isn't the pattern we're looking for.
3896 if (!CFVal || !CTVal)
3897 return Op;
3898
3899 // We can commute the SELECT_CC by inverting the condition. This
3900 // might be needed to make this fit into a CSINV pattern.
3901 if (CTVal->isAllOnes() && CFVal->isZero()) {
3902 std::swap(TVal, FVal);
3903 std::swap(CTVal, CFVal);
3904 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3905 }
3906
3907 // If the constants line up, perform the transform!
3908 if (CTVal->isZero() && CFVal->isAllOnes()) {
3909 SDValue CCVal;
3910 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3911
3912 FVal = Other;
3913 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3914 DAG.getConstant(-1ULL, dl, Other.getValueType()));
3915
3916 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3917 CCVal, Cmp);
3918 }
3919
3920 return Op;
3921}
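// Illustrative sketch of the CSINV pattern above (registers arbitrary): for
// "res = x ^ ((a < b) ? 0 : -1)" the lowering ends up as
//   cmp   w0, w1
//   csinv w2, w3, w3, lt
// i.e. res is x when the condition holds and ~x otherwise, with no explicit
// eor or mvn.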
3922
3923// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
3924// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
3925// sets 'C' bit to 0.
3926static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
3927 SDLoc DL(Value);
3928 EVT VT = Value.getValueType();
3929 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
3930 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
3931 SDValue Cmp =
3932 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
3933 return Cmp.getValue(1);
3934}
3935
3936// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
3937// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
3938static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
3939 bool Invert) {
3940 assert(Glue.getResNo() == 1);
3941 SDLoc DL(Glue);
3942 SDValue Zero = DAG.getConstant(0, DL, VT);
3943 SDValue One = DAG.getConstant(1, DL, VT);
3944 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
3945 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
3946 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3947}
3948
3949// Value is 1 if 'V' bit of NZCV is 1, else 0
3950static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
3951 assert(Glue.getResNo() == 1);
3952 SDLoc DL(Glue);
3953 SDValue Zero = DAG.getConstant(0, DL, VT);
3954 SDValue One = DAG.getConstant(1, DL, VT);
3955 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
3956 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3957}
3958
3959// This lowering is inefficient, but it will get cleaned up by
3960// `foldOverflowCheck`
3961static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
3962 unsigned Opcode, bool IsSigned) {
3963 EVT VT0 = Op.getValue(0).getValueType();
3964 EVT VT1 = Op.getValue(1).getValueType();
3965
3966 if (VT0 != MVT::i32 && VT0 != MVT::i64)
3967 return SDValue();
3968
3969 bool InvertCarry = Opcode == AArch64ISD::SBCS;
3970 SDValue OpLHS = Op.getOperand(0);
3971 SDValue OpRHS = Op.getOperand(1);
3972 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
3973
3974 SDLoc DL(Op);
3975 SDVTList VTs = DAG.getVTList(VT0, VT1);
3976
3977 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
3978 OpRHS, OpCarryIn);
3979
3980 SDValue OutFlag =
3981 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
3982 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
3983
3984 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
3985}
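// Illustrative note (sketch): AArch64 subtraction treats C as "no borrow",
// i.e. SBCS consumes the inverse of the IR's borrow bit. That is why
// InvertCarry is set for SBCS above: the incoming carry is materialized with
// the opposite sense before being glued into the ADCS/SBCS node.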
3986
3987static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
3988 // Let legalize expand this if it isn't a legal type yet.
3989 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3990 return SDValue();
3991
3992 SDLoc dl(Op);
3993 AArch64CC::CondCode CC;
3994 // The actual operation that sets the overflow or carry flag.
3995 SDValue Value, Overflow;
3996 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3997
3998 // We use 0 and 1 as false and true values.
3999 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4000 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4001
4002 // We use an inverted condition, because the conditional select is inverted
4003 // too. This will allow it to be selected to a single instruction:
4004 // CSINC Wd, WZR, WZR, invert(cond).
4005 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4006 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
4007 CCVal, Overflow);
4008
4009 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4010 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4011}
4012
4013// Prefetch operands are:
4014// 1: Address to prefetch
4015// 2: bool isWrite
4016// 3: int locality (0 = no locality ... 3 = extreme locality)
4017// 4: bool isDataCache
4018static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4019 SDLoc DL(Op);
4020 unsigned IsWrite = Op.getConstantOperandVal(2);
4021 unsigned Locality = Op.getConstantOperandVal(3);
4022 unsigned IsData = Op.getConstantOperandVal(4);
4023
4024 bool IsStream = !Locality;
4025 // When the locality number is set
4026 if (Locality) {
4027 // The front-end should have filtered out the out-of-range values
4028 assert(Locality <= 3 && "Prefetch locality out-of-range");
4029 // The locality degree is the opposite of the cache speed.
4030 // Put the number the other way around.
4031 // The encoding starts at 0 for level 1
4032 Locality = 3 - Locality;
4033 }
4034
4035 // Build the mask value encoding the expected behavior.
4036 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4037 (!IsData << 3) | // IsDataCache bit
4038 (Locality << 1) | // Cache level bits
4039 (unsigned)IsStream; // Stream bit
4040 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4041 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4042 Op.getOperand(1));
4043}
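// Worked example of the PrfOp encoding above: __builtin_prefetch(p, /*rw=*/0,
// /*locality=*/3) gives IsWrite=0, IsData=1, Locality=3-3=0, IsStream=0, so
// PrfOp = (0<<4) | (0<<3) | (0<<1) | 0 = 0, i.e. the PLDL1KEEP hint;
// locality 0 selects the streaming form (PLDL1STRM) instead.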
4044
4045SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4046 SelectionDAG &DAG) const {
4047 EVT VT = Op.getValueType();
4048 if (VT.isScalableVector())
4049 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4050
4051 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4052 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4053
4054 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4055 return SDValue();
4056}
4057
4058SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4059 SelectionDAG &DAG) const {
4060 EVT VT = Op.getValueType();
4061 if (VT.isScalableVector())
4062 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4063
4064 bool IsStrict = Op->isStrictFPOpcode();
4065 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4066 EVT SrcVT = SrcVal.getValueType();
4067 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4068
4069 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4070 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4071
4072 // Expand cases where the result type is BF16 but we don't have hardware
4073 // instructions to lower it.
4074 if (VT.getScalarType() == MVT::bf16 &&
4075 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4076 Subtarget->hasBF16())) {
4077 SDLoc dl(Op);
4078 SDValue Narrow = SrcVal;
4079 SDValue NaN;
4080 EVT I32 = SrcVT.changeElementType(MVT::i32);
4081 EVT F32 = SrcVT.changeElementType(MVT::f32);
4082 if (SrcVT.getScalarType() == MVT::f32) {
4083 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4084 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4085 if (!NeverSNaN) {
4086 // Set the quiet bit.
4087 NaN = DAG.getNode(ISD::OR, dl, I32, Narrow,
4088 DAG.getConstant(0x400000, dl, I32));
4089 }
4090 } else if (SrcVT.getScalarType() == MVT::f64) {
4091 Narrow = DAG.getNode(AArch64ISD::FCVTXN, dl, F32, Narrow);
4092 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4093 } else {
4094 return SDValue();
4095 }
4096 if (!Trunc) {
4097 SDValue One = DAG.getConstant(1, dl, I32);
4098 SDValue Lsb = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4099 DAG.getShiftAmountConstant(16, I32, dl));
4100 Lsb = DAG.getNode(ISD::AND, dl, I32, Lsb, One);
4101 SDValue RoundingBias =
4102 DAG.getNode(ISD::ADD, dl, I32, DAG.getConstant(0x7fff, dl, I32), Lsb);
4103 Narrow = DAG.getNode(ISD::ADD, dl, I32, Narrow, RoundingBias);
4104 }
4105
4106 // Don't round if we had a NaN; we don't want to turn 0x7fffffff into
4107 // 0x80000000.
4108 if (NaN) {
4109 SDValue IsNaN = DAG.getSetCC(
4110 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4111 SrcVal, SrcVal, ISD::SETUO);
4112 Narrow = DAG.getSelect(dl, I32, IsNaN, NaN, Narrow);
4113 }
4114
4115 // Now that we have rounded, shift the bits into position.
4116 Narrow = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4117 DAG.getShiftAmountConstant(16, I32, dl));
4118 if (VT.isVector()) {
4119 EVT I16 = I32.changeVectorElementType(MVT::i16);
4120 Narrow = DAG.getNode(ISD::TRUNCATE, dl, I16, Narrow);
4121 return DAG.getNode(ISD::BITCAST, dl, VT, Narrow);
4122 }
4123 Narrow = DAG.getNode(ISD::BITCAST, dl, F32, Narrow);
4124 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
4125 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, dl)
4126 : Result;
4127 }
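  // Worked example of the round-to-nearest-even bias above (input chosen for
  // illustration): for f32 pi, bits 0x40490fdb, Lsb = (bits >> 16) & 1 = 1,
  // RoundingBias = 0x7fff + 1 = 0x8000, and (bits + bias) >> 16 = 0x4049,
  // i.e. the bf16 value 3.140625.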
4128
4129 if (SrcVT != MVT::f128) {
4130 // Expand cases where the input is a vector bigger than NEON.
4131 if (useSVEForFixedLengthVectorVT(SrcVT))
4132 return SDValue();
4133
4134 // It's legal except when f128 is involved
4135 return Op;
4136 }
4137
4138 return SDValue();
4139}
4140
4141SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4142 SelectionDAG &DAG) const {
4143 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4144 // Any additional optimization in this function should be recorded
4145 // in the cost tables.
4146 bool IsStrict = Op->isStrictFPOpcode();
4147 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4148 EVT VT = Op.getValueType();
4149
4150 if (VT.isScalableVector()) {
4151 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4152 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4153 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4154 return LowerToPredicatedOp(Op, DAG, Opcode);
4155 }
4156
4157 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4158 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4159 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4160
4161 unsigned NumElts = InVT.getVectorNumElements();
4162
4163 // f16 conversions are promoted to f32 when full fp16 is not supported.
4164 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4165 InVT.getVectorElementType() == MVT::bf16) {
4166 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4167 SDLoc dl(Op);
4168 if (IsStrict) {
4169 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4170 {Op.getOperand(0), Op.getOperand(1)});
4171 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4172 {Ext.getValue(1), Ext.getValue(0)});
4173 }
4174 return DAG.getNode(
4175 Op.getOpcode(), dl, Op.getValueType(),
4176 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
4177 }
4178
4179 uint64_t VTSize = VT.getFixedSizeInBits();
4180 uint64_t InVTSize = InVT.getFixedSizeInBits();
4181 if (VTSize < InVTSize) {
4182 SDLoc dl(Op);
4183 if (IsStrict) {
4184 InVT = InVT.changeVectorElementTypeToInteger();
4185 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4186 {Op.getOperand(0), Op.getOperand(1)});
4187 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4188 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
4189 }
4190 SDValue Cv =
4191 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
4192 Op.getOperand(0));
4193 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4194 }
4195
4196 if (VTSize > InVTSize) {
4197 SDLoc dl(Op);
4198 MVT ExtVT =
4199 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
4200 VT.getVectorNumElements());
4201 if (IsStrict) {
4202 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4203 {Op.getOperand(0), Op.getOperand(1)});
4204 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4205 {Ext.getValue(1), Ext.getValue(0)});
4206 }
4207 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
4208 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
4209 }
4210
4211 // Use a scalar operation for conversions between single-element vectors of
4212 // the same size.
4213 if (NumElts == 1) {
4214 SDLoc dl(Op);
4215 SDValue Extract = DAG.getNode(
4216 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4217 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4218 EVT ScalarVT = VT.getScalarType();
4219 if (IsStrict)
4220 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4221 {Op.getOperand(0), Extract});
4222 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4223 }
4224
4225 // Type changing conversions are illegal.
4226 return Op;
4227}
4228
4229SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4230 SelectionDAG &DAG) const {
4231 bool IsStrict = Op->isStrictFPOpcode();
4232 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4233
4234 if (SrcVal.getValueType().isVector())
4235 return LowerVectorFP_TO_INT(Op, DAG);
4236
4237 // f16 conversions are promoted to f32 when full fp16 is not supported.
4238 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4239 SrcVal.getValueType() == MVT::bf16) {
4240 SDLoc dl(Op);
4241 if (IsStrict) {
4242 SDValue Ext =
4243 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4244 {Op.getOperand(0), SrcVal});
4245 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4246 {Ext.getValue(1), Ext.getValue(0)});
4247 }
4248 return DAG.getNode(
4249 Op.getOpcode(), dl, Op.getValueType(),
4250 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4251 }
4252
4253 if (SrcVal.getValueType() != MVT::f128) {
4254 // It's legal except when f128 is involved
4255 return Op;
4256 }
4257
4258 return SDValue();
4259}
4260
4261SDValue
4262AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4263 SelectionDAG &DAG) const {
4264 // AArch64 FP-to-int conversions saturate to the destination element size, so
4265 // we can lower common saturating conversions to simple instructions.
4266 SDValue SrcVal = Op.getOperand(0);
4267 EVT SrcVT = SrcVal.getValueType();
4268 EVT DstVT = Op.getValueType();
4269 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4270
4271 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4272 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4273 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4274 assert(SatWidth <= DstElementWidth &&
4275 "Saturation width cannot exceed result width");
4276
4277 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4278 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4279 // types, so this is hard to reach.
4280 if (DstVT.isScalableVector())
4281 return SDValue();
4282
4283 EVT SrcElementVT = SrcVT.getVectorElementType();
4284
4285 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4286 if ((SrcElementVT == MVT::f16 &&
4287 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4288 SrcElementVT == MVT::bf16) {
4289 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4290 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
4291 SrcVT = F32VT;
4292 SrcElementVT = MVT::f32;
4293 SrcElementWidth = 32;
4294 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4295 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4296 return SDValue();
4297
4298 SDLoc DL(Op);
4299 // Expand to f64 if we are saturating to i64, to help keep the lanes
4300 // the same width and produce an fcvtzu.
4301 if (SatWidth == 64 && SrcElementWidth < 64) {
4302 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
4303 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
4304 SrcVT = F64VT;
4305 SrcElementVT = MVT::f64;
4306 SrcElementWidth = 64;
4307 }
4308 // Cases that we can emit directly.
4309 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
4310 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4311 DAG.getValueType(DstVT.getScalarType()));
4312
4313 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4314 // result. This is only valid if the legal cvt is larger than the saturate
4315 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4316 // (at least until sqxtn is selected).
4317 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4318 return SDValue();
4319
4320 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4321 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4322 DAG.getValueType(IntVT.getScalarType()));
4323 SDValue Sat;
4324 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4325 SDValue MinC = DAG.getConstant(
4326 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4327 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4328 SDValue MaxC = DAG.getConstant(
4329 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4330 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4331 } else {
4332 SDValue MinC = DAG.getConstant(
4333 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4334 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4335 }
4336
4337 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4338}
4339
4340SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4341 SelectionDAG &DAG) const {
4342 // AArch64 FP-to-int conversions saturate to the destination register size, so
4343 // we can lower common saturating conversions to simple instructions.
4344 SDValue SrcVal = Op.getOperand(0);
4345 EVT SrcVT = SrcVal.getValueType();
4346
4347 if (SrcVT.isVector())
4348 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4349
4350 EVT DstVT = Op.getValueType();
4351 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4352 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4353 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4354 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4355
4356 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4357 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4358 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4359 SrcVT = MVT::f32;
4360 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4361 SrcVT != MVT::bf16)
4362 return SDValue();
4363
4364 SDLoc DL(Op);
4365 // Cases that we can emit directly.
4366 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4367 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4368 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4369 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4370 DAG.getValueType(DstVT));
4371
4372 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4373 // result. This is only valid if the legal cvt is larger than the saturate
4374 // width.
4375 if (DstWidth < SatWidth)
4376 return SDValue();
4377
4378 SDValue NativeCvt =
4379 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4380 SDValue Sat;
4381 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4382 SDValue MinC = DAG.getConstant(
4383 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4384 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4385 SDValue MaxC = DAG.getConstant(
4386 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4387 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4388 } else {
4389 SDValue MinC = DAG.getConstant(
4390 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4391 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4392 }
4393
4394 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4395}
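// Illustrative sketch of the clamp-after-convert path above (assuming the
// usual promotion of an i8 result to i32): fptosi.sat with an i8 saturation
// width on an f32 input becomes an fcvtzs to i32 followed by an SMIN against
// 127 and an SMAX against -128, so the value is clamped to the i8 range even
// though the conversion itself produced an i32.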
4396
4397SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
4398 SelectionDAG &DAG) const {
4399 EVT VT = Op.getValueType();
4400 SDValue Src = Op.getOperand(0);
4401 SDLoc DL(Op);
4402
4403 assert(VT.isVector() && "Expected vector type");
4404
4405 EVT CastVT =
4406 VT.changeVectorElementType(Src.getValueType().getVectorElementType());
4407
4408 // Round the floating-point value into a floating-point register with the
4409 // current rounding mode.
4410 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
4411
4412 // Truncate the rounded floating point to an integer.
4413 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
4414 DAG.getValueType(VT.getVectorElementType()));
4415}
4416
4417SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4418 SelectionDAG &DAG) const {
4419 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4420 // Any additional optimization in this function should be recorded
4421 // in the cost tables.
4422 bool IsStrict = Op->isStrictFPOpcode();
4423 EVT VT = Op.getValueType();
4424 SDLoc dl(Op);
4425 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
4426 EVT InVT = In.getValueType();
4427 unsigned Opc = Op.getOpcode();
4428 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4429
4430 if (VT.isScalableVector()) {
4431 if (InVT.getVectorElementType() == MVT::i1) {
4432 // We can't directly extend an SVE predicate; extend it first.
4433 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4434 EVT CastVT = getPromotedVTForPredicate(InVT);
4435 In = DAG.getNode(CastOpc, dl, CastVT, In);
4436 return DAG.getNode(Opc, dl, VT, In);
4437 }
4438
4439 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4440 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
4441 return LowerToPredicatedOp(Op, DAG, Opcode);
4442 }
4443
4444 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4445 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4446 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4447
4448 // Promote bf16 conversions to f32.
4449 if (VT.getVectorElementType() == MVT::bf16) {
4450 EVT F32 = VT.changeElementType(MVT::f32);
4451 if (IsStrict) {
4452 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {F32, MVT::Other},
4453 {Op.getOperand(0), In});
4454 return DAG.getNode(
4455 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4456 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4457 }
4458 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4459 DAG.getNode(Op.getOpcode(), dl, F32, In),
4460 DAG.getIntPtrConstant(0, dl));
4461 }
4462
4463 uint64_t VTSize = VT.getFixedSizeInBits();
4464 uint64_t InVTSize = InVT.getFixedSizeInBits();
4465 if (VTSize < InVTSize) {
4466 MVT CastVT =
4467 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
4468 InVT.getVectorNumElements());
4469 if (IsStrict) {
4470 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4471 {Op.getOperand(0), In});
4472 return DAG.getNode(
4473 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4474 {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4475 }
4476 In = DAG.getNode(Opc, dl, CastVT, In);
4477 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
4478 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
4479 }
4480
4481 if (VTSize > InVTSize) {
4482 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4483 EVT CastVT = VT.changeVectorElementTypeToInteger();
4484 In = DAG.getNode(CastOpc, dl, CastVT, In);
4485 if (IsStrict)
4486 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4487 return DAG.getNode(Opc, dl, VT, In);
4488 }
4489
4490 // Use a scalar operation for conversions between single-element vectors of
4491 // the same size.
4492 if (VT.getVectorNumElements() == 1) {
4493 SDValue Extract = DAG.getNode(
4494 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4495 In, DAG.getConstant(0, dl, MVT::i64));
4496 EVT ScalarVT = VT.getScalarType();
4497 if (IsStrict)
4498 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4499 {Op.getOperand(0), Extract});
4500 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4501 }
4502
4503 return Op;
4504}
4505
4506SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4507 SelectionDAG &DAG) const {
4508 if (Op.getValueType().isVector())
4509 return LowerVectorINT_TO_FP(Op, DAG);
4510
4511 bool IsStrict = Op->isStrictFPOpcode();
4512 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4513
4514 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
4515 Op->getOpcode() == ISD::SINT_TO_FP;
4516
4517 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
4518 SDLoc dl(Op);
4519 if (IsStrict) {
4520 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
4521 {Op.getOperand(0), SrcVal});
4522 return DAG.getNode(
4523 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4524 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4525 }
4526 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4527 DAG.getNode(Op.getOpcode(), dl, PromoteVT, SrcVal),
4528 DAG.getIntPtrConstant(0, dl));
4529 };
4530
4531 if (Op.getValueType() == MVT::bf16) {
4532 unsigned MaxWidth = IsSigned
4533 ? DAG.ComputeMaxSignificantBits(SrcVal)
4534 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
4535 // bf16 conversions are promoted to f32 when converting from i16.
4536 if (MaxWidth <= 24) {
4537 return IntToFpViaPromotion(MVT::f32);
4538 }
4539
4540 // bf16 conversions are promoted to f64 when converting from i32.
4541 if (MaxWidth <= 53) {
4542 return IntToFpViaPromotion(MVT::f64);
4543 }
4544
4545 // We need to be careful about i64 -> bf16.
4546 // Consider the value 22216703, which fits in an i32.
4547 // This number cannot be represented exactly as an f32, so an itofp will
4548 // turn it into 22216704.0; an fptrunc to bf16 will then turn this into
4549 // 22282240.0. However, the correct bf16 result should have been 22151168.0.
4550 // We need to use sticky rounding to get this correct.
4551 if (SrcVal.getValueType() == MVT::i64) {
4552 SDLoc DL(Op);
4553 // This algorithm is equivalent to the following:
4554 // uint64_t SrcHi = SrcVal & ~0xfffull;
4555 // uint64_t SrcLo = SrcVal & 0xfffull;
4556 // uint64_t Highest = SrcVal >> 53;
4557 // bool HasHighest = Highest != 0;
4558 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
4559 // double Rounded = static_cast<double>(ToRound);
4560 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
4561 // uint64_t HasLo = SrcLo != 0;
4562 // bool NeedsAdjustment = HasHighest & HasLo;
4563 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
4564 // double Adjusted = std::bit_cast<double>(AdjustedBits);
4565 // return static_cast<__bf16>(Adjusted);
4566 //
4567 // Essentially, what happens is that SrcVal either fits perfectly in a
4568 // double-precision value or it is too big. If it is sufficiently small,
4569 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
4570 // ensure that u64 -> double has no rounding error by only using the 52
4571 // MSB of the input. The low order bits will get merged into a sticky bit
4572 // which will avoid issues incurred by double rounding.
4573
4574 // Signed conversion is more or less like so:
4575 // copysign((__bf16)abs(SrcVal), SrcVal)
4576 SDValue SignBit;
4577 if (IsSigned) {
4578 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4579 DAG.getConstant(1ull << 63, DL, MVT::i64));
4580 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
4581 }
4582 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4583 DAG.getConstant(~0xfffull, DL, MVT::i64));
4584 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4585 DAG.getConstant(0xfffull, DL, MVT::i64));
4586 SDValue Highest =
4587 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
4588 DAG.getShiftAmountConstant(53, MVT::i64, DL));
4589 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
4590 SDValue ToRound =
4591 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
4592 SDValue Rounded =
4593 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
4594 {Op.getOperand(0), ToRound})
4595 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
4596
4597 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
4598 if (SignBit) {
4599 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
4600 }
4601
4602 SDValue HasHighest = DAG.getSetCC(
4603 DL,
4604 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4605 Highest, Zero64, ISD::SETNE);
4606
4607 SDValue HasLo = DAG.getSetCC(
4608 DL,
4609 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4610 SrcLo, Zero64, ISD::SETNE);
4611
4612 SDValue NeedsAdjustment =
4613 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
4614 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
4615
4616 SDValue AdjustedBits =
4617 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
4618 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
4619 return IsStrict
4620 ? DAG.getNode(ISD::STRICT_FP_ROUND, DL,
4621 {Op.getValueType(), MVT::Other},
4622 {Rounded.getValue(1), Adjusted,
4623 DAG.getIntPtrConstant(0, DL)})
4624 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
4625 DAG.getIntPtrConstant(0, DL, true));
4626 }
4627 }
4628
4629 // f16 conversions are promoted to f32 when full fp16 is not supported.
4630 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4631 return IntToFpViaPromotion(MVT::f32);
4632 }
4633
4634 // i128 conversions are libcalls.
4635 if (SrcVal.getValueType() == MVT::i128)
4636 return SDValue();
4637
4638 // Other conversions are legal, unless it's to the completely software-based
4639 // fp128.
4640 if (Op.getValueType() != MVT::f128)
4641 return Op;
4642 return SDValue();
4643}
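Editor's sketch: the sticky-rounding algorithm spelled out in the comment above, as standalone C++20 for the unsigned i64 case. The function name is hypothetical, and the final truncation to bf16 is left out because __bf16 support is compiler-specific.

#include <bit>
#include <cstdint>

double u64ToDoubleSticky(uint64_t SrcVal) {
  uint64_t SrcHi = SrcVal & ~0xfffull;    // keep only the 52 MSBs exact
  uint64_t SrcLo = SrcVal & 0xfffull;     // low bits feed the sticky bit
  bool HasHighest = (SrcVal >> 53) != 0;  // too wide to round-trip a double?
  uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
  double Rounded = static_cast<double>(ToRound);
  uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
  bool NeedsAdjustment = HasHighest && (SrcLo != 0);
  uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
  // The caller would now fptrunc the adjusted double to bf16; the sticky bit
  // in the mantissa prevents the double rounding shown in the 22216703 example.
  return std::bit_cast<double>(AdjustedBits);
}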
4644
4645SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4646 SelectionDAG &DAG) const {
4647 // For iOS, we want to call an alternative entry point: __sincos_stret,
4648 // which returns the values in two S / D registers.
4649 SDLoc dl(Op);
4650 SDValue Arg = Op.getOperand(0);
4651 EVT ArgVT = Arg.getValueType();
4652 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
4653
4654 ArgListTy Args;
4655 ArgListEntry Entry;
4656
4657 Entry.Node = Arg;
4658 Entry.Ty = ArgTy;
4659 Entry.IsSExt = false;
4660 Entry.IsZExt = false;
4661 Args.push_back(Entry);
4662
4663 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4664 : RTLIB::SINCOS_STRET_F32;
4665 const char *LibcallName = getLibcallName(LC);
4666 SDValue Callee =
4667 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
4668
4669 StructType *RetTy = StructType::get(ArgTy, ArgTy);
4670 TargetLowering::CallLoweringInfo CLI(DAG);
4671 CLI.setDebugLoc(dl)
4672 .setChain(DAG.getEntryNode())
4673 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
4674
4675 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4676 return CallResult.first;
4677}
4678
4679static MVT getSVEContainerType(EVT ContentTy);
4680
4681SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4682 SelectionDAG &DAG) const {
4683 EVT OpVT = Op.getValueType();
4684 EVT ArgVT = Op.getOperand(0).getValueType();
4685
4686 if (useSVEForFixedLengthVectorVT(OpVT))
4687 return LowerFixedLengthBitcastToSVE(Op, DAG);
4688
4689 if (OpVT.isScalableVector()) {
4690 // Bitcasting between unpacked vector types of different element counts is
4691 // not a NOP because the live elements are laid out differently.
4692 // 01234567
4693 // e.g. nxv2i32 = XX??XX??
4694 // nxv4f16 = X?X?X?X?
4695 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4696 return SDValue();
4697
4698 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
4699 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4700 "Expected int->fp bitcast!");
4701 SDValue ExtResult =
4702 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
4703 Op.getOperand(0));
4704 return getSVESafeBitCast(OpVT, ExtResult, DAG);
4705 }
4706 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
4707 }
4708
4709 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4710 return SDValue();
4711
4712 // Bitcasts between f16 and bf16 are legal.
4713 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4714 return Op;
4715
4716 assert(ArgVT == MVT::i16);
4717 SDLoc DL(Op);
4718
4719 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4720 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
4721 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
4722}
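Editor's sketch: a scalar model of the i16 -> f16/bf16 bitcast path above (any-extend to i32, bitcast to f32, read the low 16-bit hsub view back out); at the bit level it is just a 16-bit copy. _Float16 is a compiler-dependent stand-in for f16 and the helper name is hypothetical.

#include <bit>
#include <cstdint>

_Float16 bitcastI16ToF16(uint16_t Bits) {
  uint32_t Widened = Bits;                      // ISD::ANY_EXTEND to i32
  float AsF32 = std::bit_cast<float>(Widened);  // ISD::BITCAST to f32
  uint16_t Low16 = static_cast<uint16_t>(std::bit_cast<uint32_t>(AsF32));
  return std::bit_cast<_Float16>(Low16);        // AArch64::hsub extract
}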
4723
4724static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4725 if (OrigVT.getSizeInBits() >= 64)
4726 return OrigVT;
4727
4728 assert(OrigVT.isSimple() && "Expecting a simple value type");
4729
4730 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4731 switch (OrigSimpleTy) {
4732 default: llvm_unreachable("Unexpected Vector Type");
4733 case MVT::v2i8:
4734 case MVT::v2i16:
4735 return MVT::v2i32;
4736 case MVT::v4i8:
4737 return MVT::v4i16;
4738 }
4739}
4740
4741 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
4742 const EVT &OrigTy,
4743 const EVT &ExtTy,
4744 unsigned ExtOpcode) {
4745 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
4746 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
4747 // 64-bits we need to insert a new extension so that it will be 64-bits.
4748 assert(ExtTy.is128BitVector() && "Unexpected extension size");
4749 if (OrigTy.getSizeInBits() >= 64)
4750 return N;
4751
4752 // Must extend size to at least 64 bits to be used as an operand for VMULL.
4753 EVT NewVT = getExtensionTo64Bits(OrigTy);
4754
4755 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
4756}
4757
4758// Returns lane if Op extracts from a two-element vector and lane is constant
4759// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
4760static std::optional<uint64_t>
4761 getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
4762 SDNode *OpNode = Op.getNode();
4763 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4764 return std::nullopt;
4765
4766 EVT VT = OpNode->getOperand(0).getValueType();
4767 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
4768 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4769 return std::nullopt;
4770
4771 return C->getZExtValue();
4772}
4773
4774 static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
4775 bool isSigned) {
4776 EVT VT = N.getValueType();
4777
4778 if (N.getOpcode() != ISD::BUILD_VECTOR)
4779 return false;
4780
4781 for (const SDValue &Elt : N->op_values()) {
4782 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
4783 unsigned EltSize = VT.getScalarSizeInBits();
4784 unsigned HalfSize = EltSize / 2;
4785 if (isSigned) {
4786 if (!isIntN(HalfSize, C->getSExtValue()))
4787 return false;
4788 } else {
4789 if (!isUIntN(HalfSize, C->getZExtValue()))
4790 return false;
4791 }
4792 continue;
4793 }
4794 return false;
4795 }
4796
4797 return true;
4798}
4799
4800 static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
4801 EVT VT = N.getValueType();
4802 assert(VT.is128BitVector() && "Unexpected vector MULL size");
4803
4804 unsigned NumElts = VT.getVectorNumElements();
4805 unsigned OrigEltSize = VT.getScalarSizeInBits();
4806 unsigned EltSize = OrigEltSize / 2;
4807 MVT TruncVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
4808
4809 APInt HiBits = APInt::getHighBitsSet(OrigEltSize, EltSize);
4810 if (DAG.MaskedValueIsZero(N, HiBits))
4811 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, N);
4812
4813 if (ISD::isExtOpcode(N.getOpcode()))
4814 return addRequiredExtensionForVectorMULL(N.getOperand(0), DAG,
4815 N.getOperand(0).getValueType(), VT,
4816 N.getOpcode());
4817
4818 assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
4819 SDLoc dl(N);
4820 SmallVector<SDValue, 8> Ops;
4821 for (unsigned i = 0; i != NumElts; ++i) {
4822 const APInt &CInt = N.getConstantOperandAPInt(i);
4823 // Element types smaller than 32 bits are not legal, so use i32 elements.
4824 // The values are implicitly truncated so sext vs. zext doesn't matter.
4825 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
4826 }
4827 return DAG.getBuildVector(TruncVT, dl, Ops);
4828}
4829
4830 static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
4831 return N.getOpcode() == ISD::SIGN_EXTEND ||
4832 N.getOpcode() == ISD::ANY_EXTEND ||
4833 isExtendedBUILD_VECTOR(N, DAG, true);
4834}
4835
4836 static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
4837 return N.getOpcode() == ISD::ZERO_EXTEND ||
4838 N.getOpcode() == ISD::ANY_EXTEND ||
4839 isExtendedBUILD_VECTOR(N, DAG, false);
4840}
4841
4842 static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
4843 unsigned Opcode = N.getOpcode();
4844 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4845 SDValue N0 = N.getOperand(0);
4846 SDValue N1 = N.getOperand(1);
4847 return N0->hasOneUse() && N1->hasOneUse() &&
4848 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
4849 }
4850 return false;
4851}
4852
4853 static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
4854 unsigned Opcode = N.getOpcode();
4855 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4856 SDValue N0 = N.getOperand(0);
4857 SDValue N1 = N.getOperand(1);
4858 return N0->hasOneUse() && N1->hasOneUse() &&
4859 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
4860 }
4861 return false;
4862}
4863
4864SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
4865 SelectionDAG &DAG) const {
4866 // The rounding mode is in bits 23:22 of the FPCR.
4867 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
4868 // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
4869 // so that the shift + and get folded into a bitfield extract.
4870 SDLoc dl(Op);
4871
4872 SDValue Chain = Op.getOperand(0);
4873 SDValue FPCR_64 = DAG.getNode(
4874 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
4875 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
4876 Chain = FPCR_64.getValue(1);
4877 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
4878 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
4879 DAG.getConstant(1U << 22, dl, MVT::i32));
4880 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4881 DAG.getConstant(22, dl, MVT::i32));
4882 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4883 DAG.getConstant(3, dl, MVT::i32));
4884 return DAG.getMergeValues({AND, Chain}, dl);
4885}
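Editor's sketch: the FLT_ROUNDS arithmetic above as plain C++. FPCR[23:22] holds 0=RN, 1=RP, 2=RM, 3=RZ while FLT_ROUNDS wants 1, 2, 3, 0, i.e. (mode + 1) & 3; adding 1 << 22 before the shift performs the "+1" so that the shift and mask can fold into a bitfield extract. The function name is hypothetical.

#include <cstdint>

int fpcrToFltRounds(uint64_t Fpcr) {
  // Any carry out of bits 23:22 only touches higher bits, which the & 3 drops.
  return static_cast<int>(((Fpcr + (1u << 22)) >> 22) & 3);
}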
4886
4887SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
4888 SelectionDAG &DAG) const {
4889 SDLoc DL(Op);
4890 SDValue Chain = Op->getOperand(0);
4891 SDValue RMValue = Op->getOperand(1);
4892
4893 // The rounding mode is in bits 23:22 of the FPCR.
4894 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
4895 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
4896 // (((arg - 1) & 3) << 22).
4897 //
4898 // The argument of llvm.set.rounding must be within the segment [0, 3], so
4899 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
4900 // code that generated llvm.set.rounding to ensure this condition.
4901
4902 // Calculate new value of FPCR[23:22].
4903 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
4904 DAG.getConstant(1, DL, MVT::i32));
4905 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
4906 DAG.getConstant(0x3, DL, MVT::i32));
4907 RMValue =
4908 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
4909 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
4910 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
4911
4912 // Get current value of FPCR.
4913 SDValue Ops[] = {
4914 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4915 SDValue FPCR =
4916 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4917 Chain = FPCR.getValue(1);
4918 FPCR = FPCR.getValue(0);
4919
4920 // Put the new rounding mode into FPCR[23:22].
4921 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
4922 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
4923 DAG.getConstant(RMMask, DL, MVT::i64));
4924 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
4925 SDValue Ops2[] = {
4926 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4927 FPCR};
4928 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4929}
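Editor's sketch: the inverse mapping used by SET_ROUNDING above, in plain C++. The llvm.set.rounding argument 0..3 becomes FPCR[23:22] via (arg - 1) & 3, which is shifted into place and spliced into the current FPCR value. RoundingBitsPos is assumed to be 22, matching the FPCR[23:22] comment; the function name is hypothetical.

#include <cstdint>

uint64_t spliceRoundingModeIntoFpcr(uint64_t Fpcr, uint32_t Arg) {
  const unsigned RoundingBitsPos = 22;               // FPCR[23:22]
  uint64_t RMValue = uint64_t{(Arg - 1u) & 3u} << RoundingBitsPos;
  uint64_t Mask = ~(uint64_t{3} << RoundingBitsPos); // clear the old mode
  return (Fpcr & Mask) | RMValue;                    // OR in the new one
}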
4930
4931SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
4932 SelectionDAG &DAG) const {
4933 SDLoc DL(Op);
4934 SDValue Chain = Op->getOperand(0);
4935
4936 // Get current value of FPCR.
4937 SDValue Ops[] = {
4938 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4939 SDValue FPCR =
4940 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4941 Chain = FPCR.getValue(1);
4942 FPCR = FPCR.getValue(0);
4943
4944 // Truncate FPCR to 32 bits.
4945 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
4946
4947 return DAG.getMergeValues({Result, Chain}, DL);
4948}
4949
4950SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
4951 SelectionDAG &DAG) const {
4952 SDLoc DL(Op);
4953 SDValue Chain = Op->getOperand(0);
4954 SDValue Mode = Op->getOperand(1);
4955
4956 // Extend the specified value to 64 bits.
4957 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
4958
4959 // Set new value of FPCR.
4960 SDValue Ops2[] = {
4961 Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
4962 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4963}
4964
4965SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
4966 SelectionDAG &DAG) const {
4967 SDLoc DL(Op);
4968 SDValue Chain = Op->getOperand(0);
4969
4970 // Get current value of FPCR.
4971 SDValue Ops[] = {
4972 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4973 SDValue FPCR =
4974 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4975 Chain = FPCR.getValue(1);
4976 FPCR = FPCR.getValue(0);
4977
4978 // Clear bits that are not reserved.
4979 SDValue FPSCRMasked = DAG.getNode(
4980 ISD::AND, DL, MVT::i64, FPCR,
4982
4983 // Set new value of FPCR.
4984 SDValue Ops2[] = {Chain,
4985 DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4986 FPSCRMasked};
4987 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4988}
4989
4990static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
4991 SDLoc DL, bool &IsMLA) {
4992 bool IsN0SExt = isSignExtended(N0, DAG);
4993 bool IsN1SExt = isSignExtended(N1, DAG);
4994 if (IsN0SExt && IsN1SExt)
4995 return AArch64ISD::SMULL;
4996
4997 bool IsN0ZExt = isZeroExtended(N0, DAG);
4998 bool IsN1ZExt = isZeroExtended(N1, DAG);
4999
5000 if (IsN0ZExt && IsN1ZExt)
5001 return AArch64ISD::UMULL;
5002
5003 // Select SMULL if we can replace zext with sext.
5004 if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
5005 !isExtendedBUILD_VECTOR(N0, DAG, false) &&
5006 !isExtendedBUILD_VECTOR(N1, DAG, false)) {
5007 SDValue ZextOperand;
5008 if (IsN0ZExt)
5009 ZextOperand = N0.getOperand(0);
5010 else
5011 ZextOperand = N1.getOperand(0);
5012 if (DAG.SignBitIsZero(ZextOperand)) {
5013 SDValue NewSext =
5014 DAG.getSExtOrTrunc(ZextOperand, DL, N0.getValueType());
5015 if (IsN0ZExt)
5016 N0 = NewSext;
5017 else
5018 N1 = NewSext;
5019 return AArch64ISD::SMULL;
5020 }
5021 }
5022
5023 // Select UMULL if we can replace the other operand with an extend.
5024 if (IsN0ZExt || IsN1ZExt) {
5025 EVT VT = N0.getValueType();
5026 APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(),
5027 VT.getScalarSizeInBits() / 2);
5028 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5029 return AArch64ISD::UMULL;
5030 }
5031
5032 if (!IsN1SExt && !IsN1ZExt)
5033 return 0;
5034
5035 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5036 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5037 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5038 IsMLA = true;
5039 return AArch64ISD::SMULL;
5040 }
5041 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5042 IsMLA = true;
5043 return AArch64ISD::UMULL;
5044 }
5045 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5046 std::swap(N0, N1);
5047 IsMLA = true;
5048 return AArch64ISD::UMULL;
5049 }
5050 return 0;
5051}
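Editor's note: a small self-contained demonstration of why selectUmullSmull must match the extension kind of both operands. The same 8-bit pattern multiplies to different wide results depending on whether it is zero- or sign-extended, which is exactly the SMULL vs UMULL distinction; the function name is hypothetical.

#include <cassert>
#include <cstdint>

void demoWideningMulSignedness() {
  uint8_t A = 0xC8, B = 0xC8;                                // 200 unsigned, -56 signed
  uint16_t Umull = (uint16_t)((uint16_t)A * (uint16_t)B);    // 200 * 200
  int16_t Smull = (int16_t)((int16_t)(int8_t)A * (int8_t)B); // -56 * -56
  assert(Umull == 40000);
  assert(Smull == 3136);
  (void)Umull;
  (void)Smull;
}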
5052
5053SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5054 EVT VT = Op.getValueType();
5055
5056 bool OverrideNEON = !Subtarget->isNeonAvailable();
5057 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5058 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5059
5060 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5061 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5062 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5063 "unexpected type for custom-lowering ISD::MUL");
5064 SDValue N0 = Op.getOperand(0);
5065 SDValue N1 = Op.getOperand(1);
5066 bool isMLA = false;
5067 EVT OVT = VT;
5068 if (VT.is64BitVector()) {
5069 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5070 isNullConstant(N0.getOperand(1)) &&
5071 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5072 isNullConstant(N1.getOperand(1))) {
5073 N0 = N0.getOperand(0);
5074 N1 = N1.getOperand(0);
5075 VT = N0.getValueType();
5076 } else {
5077 if (VT == MVT::v1i64) {
5078 if (Subtarget->hasSVE())
5079 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5080 // Fall through to expand this. It is not legal.
5081 return SDValue();
5082 } else
5083 // Other vector multiplications are legal.
5084 return Op;
5085 }
5086 }
5087
5088 SDLoc DL(Op);
5089 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5090
5091 if (!NewOpc) {
5092 if (VT.getVectorElementType() == MVT::i64) {
5093 // If SVE is available then i64 vector multiplications can also be made
5094 // legal.
5095 if (Subtarget->hasSVE())
5096 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5097 // Fall through to expand this. It is not legal.
5098 return SDValue();
5099 } else
5100 // Other vector multiplications are legal.
5101 return Op;
5102 }
5103
5104 // Legalize to a S/UMULL instruction
5105 SDValue Op0;
5106 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5107 if (!isMLA) {
5108 Op0 = skipExtensionForVectorMULL(N0, DAG);
5109 assert(Op0.getValueType().is64BitVector() &&
5110 Op1.getValueType().is64BitVector() &&
5111 "unexpected types for extended operands to VMULL");
5112 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5113 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5114 DAG.getConstant(0, DL, MVT::i64));
5115 }
5116 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5117 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5118 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
5119 SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5120 SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5121 EVT Op1VT = Op1.getValueType();
5122 return DAG.getNode(
5123 ISD::EXTRACT_SUBVECTOR, DL, OVT,
5124 DAG.getNode(N0.getOpcode(), DL, VT,
5125 DAG.getNode(NewOpc, DL, VT,
5126 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5127 DAG.getNode(NewOpc, DL, VT,
5128 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5129 DAG.getConstant(0, DL, MVT::i64));
5130}
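Editor's note: a short check of the identity behind the MLA split above. Because the add in (zext A + zext B) * zext C already happens at the wide width, the product distributes into two widening multiplies plus an add, which maps onto umull/smull followed by an accumulate. The function name is hypothetical.

#include <cassert>
#include <cstdint>

void demoDistributedWideningMul() {
  uint8_t A = 200, B = 55, C = 100;
  uint32_t Single = (uint32_t(A) + uint32_t(B)) * uint32_t(C);
  uint32_t Split = uint32_t(A) * uint32_t(C) + uint32_t(B) * uint32_t(C);
  assert(Single == Split && Single == 25500);
  (void)Single;
  (void)Split;
}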
5131
5132static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5133 int Pattern) {
5134 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5135 return DAG.getConstant(1, DL, MVT::nxv1i1);
5136 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5137 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5138}
5139
5140 static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG,
5141 bool IsSigned, bool IsEqual) {
5142 if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
5143 !isa<ConstantSDNode>(Op.getOperand(2)))
5144 return SDValue();
5145
5146 SDLoc dl(Op);
5147 APInt X = Op.getConstantOperandAPInt(1);
5148 APInt Y = Op.getConstantOperandAPInt(2);
5149 bool Overflow;
5150 APInt NumActiveElems =
5151 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5152
5153 if (Overflow)
5154 return SDValue();
5155
5156 if (IsEqual) {
5157 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5158 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5159 : NumActiveElems.uadd_ov(One, Overflow);
5160 if (Overflow)
5161 return SDValue();
5162 }
5163
5164 std::optional<unsigned> PredPattern =
5165 getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5166 unsigned MinSVEVectorSize = std::max(
5167 Subtarget->getMinSVEVectorSizeInBits(), 128u);
5168 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5169 if (PredPattern != std::nullopt &&
5170 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5171 return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
5172
5173 return SDValue();
5174}
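Editor's sketch: the lane-count arithmetic behind the while-to-PTRUE optimisation above, for the unsigned variants. whilelo(X, Y) activates Y - X leading lanes, and the inclusive forms add one more; overflow means the count is not a simple constant, so the optimisation gives up. The helper name is hypothetical.

#include <cstdint>
#include <optional>

std::optional<uint64_t> activeLaneCount(uint64_t X, uint64_t Y, bool IsEqual) {
  if (Y < X)
    return std::nullopt;   // usub_ov overflow in the code above
  uint64_t N = Y - X;
  if (IsEqual) {
    if (N == UINT64_MAX)
      return std::nullopt; // uadd_ov overflow
    ++N;                   // whilels/whilele compare inclusively
  }
  return N;                // emit PTRUE when a predicate pattern covers N lanes
}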
5175
5176// Returns a safe bitcast between two scalable vector predicates, where
5177// any newly created lanes from a widening bitcast are defined as zero.
5178 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5179 SDLoc DL(Op);
5180 EVT InVT = Op.getValueType();
5181
5182 assert(InVT.getVectorElementType() == MVT::i1 &&
5183 VT.getVectorElementType() == MVT::i1 &&
5184 "Expected a predicate-to-predicate bitcast");
5185 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5186 InVT.isScalableVector() &&
5187 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5188 "Only expect to cast between legal scalable predicate types!");
5189
5190 // Return the operand if the cast isn't changing type,
5191 // e.g. <n x 16 x i1> -> <n x 16 x i1>
5192 if (InVT == VT)
5193 return Op;
5194
5195 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5196
5197 // We only have to zero the lanes if new lanes are being defined, e.g. when
5198 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5199 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5200 // we can return here.
5201 if (InVT.bitsGT(VT))
5202 return Reinterpret;
5203
5204 // Check if the other lanes are already known to be zeroed by
5205 // construction.
5206 if (isZeroingInactiveLanes(Op))
5207 return Reinterpret;
5208
5209 // Zero the newly introduced lanes.
5210 SDValue Mask = DAG.getConstant(1, DL, InVT);
5211 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5212 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5213}
5214
5215SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5216 SDValue Chain, SDLoc DL,
5217 EVT VT) const {
5218 SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
5219 getPointerTy(DAG.getDataLayout()));
5220 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5221 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5222 TargetLowering::ArgListTy Args;
5223 TargetLowering::CallLoweringInfo CLI(DAG);
5224 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5225 CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
5226 RetTy, Callee, std::move(Args));
5227 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5228 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5229 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5230 Mask);
5231}
5232
5233// Lower an SME LDR/STR ZA intrinsic
5234// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5235// folded into the instruction
5236// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5237// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5238// and tile slice registers
5239// ldr(%tileslice, %ptr, %vecnum)
5240// ->
5241// %svl = rdsvl
5242// %ptr2 = %ptr + %svl * %vecnum
5243// %tileslice2 = %tileslice + %vecnum
5244// ldr [%tileslice2, 0], [%ptr2, 0]
5245// Case 3: If the vecnum is an immediate out of range, then the same is done as
5246// case 2, but the base and slice registers are modified by the greatest
5247// multiple of 15 lower than the vecnum and the remainder is folded into the
5248// instruction. This means that successive loads and stores that are offset from
5249// each other can share the same base and slice register updates.
5250// ldr(%tileslice, %ptr, 22)
5251// ldr(%tileslice, %ptr, 23)
5252// ->
5253// %svl = rdsvl
5254// %ptr2 = %ptr + %svl * 15
5255// %tileslice2 = %tileslice + 15
5256// ldr [%tileslice2, 7], [%ptr2, 7]
5257// ldr [%tileslice2, 8], [%ptr2, 8]
5258// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5259// operand and the immediate can be folded into the instruction, like case 2.
5260// ldr(%tileslice, %ptr, %vecnum + 7)
5261// ldr(%tileslice, %ptr, %vecnum + 8)
5262// ->
5263// %svl = rdsvl
5264// %ptr2 = %ptr + %svl * %vecnum
5265// %tileslice2 = %tileslice + %vecnum
5266// ldr [%tileslice2, 7], [%ptr2, 7]
5267// ldr [%tileslice2, 8], [%ptr2, 8]
5268// Case 5: The vecnum being an add of an immediate out of range is also handled,
5269// in which case the same remainder logic as case 3 is used.
5270 static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5271 SDLoc DL(N);
5272
5273 SDValue TileSlice = N->getOperand(2);
5274 SDValue Base = N->getOperand(3);
5275 SDValue VecNum = N->getOperand(4);
5276 int32_t ConstAddend = 0;
5277 SDValue VarAddend = VecNum;
5278
5279 // If the vnum is an add of an immediate, we can fold it into the instruction
5280 if (VecNum.getOpcode() == ISD::ADD &&
5281 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5282 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5283 VarAddend = VecNum.getOperand(0);
5284 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5285 ConstAddend = ImmNode->getSExtValue();
5286 VarAddend = SDValue();
5287 }
5288
5289 int32_t ImmAddend = ConstAddend % 16;
5290 if (int32_t C = (ConstAddend - ImmAddend)) {
5291 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5292 VarAddend = VarAddend
5293 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5294 : CVal;
5295 }
5296
5297 if (VarAddend) {
5298 // Get the vector length that will be multiplied by vnum
5299 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5300 DAG.getConstant(1, DL, MVT::i32));
5301
5302 // Multiply SVL and vnum then add it to the base
5303 SDValue Mul = DAG.getNode(
5304 ISD::MUL, DL, MVT::i64,
5305 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5306 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5307 // Just add vnum to the tileslice
5308 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5309 }
5310
5311 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5312 DL, MVT::Other,
5313 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5314 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5315}
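Editor's sketch: the offset split performed above for a constant (or constant-plus-variable) vecnum, shown for a non-negative constant. Only the 0..15 remainder is folded into the LDR/STR ZA immediate; the multiple-of-16 part goes into the shared base and tile-slice update. The helper name is hypothetical.

#include <cstdint>
#include <utility>

// Returns {amount added to base/tile slice via registers, immediate folded
// into the instruction}.
std::pair<int32_t, int32_t> splitZaVecnum(int32_t ConstAddend) {
  int32_t ImmAddend = ConstAddend % 16;
  int32_t BaseAddend = ConstAddend - ImmAddend;
  return {BaseAddend, ImmAddend};
}
// e.g. splitZaVecnum(22) == {16, 6} and splitZaVecnum(23) == {16, 7}, so the
// two accesses share one base/slice update and differ only in the immediate.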
5316
5317SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5318 SelectionDAG &DAG) const {
5319 unsigned IntNo = Op.getConstantOperandVal(1);
5320 SDLoc DL(Op);
5321 switch (IntNo) {
5322 default:
5323 return SDValue(); // Don't custom lower most intrinsics.
5324 case Intrinsic::aarch64_prefetch: {
5325 SDValue Chain = Op.getOperand(0);
5326 SDValue Addr = Op.getOperand(2);
5327
5328 unsigned IsWrite = Op.getConstantOperandVal(3);
5329 unsigned Locality = Op.getConstantOperandVal(4);
5330 unsigned IsStream = Op.getConstantOperandVal(5);
5331 unsigned IsData = Op.getConstantOperandVal(6);
5332 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5333 (!IsData << 3) | // IsDataCache bit
5334 (Locality << 1) | // Cache level bits
5335 (unsigned)IsStream; // Stream bit
5336
5337 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5338 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5339 }
5340 case Intrinsic::aarch64_sme_str:
5341 case Intrinsic::aarch64_sme_ldr: {
5342 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5343 }
5344 case Intrinsic::aarch64_sme_za_enable:
5345 return DAG.getNode(
5346 AArch64ISD::SMSTART, DL, MVT::Other,
5347 Op->getOperand(0), // Chain
5348 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5349 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5350 case Intrinsic::aarch64_sme_za_disable:
5351 return DAG.getNode(
5352 AArch64ISD::SMSTOP, DL, MVT::Other,
5353 Op->getOperand(0), // Chain
5354 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5355 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5356 }
5357}
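Editor's sketch: the PrfOp packing used above for aarch64_prefetch, as plain C++. The field layout (write flag in bit 4, instruction-vs-data in bit 3, target level in bits 2:1, stream bit in bit 0) follows the comments in the code; the function name is hypothetical.

#include <cstdint>

unsigned encodePrfOp(unsigned IsWrite, unsigned Locality, unsigned IsStream,
                     unsigned IsData) {
  return (IsWrite << 4) |  // load (0) vs store (1) prefetch
         (!IsData << 3) |  // data cache (0) vs instruction cache (1)
         (Locality << 1) | // target cache level
         (IsStream & 1);   // keep (0) vs streaming (1)
}
// e.g. encodePrfOp(/*IsWrite=*/0, /*Locality=*/1, /*IsStream=*/0, /*IsData=*/1)
// == 0b00010.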
5358
5359SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5360 SelectionDAG &DAG) const {
5361 unsigned IntNo = Op.getConstantOperandVal(1);
5362 SDLoc DL(Op);
5363 switch (IntNo) {
5364 default:
5365 return SDValue(); // Don't custom lower most intrinsics.
5366 case Intrinsic::aarch64_mops_memset_tag: {
5367 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
5368 SDValue Chain = Node->getChain();
5369 SDValue Dst = Op.getOperand(2);
5370 SDValue Val = Op.getOperand(3);
5371 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5372 SDValue Size = Op.getOperand(4);
5373 auto Alignment = Node->getMemOperand()->getAlign();
5374 bool IsVol = Node->isVolatile();
5375 auto DstPtrInfo = Node->getPointerInfo();
5376
5377 const auto &SDI =
5378 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5379 SDValue MS =
5380 SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
5381 Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
5382
5383 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
5384 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
5385 // LowerOperationWrapper will complain that the number of results has
5386 // changed.
5387 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
5388 }
5389 }
5390}
5391
5392SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5393 SelectionDAG &DAG) const {
5394 unsigned IntNo = Op.getConstantOperandVal(0);
5395 SDLoc dl(Op);
5396 switch (IntNo) {
5397 default: return SDValue(); // Don't custom lower most intrinsics.
5398 case Intrinsic::thread_pointer: {
5399 EVT PtrVT = getPointerTy(DAG.getDataLayout());
5400 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
5401 }
5402 case Intrinsic::aarch64_neon_abs: {
5403 EVT Ty = Op.getValueType();
5404 if (Ty == MVT::i64) {
5405 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
5406 Op.getOperand(1));
5407 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
5408 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
5409 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
5410 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
5411 } else {
5412 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
5413 }
5414 }
5415 case Intrinsic::aarch64_neon_pmull64: {
5416 SDValue LHS = Op.getOperand(1);
5417 SDValue RHS = Op.getOperand(2);
5418
5419 std::optional<uint64_t> LHSLane =
5420 getConstantLaneNumOfExtractHalfOperand(LHS);
5421 std::optional<uint64_t> RHSLane =
5422 getConstantLaneNumOfExtractHalfOperand(RHS);
5423
5424 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
5425 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
5426
5427 // 'aarch64_neon_pmull64' takes i64 parameters, while pmull/pmull2
5428 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
5429 // which ISel recognizes better. For example, generate a ldr into d*
5430 // registers as opposed to a GPR load followed by a fmov.
5431 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
5432 std::optional<uint64_t> OtherLane,
5433 const SDLoc &dl,
5434 SelectionDAG &DAG) -> SDValue {
5435 // If the operand is a higher half itself, rewrite it to
5436 // extract_high_v2i64; this way aarch64_neon_pmull64 could
5437 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
5438 if (NLane && *NLane == 1)
5439 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5440 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
5441
5442 // Operand N is not a higher half but the other operand is.
5443 if (OtherLane && *OtherLane == 1) {
5444 // If this operand is a lower half, rewrite it to
5445 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
5446 // align lanes of two operands. A roundtrip sequence (to move from lane
5447 // 1 to lane 0) is like this:
5448 // mov x8, v0.d[1]
5449 // fmov d0, x8
5450 if (NLane && *NLane == 0)
5451 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5452 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
5453 N.getOperand(0),
5454 DAG.getConstant(0, dl, MVT::i64)),
5455 DAG.getConstant(1, dl, MVT::i64));
5456
5457 // Otherwise just dup from main to all lanes.
5458 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
5459 }
5460
5461 // Neither operand is an extract of higher half, so codegen may just use
5462 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
5463 assert(N.getValueType() == MVT::i64 &&
5464 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
5465 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
5466 };
5467
5468 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
5469 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
5470
5471 return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
5472 }
5473 case Intrinsic::aarch64_neon_smax:
5474 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
5475 Op.getOperand(1), Op.getOperand(2));
5476 case Intrinsic::aarch64_neon_umax:
5477 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
5478 Op.getOperand(1), Op.getOperand(2));
5479 case Intrinsic::aarch64_neon_smin:
5480 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
5481 Op.getOperand(1), Op.getOperand(2));
5482 case Intrinsic::aarch64_neon_umin:
5483 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
5484 Op.getOperand(1), Op.getOperand(2));
5485 case Intrinsic::aarch64_neon_scalar_sqxtn:
5486 case Intrinsic::aarch64_neon_scalar_sqxtun:
5487 case Intrinsic::aarch64_neon_scalar_uqxtn: {
5488 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
5489 if (Op.getValueType() == MVT::i32)
5490 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
5491 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
5492 Op.getOperand(0),
5493 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
5494 Op.getOperand(1))));
5495 return SDValue();
5496 }
5497 case Intrinsic::aarch64_sve_whilelo:
5498 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5499 /*IsEqual=*/false);
5500 case Intrinsic::aarch64_sve_whilelt:
5501 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5502 /*IsEqual=*/false);
5503 case Intrinsic::aarch64_sve_whilels:
5504 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5505 /*IsEqual=*/true);
5506 case Intrinsic::aarch64_sve_whilele:
5507 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5508 /*IsEqual=*/true);
5509 case Intrinsic::aarch64_sve_sunpkhi:
5510 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
5511 Op.getOperand(1));
5512 case Intrinsic::aarch64_sve_sunpklo:
5513 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
5514 Op.getOperand(1));
5515 case Intrinsic::aarch64_sve_uunpkhi:
5516 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
5517 Op.getOperand(1));
5518 case Intrinsic::aarch64_sve_uunpklo:
5519 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
5520 Op.getOperand(1));
5521 case Intrinsic::aarch64_sve_clasta_n:
5522 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
5523 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5524 case Intrinsic::aarch64_sve_clastb_n:
5525 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
5526 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5527 case Intrinsic::aarch64_sve_lasta:
5528 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
5529 Op.getOperand(1), Op.getOperand(2));
5530 case Intrinsic::aarch64_sve_lastb:
5531 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
5532 Op.getOperand(1), Op.getOperand(2));
5533 case Intrinsic::aarch64_sve_rev:
5534 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
5535 Op.getOperand(1));
5536 case Intrinsic::aarch64_sve_tbl:
5537 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
5538 Op.getOperand(1), Op.getOperand(2));
5539 case Intrinsic::aarch64_sve_trn1:
5540 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
5541 Op.getOperand(1), Op.getOperand(2));
5542 case Intrinsic::aarch64_sve_trn2:
5543 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
5544 Op.getOperand(1), Op.getOperand(2));
5545 case Intrinsic::aarch64_sve_uzp1:
5546 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
5547 Op.getOperand(1), Op.getOperand(2));
5548 case Intrinsic::aarch64_sve_uzp2:
5549 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
5550 Op.getOperand(1), Op.getOperand(2));
5551 case Intrinsic::aarch64_sve_zip1:
5552 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
5553 Op.getOperand(1), Op.getOperand(2));
5554 case Intrinsic::aarch64_sve_zip2:
5555 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
5556 Op.getOperand(1), Op.getOperand(2));
5557 case Intrinsic::aarch64_sve_splice:
5558 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
5559 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5560 case Intrinsic::aarch64_sve_ptrue:
5561 return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
5562 case Intrinsic::aarch64_sve_clz:
5563 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
5564 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5565 case Intrinsic::aarch64_sme_cntsb:
5566 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5567 DAG.getConstant(1, dl, MVT::i32));
5568 case Intrinsic::aarch64_sme_cntsh: {
5569 SDValue One = DAG.getConstant(1, dl, MVT::i32);
5570 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
5571 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
5572 }
5573 case Intrinsic::aarch64_sme_cntsw: {
5574 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5575 DAG.getConstant(1, dl, MVT::i32));
5576 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5577 DAG.getConstant(2, dl, MVT::i32));
5578 }
5579 case Intrinsic::aarch64_sme_cntsd: {
5580 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5581 DAG.getConstant(1, dl, MVT::i32));
5582 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5583 DAG.getConstant(3, dl, MVT::i32));
5584 }
5585 case Intrinsic::aarch64_sve_cnt: {
5586 SDValue Data = Op.getOperand(3);
5587 // CTPOP only supports integer operands.
5588 if (Data.getValueType().isFloatingPoint())
5589 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
5590 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
5591 Op.getOperand(2), Data, Op.getOperand(1));
5592 }
5593 case Intrinsic::aarch64_sve_dupq_lane:
5594 return LowerDUPQLane(Op, DAG);
5595 case Intrinsic::aarch64_sve_convert_from_svbool:
5596 if (Op.getValueType() == MVT::aarch64svcount)
5597 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
5598 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
5599 case Intrinsic::aarch64_sve_convert_to_svbool:
5600 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
5601 return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
5602 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
5603 case Intrinsic::aarch64_sve_fneg:
5604 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5605 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5606 case Intrinsic::aarch64_sve_frintp:
5607 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
5608 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5609 case Intrinsic::aarch64_sve_frintm:
5610 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
5611 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5612 case Intrinsic::aarch64_sve_frinti:
5613 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5614 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5615 case Intrinsic::aarch64_sve_frintx:
5616 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5617 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5618 case Intrinsic::aarch64_sve_frinta:
5619 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
5620 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5621 case Intrinsic::aarch64_sve_frintn:
5622 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
5623 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5624 case Intrinsic::aarch64_sve_frintz:
5625 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
5626 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5627 case Intrinsic::aarch64_sve_ucvtf:
5628 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
5629 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5630 Op.getOperand(1));
5631 case Intrinsic::aarch64_sve_scvtf:
5632 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
5633 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5634 Op.getOperand(1));
5635 case Intrinsic::aarch64_sve_fcvtzu:
5636 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
5637 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5638 Op.getOperand(1));
5639 case Intrinsic::aarch64_sve_fcvtzs:
5640 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
5641 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5642 Op.getOperand(1));
5643 case Intrinsic::aarch64_sve_fsqrt:
5644 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
5645 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5646 case Intrinsic::aarch64_sve_frecpx:
5647 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
5648 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5649 case Intrinsic::aarch64_sve_frecpe_x:
5650 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
5651 Op.getOperand(1));
5652 case Intrinsic::aarch64_sve_frecps_x:
5653 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
5654 Op.getOperand(1), Op.getOperand(2));
5655 case Intrinsic::aarch64_sve_frsqrte_x:
5656 return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
5657 Op.getOperand(1));
5658 case Intrinsic::aarch64_sve_frsqrts_x:
5659 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
5660 Op.getOperand(1), Op.getOperand(2));
5661 case Intrinsic::aarch64_sve_fabs:
5662 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5663 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5664 case Intrinsic::aarch64_sve_abs:
5665 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5666 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5667 case Intrinsic::aarch64_sve_neg:
5668 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5669 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5670 case Intrinsic::aarch64_sve_insr: {
5671 SDValue Scalar = Op.getOperand(2);
5672 EVT ScalarTy = Scalar.getValueType();
5673 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
5674 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
5675
5676 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
5677 Op.getOperand(1), Scalar);
5678 }
5679 case Intrinsic::aarch64_sve_rbit:
5680 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
5681 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5682 Op.getOperand(1));
5683 case Intrinsic::aarch64_sve_revb:
5684 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
5685 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5686 case Intrinsic::aarch64_sve_revh:
5687 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
5688 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5689 case Intrinsic::aarch64_sve_revw:
5690 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
5691 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5692 case Intrinsic::aarch64_sve_revd:
5693 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
5694 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5695 case Intrinsic::aarch64_sve_sxtb:
5696 return DAG.getNode(
5697 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5698 Op.getOperand(2), Op.getOperand(3),
5699 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5700 Op.getOperand(1));
5701 case Intrinsic::aarch64_sve_sxth:
5702 return DAG.getNode(
5703 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5704 Op.getOperand(2), Op.getOperand(3),
5705 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5706 Op.getOperand(1));
5707 case Intrinsic::aarch64_sve_sxtw:
5708 return DAG.getNode(
5709 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5710 Op.getOperand(2), Op.getOperand(3),
5711 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5712 Op.getOperand(1));
5713 case Intrinsic::aarch64_sve_uxtb:
5714 return DAG.getNode(
5715 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5716 Op.getOperand(2), Op.getOperand(3),
5717 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5718 Op.getOperand(1));
5719 case Intrinsic::aarch64_sve_uxth:
5720 return DAG.getNode(
5721 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5722 Op.getOperand(2), Op.getOperand(3),
5723 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5724 Op.getOperand(1));
5725 case Intrinsic::aarch64_sve_uxtw:
5726 return DAG.getNode(
5727 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5728 Op.getOperand(2), Op.getOperand(3),
5729 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5730 Op.getOperand(1));
5731 case Intrinsic::localaddress: {
5732 const auto &MF = DAG.getMachineFunction();
5733 const auto *RegInfo = Subtarget->getRegisterInfo();
5734 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
5735 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
5736 Op.getSimpleValueType());
5737 }
5738
5739 case Intrinsic::eh_recoverfp: {
5740 // FIXME: This needs to be implemented to correctly handle highly aligned
5741 // stack objects. For now we simply return the incoming FP. Refer D53541
5742 // for more details.
5743 SDValue FnOp = Op.getOperand(1);
5744 SDValue IncomingFPOp = Op.getOperand(2);
5745 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
5746 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
5747 if (!Fn)
5749 "llvm.eh.recoverfp must take a function as the first argument");
5750 return IncomingFPOp;
5751 }
5752
5753 case Intrinsic::aarch64_neon_vsri:
5754 case Intrinsic::aarch64_neon_vsli:
5755 case Intrinsic::aarch64_sve_sri:
5756 case Intrinsic::aarch64_sve_sli: {
5757 EVT Ty = Op.getValueType();
5758
5759 if (!Ty.isVector())
5760 report_fatal_error("Unexpected type for aarch64_neon_vsli");
5761
5762 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
5763
5764 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
5765 IntNo == Intrinsic::aarch64_sve_sri;
5766 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
5767 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
5768 Op.getOperand(3));
5769 }
5770
5771 case Intrinsic::aarch64_neon_srhadd:
5772 case Intrinsic::aarch64_neon_urhadd:
5773 case Intrinsic::aarch64_neon_shadd:
5774 case Intrinsic::aarch64_neon_uhadd: {
5775 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5776 IntNo == Intrinsic::aarch64_neon_shadd);
5777 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5778 IntNo == Intrinsic::aarch64_neon_urhadd);
5779 unsigned Opcode = IsSignedAdd
5780 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
5781 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
5782 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5783 Op.getOperand(2));
5784 }
5785 case Intrinsic::aarch64_neon_saddlp:
5786 case Intrinsic::aarch64_neon_uaddlp: {
5787 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
5788 ? AArch64ISD::UADDLP
5789 : AArch64ISD::SADDLP;
5790 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
5791 }
5792 case Intrinsic::aarch64_neon_sdot:
5793 case Intrinsic::aarch64_neon_udot:
5794 case Intrinsic::aarch64_sve_sdot:
5795 case Intrinsic::aarch64_sve_udot: {
5796 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
5797 IntNo == Intrinsic::aarch64_sve_udot)
5798 ? AArch64ISD::UDOT
5799 : AArch64ISD::SDOT;
5800 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5801 Op.getOperand(2), Op.getOperand(3));
5802 }
5803 case Intrinsic::get_active_lane_mask: {
5804 SDValue ID =
5805 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
5806
5807 EVT VT = Op.getValueType();
5808 if (VT.isScalableVector())
5809 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
5810 Op.getOperand(2));
5811
5812 // We can use the SVE whilelo instruction to lower this intrinsic by
5813 // creating the appropriate sequence of scalable vector operations and
5814 // then extracting a fixed-width subvector from the scalable vector.
5815
5816 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
5817 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
5818
5819 SDValue Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WhileVT, ID,
5820 Op.getOperand(1), Op.getOperand(2));
5821 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, dl, ContainerVT, Mask);
5822 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
5823 DAG.getVectorIdxConstant(0, dl));
5824 }
5825 case Intrinsic::aarch64_neon_uaddlv: {
5826 EVT OpVT = Op.getOperand(1).getValueType();
5827 EVT ResVT = Op.getValueType();
5828 if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
5829 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
5830 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
5831 SDValue UADDLV =
5832 DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
5833 SDValue EXTRACT_VEC_ELT =
5834 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
5835 DAG.getConstant(0, dl, MVT::i64));
5836 return EXTRACT_VEC_ELT;
5837 }
5838 return SDValue();
5839 }
5840 case Intrinsic::experimental_cttz_elts: {
5841 SDValue NewCttzElts =
5842 DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
5843
5844 return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
5845 }
5846 }
5847}
5848
5849bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
5850 if (VT.getVectorElementType() == MVT::i8 ||
5851 VT.getVectorElementType() == MVT::i16) {
5852 EltTy = MVT::i32;
5853 return true;
5854 }
5855 return false;
5856}
5857
5858bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
5859 EVT DataVT) const {
5860 const EVT IndexVT = Extend.getOperand(0).getValueType();
5861 // SVE only supports implicit extension of 32-bit indices.
5862 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
5863 return false;
5864
5865 // Indices cannot be smaller than the main data type.
5866 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
5867 return false;
5868
5869 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
5870 // element container type, which would violate the previous clause.
5871 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
5872}
5873
5874bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
5875 EVT ExtVT = ExtVal.getValueType();
5876 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
5877 return false;
5878
5879 // It may be worth creating extending masked loads if there are multiple
5880 // masked loads using the same predicate. That way we'll end up creating
5881 // extending masked loads that may then get split by the legaliser. This
5882 // results in just one set of predicate unpacks at the start, instead of
5883 // multiple sets of vector unpacks after each load.
5884 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
5885 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
5886 // Disable extending masked loads for fixed-width for now, since the code
5887 // quality doesn't look great.
5888 if (!ExtVT.isScalableVector())
5889 return false;
5890
5891 unsigned NumExtMaskedLoads = 0;
5892 for (auto *U : Ld->getMask()->uses())
5893 if (isa<MaskedLoadSDNode>(U))
5894 NumExtMaskedLoads++;
5895
5896 if (NumExtMaskedLoads <= 1)
5897 return false;
5898 }
5899 }
5900
5901 return true;
5902}
5903
5904unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
5905 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
5906 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
5908 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
5910 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
5912 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
5914 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
5916 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
5918 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
5920 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
5922 };
5923 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
5924 return AddrModes.find(Key)->second;
5925}
5926
5927unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
5928 switch (Opcode) {
5929 default:
5930 llvm_unreachable("unimplemented opcode");
5931 return Opcode;
5946 }
5947}
5948
5949SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
5950 SelectionDAG &DAG) const {
5951 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
5952
5953 SDLoc DL(Op);
5954 SDValue Chain = MGT->getChain();
5955 SDValue PassThru = MGT->getPassThru();
5956 SDValue Mask = MGT->getMask();
5957 SDValue BasePtr = MGT->getBasePtr();
5958 SDValue Index = MGT->getIndex();
5959 SDValue Scale = MGT->getScale();
5960 EVT VT = Op.getValueType();
5961 EVT MemVT = MGT->getMemoryVT();
5962 ISD::LoadExtType ExtType = MGT->getExtensionType();
5963 ISD::MemIndexType IndexType = MGT->getIndexType();
5964
5965 // SVE supports zero (and so undef) passthrough values only; everything else
5966 // must be handled manually by an explicit select on the load's output.
5967 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
5968 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
5969 SDValue Load =
5970 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5971 MGT->getMemOperand(), IndexType, ExtType);
5972 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5973 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
5974 }
5975
5976 bool IsScaled = MGT->isIndexScaled();
5977 bool IsSigned = MGT->isIndexSigned();
5978
5979 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
5980 // must be calculated beforehand.
5981 uint64_t ScaleVal = Scale->getAsZExtVal();
5982 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5983 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5984 EVT IndexVT = Index.getValueType();
5985 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5986 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5987 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5988
5989 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5990 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5991 MGT->getMemOperand(), IndexType, ExtType);
5992 }
5993
5994 // Lower fixed length gather to a scalable equivalent.
5995 if (VT.isFixedLengthVector()) {
5996 assert(Subtarget->useSVEForFixedLengthVectors() &&
5997 "Cannot lower when not using SVE for fixed vectors!");
5998
5999 // NOTE: Handle floating-point as if integer then bitcast the result.
6000 EVT DataVT = VT.isFloatingPoint() ? VT.changeTypeToInteger() : VT;
6001 MemVT = MemVT.changeVectorElementTypeToInteger();
6002
6003 // Find the smallest integer fixed length vector we can use for the gather.
6004 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6005 if (DataVT.getVectorElementType() == MVT::i64 ||
6006 Index.getValueType().getVectorElementType() == MVT::i64 ||
6007 Mask.getValueType().getVectorElementType() == MVT::i64)
6008 PromotedVT = VT.changeVectorElementType(MVT::i64);
6009
6010 // Promote vector operands except for passthrough, which we know is either
6011 // undef or zero, and thus best constructed directly.
6012 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6013 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6014 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6015
6016 // A promoted result type forces the need for an extending load.
6017 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6018 ExtType = ISD::EXTLOAD;
6019
6020 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6021
6022 // Convert fixed length vector operands to scalable.
6023 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6024 Index = convertToScalableVector(DAG, ContainerVT, Index);
6025 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6026 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
6027 : DAG.getConstant(0, DL, ContainerVT);
6028
6029 // Emit equivalent scalable vector gather.
6030 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6031 SDValue Load =
6032 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
6033 Ops, MGT->getMemOperand(), IndexType, ExtType);
6034
6035 // Extract fixed length data then convert to the required result type.
6036 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
6037 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
6038 if (VT.isFloatingPoint())
6039 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
6040
6041 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6042 }
6043
6044 // Everything else is legal.
6045 return Op;
6046}
6047
6048SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6049 SelectionDAG &DAG) const {
6050 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
6051
6052 SDLoc DL(Op);
6053 SDValue Chain = MSC->getChain();
6054 SDValue StoreVal = MSC->getValue();
6055 SDValue Mask = MSC->getMask();
6056 SDValue BasePtr = MSC->getBasePtr();
6057 SDValue Index = MSC->getIndex();
6058 SDValue Scale = MSC->getScale();
6059 EVT VT = StoreVal.getValueType();
6060 EVT MemVT = MSC->getMemoryVT();
6061 ISD::MemIndexType IndexType = MSC->getIndexType();
6062 bool Truncating = MSC->isTruncatingStore();
6063
6064 bool IsScaled = MSC->isIndexScaled();
6065 bool IsSigned = MSC->isIndexSigned();
6066
6067 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6068 // must be calculated beforehand.
6069 uint64_t ScaleVal = Scale->getAsZExtVal();
6070 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6071 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6072 EVT IndexVT = Index.getValueType();
6073 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6074 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6075 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6076
6077 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6078 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6079 MSC->getMemOperand(), IndexType, Truncating);
6080 }
6081
6082 // Lower fixed length scatter to a scalable equivalent.
6083 if (VT.isFixedLengthVector()) {
6084 assert(Subtarget->useSVEForFixedLengthVectors() &&
6085 "Cannot lower when not using SVE for fixed vectors!");
6086
6087 // Once bitcast we treat floating-point scatters as if integer.
6088 if (VT.isFloatingPoint()) {
6089 VT = VT.changeVectorElementTypeToInteger();
6090 MemVT = MemVT.changeVectorElementTypeToInteger();
6091 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6092 }
6093
6094 // Find the smallest integer fixed length vector we can use for the scatter.
6095 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6096 if (VT.getVectorElementType() == MVT::i64 ||
6097 Index.getValueType().getVectorElementType() == MVT::i64 ||
6098 Mask.getValueType().getVectorElementType() == MVT::i64)
6099 PromotedVT = VT.changeVectorElementType(MVT::i64);
6100
6101 // Promote vector operands.
6102 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6103 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6104 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6105 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6106
6107 // A promoted value type forces the need for a truncating store.
6108 if (PromotedVT != VT)
6109 Truncating = true;
6110
6111 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6112
6113 // Convert fixed length vector operands to scalable.
6114 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6115 Index = convertToScalableVector(DAG, ContainerVT, Index);
6116 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6117 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6118
6119 // Emit equivalent scalable vector scatter.
6120 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6121 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6122 MSC->getMemOperand(), IndexType, Truncating);
6123 }
6124
6125 // Everything else is legal.
6126 return Op;
6127}
6128
6129SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6130 SDLoc DL(Op);
6131 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6132 assert(LoadNode && "Expected custom lowering of a masked load node");
6133 EVT VT = Op->getValueType(0);
6134
6135 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6136 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6137
6138 SDValue PassThru = LoadNode->getPassThru();
6139 SDValue Mask = LoadNode->getMask();
6140
6141 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
6142 return Op;
6143
6144 SDValue Load = DAG.getMaskedLoad(
6145 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
6146 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
6147 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
6148 LoadNode->getExtensionType());
6149
6150 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6151
6152 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6153}
6154
6155// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6156static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
6157 EVT VT, EVT MemVT,
6158 SelectionDAG &DAG) {
6159 assert(VT.isVector() && "VT should be a vector type");
6160 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6161
6162 SDValue Value = ST->getValue();
6163
6164 // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and extracts
6165 // the word lane which represents the v4i8 subvector. It optimizes the store
6166 // to:
6167 //
6168 // xtn v0.8b, v0.8h
6169 // str s0, [x0]
6170
6171 SDValue Undef = DAG.getUNDEF(MVT::i16);
6172 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6173 {Undef, Undef, Undef, Undef});
6174
6175 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6176 Value, UndefVec);
6177 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
6178
6179 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6180 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6181 Trunc, DAG.getConstant(0, DL, MVT::i64));
6182
6183 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
6184 ST->getBasePtr(), ST->getMemOperand());
6185}
6186
6187// Custom lowering for any store, vector or scalar, default or truncating.
6188// Currently we only custom lower truncating stores from vector v4i16 to v4i8
6189// and volatile stores of i128.
6190SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6191 SelectionDAG &DAG) const {
6192 SDLoc Dl(Op);
6193 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
6194 assert (StoreNode && "Can only custom lower store nodes");
6195
6196 SDValue Value = StoreNode->getValue();
6197
6198 EVT VT = Value.getValueType();
6199 EVT MemVT = StoreNode->getMemoryVT();
6200
6201 if (VT.isVector()) {
6202 if (useSVEForFixedLengthVectorVT(
6203 VT,
6204 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6205 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6206
6207 unsigned AS = StoreNode->getAddressSpace();
6208 Align Alignment = StoreNode->getAlign();
6209 if (Alignment < MemVT.getStoreSize() &&
6210 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
6211 StoreNode->getMemOperand()->getFlags(),
6212 nullptr)) {
6213 return scalarizeVectorStore(StoreNode, DAG);
6214 }
6215
6216 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6217 MemVT == MVT::v4i8) {
6218 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
6219 }
6220 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
6221 // the custom lowering, as there are no un-paired non-temporal stores and
6222 // legalization will break up 256 bit inputs.
6223 ElementCount EC = MemVT.getVectorElementCount();
6224 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6225 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6226 (MemVT.getScalarSizeInBits() == 8u ||
6227 MemVT.getScalarSizeInBits() == 16u ||
6228 MemVT.getScalarSizeInBits() == 32u ||
6229 MemVT.getScalarSizeInBits() == 64u)) {
6230 SDValue Lo =
6231 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6232 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6233 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
6234 SDValue Hi =
6235 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6236 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6237 StoreNode->getValue(),
6238 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
6239 SDValue Result = DAG.getMemIntrinsicNode(
6240 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
6241 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6242 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6243 return Result;
6244 }
6245 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6246 return LowerStore128(Op, DAG);
6247 } else if (MemVT == MVT::i64x8) {
6248 SDValue Value = StoreNode->getValue();
6249 assert(Value->getValueType(0) == MVT::i64x8);
6250 SDValue Chain = StoreNode->getChain();
6251 SDValue Base = StoreNode->getBasePtr();
6252 EVT PtrVT = Base.getValueType();
6253 for (unsigned i = 0; i < 8; i++) {
6254 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6255 Value, DAG.getConstant(i, Dl, MVT::i32));
6256 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
6257 DAG.getConstant(i * 8, Dl, PtrVT));
6258 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
6259 StoreNode->getOriginalAlign());
6260 }
6261 return Chain;
6262 }
6263
6264 return SDValue();
6265}
6266
6267/// Lower atomic or volatile 128-bit stores to a single STP instruction.
6268SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6269 SelectionDAG &DAG) const {
6270 MemSDNode *StoreNode = cast<MemSDNode>(Op);
6271 assert(StoreNode->getMemoryVT() == MVT::i128);
6272 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6273
6274 bool IsStoreRelease =
6275 StoreNode->getMergedOrdering() == AtomicOrdering::Release;
6276 if (StoreNode->isAtomic())
6277 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6278 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6279 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
6280 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
6281
6282 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6283 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6284 ? StoreNode->getOperand(1)
6285 : StoreNode->getOperand(2);
6286 SDLoc DL(Op);
6287 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
6288 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
6289 if (DAG.getDataLayout().isBigEndian())
6290 std::swap(StoreValue.first, StoreValue.second);
6291 SDValue Result = DAG.getMemIntrinsicNode(
6292 Opcode, DL, DAG.getVTList(MVT::Other),
6293 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6294 StoreNode->getBasePtr()},
6295 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6296 return Result;
6297}
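// Editorial note, not part of the original source: the node built above is
// selected to a single paired store, roughly
//
//   stp   x2, x3, [x0]    // volatile or LSE2 atomic i128 store
//   stilp x2, x3, [x0]    // store-release form, when FEAT_LRCPC3 is available
//
// with the two 64-bit halves swapped first on big-endian targets, as done above.
// Register choice here is purely illustrative.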
6298
6299SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6300 SelectionDAG &DAG) const {
6301 SDLoc DL(Op);
6302 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
6303 assert(LoadNode && "Expected custom lowering of a load node");
6304
6305 if (LoadNode->getMemoryVT() == MVT::i64x8) {
6306 SmallVector<SDValue, 8> Ops;
6307 SDValue Base = LoadNode->getBasePtr();
6308 SDValue Chain = LoadNode->getChain();
6309 EVT PtrVT = Base.getValueType();
6310 for (unsigned i = 0; i < 8; i++) {
6311 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
6312 DAG.getConstant(i * 8, DL, PtrVT));
6313 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
6314 LoadNode->getPointerInfo(),
6315 LoadNode->getOriginalAlign());
6316 Ops.push_back(Part);
6317 Chain = SDValue(Part.getNode(), 1);
6318 }
6319 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
6320 return DAG.getMergeValues({Loaded, Chain}, DL);
6321 }
6322
6323 // Custom lowering for extending v4i8 vector loads.
6324 EVT VT = Op->getValueType(0);
6325 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
6326
6327 if (LoadNode->getMemoryVT() != MVT::v4i8)
6328 return SDValue();
6329
6330 unsigned ExtType;
6331 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
6332 ExtType = ISD::SIGN_EXTEND;
6333 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
6334 LoadNode->getExtensionType() == ISD::EXTLOAD)
6335 ExtType = ISD::ZERO_EXTEND;
6336 else
6337 return SDValue();
6338
6339 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
6340 LoadNode->getBasePtr(), MachinePointerInfo());
6341 SDValue Chain = Load.getValue(1);
6342 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
6343 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
6344 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
6345 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
6346 DAG.getConstant(0, DL, MVT::i64));
6347 if (VT == MVT::v4i32)
6348 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
6349 return DAG.getMergeValues({Ext, Chain}, DL);
6350}
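// Editorial note, not part of the original source: the extending v4i8 load path
// above loads the four bytes as one 32-bit FP scalar and widens in the vector
// unit, giving roughly
//
//   ldr   s0, [x0]          // all four i8 lanes in one load
//   ushll v0.8h, v0.8b, #0  // zero-extend to 16-bit lanes (sshll for SEXTLOAD)
//
// with a second lengthening step when the result type is v4i32.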
6351
6352// Generate SUBS and CSEL for integer abs.
6353SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
6354 MVT VT = Op.getSimpleValueType();
6355
6356 if (VT.isVector())
6357 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
6358
6359 SDLoc DL(Op);
6360 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
6361 Op.getOperand(0));
6362 // Generate SUBS & CSEL.
6363 SDValue Cmp =
6364 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
6365 Op.getOperand(0), DAG.getConstant(0, DL, VT));
6366 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
6367 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
6368 Cmp.getValue(1));
6369}
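// Editorial note, not part of the original source: for a plain i64 abs the nodes
// built above correspond, before any later folding, to roughly
//
//   neg  x8, x0           // Neg = 0 - x
//   cmp  x0, #0           // the SUBS, with its value result unused
//   csel x0, x0, x8, pl   // keep x if it was non-negative, else the negation
//
// register assignment here is purely illustrative.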
6370
6371static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
6372 SDValue Chain = Op.getOperand(0);
6373 SDValue Cond = Op.getOperand(1);
6374 SDValue Dest = Op.getOperand(2);
6375
6376 AArch64CC::CondCode CC;
6377 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
6378 SDLoc dl(Op);
6379 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
6380 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6381 Cmp);
6382 }
6383
6384 return SDValue();
6385}
6386
6387// Treat FSHR with constant shifts as a legal operation; otherwise it is expanded.
6388// FSHL is converted to FSHR before deciding what to do with it.
6389static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
6390 SDValue Shifts = Op.getOperand(2);
6391 // Check if the shift amount is a constant
6392 // If opcode is FSHL, convert it to FSHR
6393 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
6394 SDLoc DL(Op);
6395 MVT VT = Op.getSimpleValueType();
6396
6397 if (Op.getOpcode() == ISD::FSHL) {
6398 unsigned int NewShiftNo =
6399 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
6400 return DAG.getNode(
6401 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
6402 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
6403 } else if (Op.getOpcode() == ISD::FSHR) {
6404 return Op;
6405 }
6406 }
6407
6408 return SDValue();
6409}
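// Editorial note, not part of the original source: the rewrite above uses the
// identity fshl(a, b, c) == fshr(a, b, BitWidth - c) for constant shift amounts.
// For i32 operands, for example, a funnel shift left by 8 becomes a funnel shift
// right by 24:
//
//   fshl(a, b, 8) == ((a << 8) | (b >> 24)) == fshr(a, b, 24)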
6410
6411static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
6412 SDValue X = Op.getOperand(0);
6413 EVT XScalarTy = X.getValueType();
6414 SDValue Exp = Op.getOperand(1);
6415
6416 SDLoc DL(Op);
6417 EVT XVT, ExpVT;
6418 switch (Op.getSimpleValueType().SimpleTy) {
6419 default:
6420 return SDValue();
6421 case MVT::bf16:
6422 case MVT::f16:
6423 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
6424 [[fallthrough]];
6425 case MVT::f32:
6426 XVT = MVT::nxv4f32;
6427 ExpVT = MVT::nxv4i32;
6428 break;
6429 case MVT::f64:
6430 XVT = MVT::nxv2f64;
6431 ExpVT = MVT::nxv2i64;
6432 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
6433 break;
6434 }
6435
6436 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
6437 SDValue VX =
6438 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
6439 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
6440 DAG.getUNDEF(ExpVT), Exp, Zero);
6441 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
6442 AArch64SVEPredPattern::all);
6443 SDValue FScale =
6444 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
6445 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
6446 VPg, VX, VExp);
6447 SDValue Final =
6448 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
6449 if (X.getValueType() != XScalarTy)
6450 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
6451 DAG.getIntPtrConstant(1, SDLoc(Op)));
6452 return Final;
6453}
6454
6455SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
6456 SelectionDAG &DAG) const {
6457 LLVM_DEBUG(dbgs() << "Custom lowering: ");
6458 LLVM_DEBUG(Op.dump());
6459
6460 switch (Op.getOpcode()) {
6461 default:
6462 llvm_unreachable("unimplemented operand");
6463 return SDValue();
6464 case ISD::BITCAST:
6465 return LowerBITCAST(Op, DAG);
6466 case ISD::GlobalAddress:
6467 return LowerGlobalAddress(Op, DAG);
6469 return LowerGlobalTLSAddress(Op, DAG);
6470 case ISD::SETCC:
6471 case ISD::STRICT_FSETCC:
6473 return LowerSETCC(Op, DAG);
6474 case ISD::SETCCCARRY:
6475 return LowerSETCCCARRY(Op, DAG);
6476 case ISD::BRCOND:
6477 return LowerBRCOND(Op, DAG);
6478 case ISD::BR_CC:
6479 return LowerBR_CC(Op, DAG);
6480 case ISD::SELECT:
6481 return LowerSELECT(Op, DAG);
6482 case ISD::SELECT_CC:
6483 return LowerSELECT_CC(Op, DAG);
6484 case ISD::JumpTable:
6485 return LowerJumpTable(Op, DAG);
6486 case ISD::BR_JT:
6487 return LowerBR_JT(Op, DAG);
6488 case ISD::ConstantPool:
6489 return LowerConstantPool(Op, DAG);
6490 case ISD::BlockAddress:
6491 return LowerBlockAddress(Op, DAG);
6492 case ISD::VASTART:
6493 return LowerVASTART(Op, DAG);
6494 case ISD::VACOPY:
6495 return LowerVACOPY(Op, DAG);
6496 case ISD::VAARG:
6497 return LowerVAARG(Op, DAG);
6498 case ISD::UADDO_CARRY:
6499 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
6500 case ISD::USUBO_CARRY:
6501 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
6502 case ISD::SADDO_CARRY:
6503 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
6504 case ISD::SSUBO_CARRY:
6505 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
6506 case ISD::SADDO:
6507 case ISD::UADDO:
6508 case ISD::SSUBO:
6509 case ISD::USUBO:
6510 case ISD::SMULO:
6511 case ISD::UMULO:
6512 return LowerXALUO(Op, DAG);
6513 case ISD::FADD:
6514 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
6515 case ISD::FSUB:
6516 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
6517 case ISD::FMUL:
6518 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
6519 case ISD::FMA:
6520 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
6521 case ISD::FDIV:
6522 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
6523 case ISD::FNEG:
6524 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
6525 case ISD::FCEIL:
6526 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
6527 case ISD::FFLOOR:
6528 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
6529 case ISD::FNEARBYINT:
6530 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
6531 case ISD::FRINT:
6532 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
6533 case ISD::FROUND:
6534 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
6535 case ISD::FROUNDEVEN:
6536 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
6537 case ISD::FTRUNC:
6538 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
6539 case ISD::FSQRT:
6540 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
6541 case ISD::FABS:
6542 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
6543 case ISD::FP_ROUND:
6545 return LowerFP_ROUND(Op, DAG);
6546 case ISD::FP_EXTEND:
6547 return LowerFP_EXTEND(Op, DAG);
6548 case ISD::FRAMEADDR:
6549 return LowerFRAMEADDR(Op, DAG);
6550 case ISD::SPONENTRY:
6551 return LowerSPONENTRY(Op, DAG);
6552 case ISD::RETURNADDR:
6553 return LowerRETURNADDR(Op, DAG);
6555 return LowerADDROFRETURNADDR(Op, DAG);
6557 return LowerCONCAT_VECTORS(Op, DAG);
6559 return LowerINSERT_VECTOR_ELT(Op, DAG);
6561 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6562 case ISD::BUILD_VECTOR:
6563 return LowerBUILD_VECTOR(Op, DAG);
6565 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
6567 return LowerVECTOR_SHUFFLE(Op, DAG);
6568 case ISD::SPLAT_VECTOR:
6569 return LowerSPLAT_VECTOR(Op, DAG);
6571 return LowerEXTRACT_SUBVECTOR(Op, DAG);
6573 return LowerINSERT_SUBVECTOR(Op, DAG);
6574 case ISD::SDIV:
6575 case ISD::UDIV:
6576 return LowerDIV(Op, DAG);
6577 case ISD::SMIN:
6578 case ISD::UMIN:
6579 case ISD::SMAX:
6580 case ISD::UMAX:
6581 return LowerMinMax(Op, DAG);
6582 case ISD::SRA:
6583 case ISD::SRL:
6584 case ISD::SHL:
6585 return LowerVectorSRA_SRL_SHL(Op, DAG);
6586 case ISD::SHL_PARTS:
6587 case ISD::SRL_PARTS:
6588 case ISD::SRA_PARTS:
6589 return LowerShiftParts(Op, DAG);
6590 case ISD::CTPOP:
6591 case ISD::PARITY:
6592 return LowerCTPOP_PARITY(Op, DAG);
6593 case ISD::FCOPYSIGN:
6594 return LowerFCOPYSIGN(Op, DAG);
6595 case ISD::OR:
6596 return LowerVectorOR(Op, DAG);
6597 case ISD::XOR:
6598 return LowerXOR(Op, DAG);
6599 case ISD::PREFETCH:
6600 return LowerPREFETCH(Op, DAG);
6601 case ISD::SINT_TO_FP:
6602 case ISD::UINT_TO_FP:
6605 return LowerINT_TO_FP(Op, DAG);
6606 case ISD::FP_TO_SINT:
6607 case ISD::FP_TO_UINT:
6610 return LowerFP_TO_INT(Op, DAG);
6613 return LowerFP_TO_INT_SAT(Op, DAG);
6614 case ISD::FSINCOS:
6615 return LowerFSINCOS(Op, DAG);
6616 case ISD::GET_ROUNDING:
6617 return LowerGET_ROUNDING(Op, DAG);
6618 case ISD::SET_ROUNDING:
6619 return LowerSET_ROUNDING(Op, DAG);
6620 case ISD::GET_FPMODE:
6621 return LowerGET_FPMODE(Op, DAG);
6622 case ISD::SET_FPMODE:
6623 return LowerSET_FPMODE(Op, DAG);
6624 case ISD::RESET_FPMODE:
6625 return LowerRESET_FPMODE(Op, DAG);
6626 case ISD::MUL:
6627 return LowerMUL(Op, DAG);
6628 case ISD::MULHS:
6629 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
6630 case ISD::MULHU:
6631 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
6633 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6635 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6637 return LowerINTRINSIC_VOID(Op, DAG);
6638 case ISD::ATOMIC_STORE:
6639 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
6640 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
6641 return LowerStore128(Op, DAG);
6642 }
6643 return SDValue();
6644 case ISD::STORE:
6645 return LowerSTORE(Op, DAG);
6646 case ISD::MSTORE:
6647 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
6648 case ISD::MGATHER:
6649 return LowerMGATHER(Op, DAG);
6650 case ISD::MSCATTER:
6651 return LowerMSCATTER(Op, DAG);
6653 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
6654 case ISD::VECREDUCE_ADD:
6655 case ISD::VECREDUCE_AND:
6656 case ISD::VECREDUCE_OR:
6657 case ISD::VECREDUCE_XOR:
6667 return LowerVECREDUCE(Op, DAG);
6669 return LowerATOMIC_LOAD_AND(Op, DAG);
6671 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6672 case ISD::VSCALE:
6673 return LowerVSCALE(Op, DAG);
6674 case ISD::ANY_EXTEND:
6675 case ISD::SIGN_EXTEND:
6676 case ISD::ZERO_EXTEND:
6677 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
6679 // Only custom lower when ExtraVT has a legal byte based element type.
6680 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
6681 EVT ExtraEltVT = ExtraVT.getVectorElementType();
6682 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
6683 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
6684 return SDValue();
6685
6686 return LowerToPredicatedOp(Op, DAG,
6688 }
6689 case ISD::TRUNCATE:
6690 return LowerTRUNCATE(Op, DAG);
6691 case ISD::MLOAD:
6692 return LowerMLOAD(Op, DAG);
6693 case ISD::LOAD:
6694 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
6695 !Subtarget->isNeonAvailable()))
6696 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
6697 return LowerLOAD(Op, DAG);
6698 case ISD::ADD:
6699 case ISD::AND:
6700 case ISD::SUB:
6701 return LowerToScalableOp(Op, DAG);
6702 case ISD::FMAXIMUM:
6703 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
6704 case ISD::FMAXNUM:
6705 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
6706 case ISD::FMINIMUM:
6707 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
6708 case ISD::FMINNUM:
6709 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
6710 case ISD::VSELECT:
6711 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
6712 case ISD::ABS:
6713 return LowerABS(Op, DAG);
6714 case ISD::ABDS:
6715 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
6716 case ISD::ABDU:
6717 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
6718 case ISD::AVGFLOORS:
6719 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
6720 case ISD::AVGFLOORU:
6721 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
6722 case ISD::AVGCEILS:
6723 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
6724 case ISD::AVGCEILU:
6725 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
6726 case ISD::BITREVERSE:
6727 return LowerBitreverse(Op, DAG);
6728 case ISD::BSWAP:
6729 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
6730 case ISD::CTLZ:
6731 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
6732 case ISD::CTTZ:
6733 return LowerCTTZ(Op, DAG);
6734 case ISD::VECTOR_SPLICE:
6735 return LowerVECTOR_SPLICE(Op, DAG);
6737 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
6739 return LowerVECTOR_INTERLEAVE(Op, DAG);
6740 case ISD::LRINT:
6741 case ISD::LLRINT:
6742 if (Op.getValueType().isVector())
6743 return LowerVectorXRINT(Op, DAG);
6744 [[fallthrough]];
6745 case ISD::LROUND:
6746 case ISD::LLROUND: {
6747 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
6748 Op.getOperand(0).getValueType() == MVT::bf16) &&
6749 "Expected custom lowering of rounding operations only for f16");
6750 SDLoc DL(Op);
6751 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
6752 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
6753 }
6754 case ISD::STRICT_LROUND:
6756 case ISD::STRICT_LRINT:
6757 case ISD::STRICT_LLRINT: {
6758 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
6759 Op.getOperand(1).getValueType() == MVT::bf16) &&
6760 "Expected custom lowering of rounding operations only for f16");
6761 SDLoc DL(Op);
6762 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
6763 {Op.getOperand(0), Op.getOperand(1)});
6764 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
6765 {Ext.getValue(1), Ext.getValue(0)});
6766 }
6767 case ISD::WRITE_REGISTER: {
6768 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
6769 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
6770 SDLoc DL(Op);
6771
6772 SDValue Chain = Op.getOperand(0);
6773 SDValue SysRegName = Op.getOperand(1);
6774 std::pair<SDValue, SDValue> Pair =
6775 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
6776
6777 // chain = MSRR(chain, sysregname, lo, hi)
6778 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
6779 SysRegName, Pair.first, Pair.second);
6780
6781 return Result;
6782 }
6783 case ISD::FSHL:
6784 case ISD::FSHR:
6785 return LowerFunnelShift(Op, DAG);
6786 case ISD::FLDEXP:
6787 return LowerFLDEXP(Op, DAG);
6789 return LowerVECTOR_HISTOGRAM(Op, DAG);
6790 }
6791}
6792
6794 return !Subtarget->useSVEForFixedLengthVectors();
6795}
6796
6798 EVT VT, bool OverrideNEON) const {
6799 if (!VT.isFixedLengthVector() || !VT.isSimple())
6800 return false;
6801
6802 // Don't use SVE for vectors we cannot scalarize if required.
6803 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
6804 // Fixed length predicates should be promoted to i8.
6805 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
6806 case MVT::i1:
6807 default:
6808 return false;
6809 case MVT::i8:
6810 case MVT::i16:
6811 case MVT::i32:
6812 case MVT::i64:
6813 case MVT::f16:
6814 case MVT::f32:
6815 case MVT::f64:
6816 break;
6817 }
6818
6819 // NEON-sized vectors can be emulated using SVE instructions.
6820 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
6821 return Subtarget->hasSVEorSME();
6822
6823 // Ensure NEON MVTs only belong to a single register class.
6824 if (VT.getFixedSizeInBits() <= 128)
6825 return false;
6826
6827 // Ensure wider than NEON code generation is enabled.
6828 if (!Subtarget->useSVEForFixedLengthVectors())
6829 return false;
6830
6831 // Don't use SVE for types that don't fit.
6832 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
6833 return false;
6834
6835 // TODO: Perhaps an artificial restriction, but worth having whilst getting
6836 // the base fixed length SVE support in place.
6837 if (!VT.isPow2VectorType())
6838 return false;
6839
6840 return true;
6841}
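// Editorial note, not part of the original source: as a concrete reading of the
// rules above, assuming the backend is told the minimum SVE register size is 256
// bits (e.g. -aarch64-sve-vector-bits-min=256), a type such as v8i32 or v16i16 is
// accepted (wider than NEON, power-of-two, fits in 256 bits), while v4i32 is still
// rejected as a NEON-sized type unless OverrideNEON is passed (and SVE/SME is
// available).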
6842
6843//===----------------------------------------------------------------------===//
6844// Calling Convention Implementation
6845//===----------------------------------------------------------------------===//
6846
6847static unsigned getIntrinsicID(const SDNode *N) {
6848 unsigned Opcode = N->getOpcode();
6849 switch (Opcode) {
6850 default:
6853 unsigned IID = N->getConstantOperandVal(0);
6854 if (IID < Intrinsic::num_intrinsics)
6855 return IID;
6857 }
6858 }
6859}
6860
6862 SDValue N1) const {
6863 if (!N0.hasOneUse())
6864 return false;
6865
6866 unsigned IID = getIntrinsicID(N1.getNode());
6867 // Avoid reassociating expressions that can be lowered to smlal/umlal.
6868 if (IID == Intrinsic::aarch64_neon_umull ||
6869 N1.getOpcode() == AArch64ISD::UMULL ||
6870 IID == Intrinsic::aarch64_neon_smull ||
6872 return N0.getOpcode() != ISD::ADD;
6873
6874 return true;
6875}
6876
6877/// Selects the correct CCAssignFn for a given CallingConvention value.
6879 bool IsVarArg) const {
6880 switch (CC) {
6881 default:
6882 report_fatal_error("Unsupported calling convention.");
6883 case CallingConv::GHC:
6884 return CC_AArch64_GHC;
6885 case CallingConv::C:
6886 case CallingConv::Fast:
6890 case CallingConv::Swift:
6892 case CallingConv::Tail:
6893 case CallingConv::GRAAL:
6894 if (Subtarget->isTargetWindows()) {
6895 if (IsVarArg) {
6896 if (Subtarget->isWindowsArm64EC())
6899 }
6900 return CC_AArch64_Win64PCS;
6901 }
6902 if (!Subtarget->isTargetDarwin())
6903 return CC_AArch64_AAPCS;
6904 if (!IsVarArg)
6905 return CC_AArch64_DarwinPCS;
6908 case CallingConv::Win64:
6909 if (IsVarArg) {
6910 if (Subtarget->isWindowsArm64EC())
6913 }
6914 return CC_AArch64_Win64PCS;
6916 if (Subtarget->isWindowsArm64EC())
6923 return CC_AArch64_AAPCS;
6928 }
6929}
6930
6931CCAssignFn *
6933 switch (CC) {
6934 default:
6935 return RetCC_AArch64_AAPCS;
6939 if (Subtarget->isWindowsArm64EC())
6941 return RetCC_AArch64_AAPCS;
6942 }
6943}
6944
6945
6946unsigned
6947AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
6948 SelectionDAG &DAG) const {
6950 MachineFrameInfo &MFI = MF.getFrameInfo();
6951
6952 // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
6953 SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6954 DAG.getConstant(1, DL, MVT::i32));
6955 SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
6956 SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
6957 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
6958 SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
6959 Chain = Buffer.getValue(1);
6960 MFI.CreateVariableSizedObject(Align(1), nullptr);
6961
6962 // Allocate an additional TPIDR2 object on the stack (16 bytes)
6963 unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);
6964
6965 // Store the buffer pointer to the TPIDR2 stack object.
6968 TPIDR2Obj,
6970 Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);
6971
6972 // Set the reserved bytes (10-15) to zero
6973 EVT PtrTy = Ptr.getValueType();
6974 SDValue ReservedPtr =
6975 DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy));
6976 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr,
6977 MPI);
6978 ReservedPtr =
6979 DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy));
6980 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr,
6981 MPI);
6982
6983 return TPIDR2Obj;
6984}
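// Editorial note, not part of the original source: a rough picture of the 16-byte
// TPIDR2 block set up above (field names are illustrative only):
//
//   struct TPIDR2Block {
//     void    *ZABuffer;       // set above; points at the SVL.B x SVL.B byte buffer
//     uint16_t NumSaveSlices;  // written later, around calls that need the lazy save
//     uint8_t  Reserved[6];    // bytes 10-15, zeroed by the two stores above
//   };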
6985
6986static bool isPassedInFPR(EVT VT) {
6987 return VT.isFixedLengthVector() ||
6988 (VT.isFloatingPoint() && !VT.isScalableVector());
6989}
6990
6991SDValue AArch64TargetLowering::LowerFormalArguments(
6992 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
6993 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
6994 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
6996 const Function &F = MF.getFunction();
6997 MachineFrameInfo &MFI = MF.getFrameInfo();
6998 bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
6999 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
7000 (isVarArg && Subtarget->isWindowsArm64EC());
7002
7004 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
7006 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
7007 FuncInfo->setIsSVECC(true);
7008
7009 // Assign locations to all of the incoming arguments.
7011 DenseMap<unsigned, SDValue> CopiedRegs;
7012 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
7013
7014 // At this point, Ins[].VT may already be promoted to i32. To correctly
7015 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
7016 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
7017 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
7018 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
7019 // LocVT.
7020 unsigned NumArgs = Ins.size();
7021 Function::const_arg_iterator CurOrigArg = F.arg_begin();
7022 unsigned CurArgIdx = 0;
7023 for (unsigned i = 0; i != NumArgs; ++i) {
7024 MVT ValVT = Ins[i].VT;
7025 if (Ins[i].isOrigArg()) {
7026 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
7027 CurArgIdx = Ins[i].getOrigArgIndex();
7028
7029 // Get type of the original argument.
7030 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
7031 /*AllowUnknown*/ true);
7032 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
7033 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7034 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7035 ValVT = MVT::i8;
7036 else if (ActualMVT == MVT::i16)
7037 ValVT = MVT::i16;
7038 }
7039 bool UseVarArgCC = false;
7040 if (IsWin64)
7041 UseVarArgCC = isVarArg;
7042 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
7043 bool Res =
7044 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
7045 assert(!Res && "Call operand has unhandled type");
7046 (void)Res;
7047 }
7048
7050 bool IsLocallyStreaming =
7051 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
7052 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
7053 SDValue Glue = Chain.getValue(1);
7054
7055 SmallVector<SDValue, 16> ArgValues;
7056 unsigned ExtraArgLocs = 0;
7057 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
7058 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7059
7060 if (Ins[i].Flags.isByVal()) {
7061 // Byval is used for HFAs in the PCS, but the system should work in a
7062 // non-compliant manner for larger structs.
7063 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7064 int Size = Ins[i].Flags.getByValSize();
7065 unsigned NumRegs = (Size + 7) / 8;
7066
7067 // FIXME: This works on big-endian for composite byvals, which are the common
7068 // case. It should work for fundamental types too.
7069 unsigned FrameIdx =
7070 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
7071 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
7072 InVals.push_back(FrameIdxN);
7073
7074 continue;
7075 }
7076
7077 if (Ins[i].Flags.isSwiftAsync())
7079
7080 SDValue ArgValue;
7081 if (VA.isRegLoc()) {
7082 // Arguments stored in registers.
7083 EVT RegVT = VA.getLocVT();
7084 const TargetRegisterClass *RC;
7085
7086 if (RegVT == MVT::i32)
7087 RC = &AArch64::GPR32RegClass;
7088 else if (RegVT == MVT::i64)
7089 RC = &AArch64::GPR64RegClass;
7090 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
7091 RC = &AArch64::FPR16RegClass;
7092 else if (RegVT == MVT::f32)
7093 RC = &AArch64::FPR32RegClass;
7094 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
7095 RC = &AArch64::FPR64RegClass;
7096 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
7097 RC = &AArch64::FPR128RegClass;
7098 else if (RegVT.isScalableVector() &&
7099 RegVT.getVectorElementType() == MVT::i1) {
7100 FuncInfo->setIsSVECC(true);
7101 RC = &AArch64::PPRRegClass;
7102 } else if (RegVT == MVT::aarch64svcount) {
7103 FuncInfo->setIsSVECC(true);
7104 RC = &AArch64::PPRRegClass;
7105 } else if (RegVT.isScalableVector()) {
7106 FuncInfo->setIsSVECC(true);
7107 RC = &AArch64::ZPRRegClass;
7108 } else
7109 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
7110
7111 // Transform the arguments in physical registers into virtual ones.
7112 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
7113
7114 if (IsLocallyStreaming) {
7115 // LocallyStreamingFunctions must insert the SMSTART in the correct
7116 // position, so we use Glue to ensure no instructions can be scheduled
7117 // between the chain of:
7118 // t0: ch,glue = EntryNode
7119 // t1: res,ch,glue = CopyFromReg
7120 // ...
7121 // tn: res,ch,glue = CopyFromReg t(n-1), ..
7122 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
7123 // ^^^^^^
7124 // This will be the new Chain/Root node.
7125 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
7126 Glue = ArgValue.getValue(2);
7127 if (isPassedInFPR(ArgValue.getValueType())) {
7128 ArgValue =
7130 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
7131 {ArgValue, Glue});
7132 Glue = ArgValue.getValue(1);
7133 }
7134 } else
7135 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
7136
7137 // If this is an 8, 16 or 32-bit value, it is really passed promoted
7138 // to 64 bits. Insert an assert[sz]ext to capture this, then
7139 // truncate to the right size.
7140 switch (VA.getLocInfo()) {
7141 default:
7142 llvm_unreachable("Unknown loc info!");
7143 case CCValAssign::Full:
7144 break;
7146 assert(
7147 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7148 "Indirect arguments should be scalable on most subtargets");
7149 break;
7150 case CCValAssign::BCvt:
7151 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
7152 break;
7153 case CCValAssign::AExt:
7154 case CCValAssign::SExt:
7155 case CCValAssign::ZExt:
7156 break;
7158 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
7159 DAG.getConstant(32, DL, RegVT));
7160 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
7161 break;
7162 }
7163 } else { // VA.isRegLoc()
7164 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7165 unsigned ArgOffset = VA.getLocMemOffset();
7166 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
7167 ? VA.getLocVT().getSizeInBits()
7168 : VA.getValVT().getSizeInBits()) / 8;
7169
7170 uint32_t BEAlign = 0;
7171 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
7172 !Ins[i].Flags.isInConsecutiveRegs())
7173 BEAlign = 8 - ArgSize;
7174
7175 SDValue FIN;
7176 MachinePointerInfo PtrInfo;
7177 if (StackViaX4) {
7178 // In both the ARM64EC varargs convention and the thunk convention,
7179 // arguments on the stack are accessed relative to x4, not sp. In
7180 // the thunk convention, there's an additional offset of 32 bytes
7181 // to account for the shadow store.
7182 unsigned ObjOffset = ArgOffset + BEAlign;
7183 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
7184 ObjOffset += 32;
7185 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7186 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7187 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
7188 DAG.getConstant(ObjOffset, DL, MVT::i64));
7190 } else {
7191 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
7192
7193 // Create load nodes to retrieve arguments from the stack.
7194 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
7195 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
7196 }
7197
7198 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
7200 MVT MemVT = VA.getValVT();
7201
7202 switch (VA.getLocInfo()) {
7203 default:
7204 break;
7205 case CCValAssign::Trunc:
7206 case CCValAssign::BCvt:
7207 MemVT = VA.getLocVT();
7208 break;
7211 Subtarget->isWindowsArm64EC()) &&
7212 "Indirect arguments should be scalable on most subtargets");
7213 MemVT = VA.getLocVT();
7214 break;
7215 case CCValAssign::SExt:
7216 ExtType = ISD::SEXTLOAD;
7217 break;
7218 case CCValAssign::ZExt:
7219 ExtType = ISD::ZEXTLOAD;
7220 break;
7221 case CCValAssign::AExt:
7222 ExtType = ISD::EXTLOAD;
7223 break;
7224 }
7225
7226 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
7227 MemVT);
7228 }
7229
7230 if (VA.getLocInfo() == CCValAssign::Indirect) {
7231 assert((VA.getValVT().isScalableVT() ||
7232 Subtarget->isWindowsArm64EC()) &&
7233 "Indirect arguments should be scalable on most subtargets");
7234
7235 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
7236 unsigned NumParts = 1;
7237 if (Ins[i].Flags.isInConsecutiveRegs()) {
7238 assert(!Ins[i].Flags.isInConsecutiveRegsLast());
7239 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7240 ++NumParts;
7241 }
7242
7243 MVT PartLoad = VA.getValVT();
7244 SDValue Ptr = ArgValue;
7245
7246 // Ensure we generate all loads for each tuple part, whilst updating the
7247 // pointer after each load correctly using vscale.
7248 while (NumParts > 0) {
7249 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
7250 InVals.push_back(ArgValue);
7251 NumParts--;
7252 if (NumParts > 0) {
7253 SDValue BytesIncrement;
7254 if (PartLoad.isScalableVector()) {
7255 BytesIncrement = DAG.getVScale(
7256 DL, Ptr.getValueType(),
7257 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
7258 } else {
7259 BytesIncrement = DAG.getConstant(
7260 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
7261 Ptr.getValueType());
7262 }
7264 Flags.setNoUnsignedWrap(true);
7265 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7266 BytesIncrement, Flags);
7267 ExtraArgLocs++;
7268 i++;
7269 }
7270 }
7271 } else {
7272 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
7273 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
7274 ArgValue, DAG.getValueType(MVT::i32));
7275
7276 // i1 arguments are zero-extended to i8 by the caller. Emit a
7277 // hint to reflect this.
7278 if (Ins[i].isOrigArg()) {
7279 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
7280 if (OrigArg->getType()->isIntegerTy(1)) {
7281 if (!Ins[i].Flags.isZExt()) {
7282 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
7283 ArgValue.getValueType(), ArgValue);
7284 }
7285 }
7286 }
7287
7288 InVals.push_back(ArgValue);
7289 }
7290 }
7291 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
7292
7293 // Insert the SMSTART if this is a locally streaming function and
7294 // make sure it is Glued to the last CopyFromReg value.
7295 if (IsLocallyStreaming) {
7296 SDValue PStateSM;
7297 if (Attrs.hasStreamingCompatibleInterface()) {
7298 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
7301 FuncInfo->setPStateSMReg(Reg);
7302 Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
7303 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7305 } else
7306 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7308
7309 // Ensure that the SMSTART happens after the CopyWithChain such that its
7310 // chain result is used.
7311 for (unsigned I=0; I<InVals.size(); ++I) {
7313 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
7314 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
7315 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
7316 InVals[I].getValueType());
7317 }
7318 }
7319
7320 // varargs
7321 if (isVarArg) {
7322 if (!Subtarget->isTargetDarwin() || IsWin64) {
7323 // The AAPCS variadic function ABI is identical to the non-variadic
7324 // one. As a result there may be more arguments in registers and we should
7325 // save them for future reference.
7326 // Win64 variadic functions also pass arguments in registers, but all float
7327 // arguments are passed in integer registers.
7328 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
7329 }
7330
7331 // This will point to the next argument passed via stack.
7332 unsigned VarArgsOffset = CCInfo.getStackSize();
7333 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
7334 VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
7335 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
7336 FuncInfo->setVarArgsStackIndex(
7337 MFI.CreateFixedObject(4, VarArgsOffset, true));
7338
7339 if (MFI.hasMustTailInVarArgFunc()) {
7340 SmallVector<MVT, 2> RegParmTypes;
7341 RegParmTypes.push_back(MVT::i64);
7342 RegParmTypes.push_back(MVT::f128);
7343 // Compute the set of forwarded registers. The rest are scratch.
7345 FuncInfo->getForwardedMustTailRegParms();
7346 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
7348
7349 // Conservatively forward X8, since it might be used for aggregate return.
7350 if (!CCInfo.isAllocated(AArch64::X8)) {
7351 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
7352 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
7353 }
7354 }
7355 }
7356
7357 // On Windows, InReg pointers must be returned, so record the pointer in a
7358 // virtual register at the start of the function so it can be returned in the
7359 // epilogue.
7360 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
7361 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
7362 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
7363 Ins[I].Flags.isInReg()) &&
7364 Ins[I].Flags.isSRet()) {
7365 assert(!FuncInfo->getSRetReturnReg());
7366
7367 MVT PtrTy = getPointerTy(DAG.getDataLayout());
7368 Register Reg =
7370 FuncInfo->setSRetReturnReg(Reg);
7371
7372 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
7373 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
7374 break;
7375 }
7376 }
7377 }
7378
7379 unsigned StackArgSize = CCInfo.getStackSize();
7380 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7381 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
7382 // This is a non-standard ABI so by fiat I say we're allowed to make full
7383 // use of the stack area to be popped, which must be aligned to 16 bytes in
7384 // any case:
7385 StackArgSize = alignTo(StackArgSize, 16);
7386
7387 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
7388 // a multiple of 16.
7389 FuncInfo->setArgumentStackToRestore(StackArgSize);
7390
7391 // This realignment carries over to the available bytes below. Our own
7392 // callers will guarantee the space is free by giving an aligned value to
7393 // CALLSEQ_START.
7394 }
7395 // Even if we're not expected to free up the space, it's useful to know how
7396 // much is there while considering tail calls (because we can reuse it).
7397 FuncInfo->setBytesInStackArgArea(StackArgSize);
7398
7399 if (Subtarget->hasCustomCallingConv())
7401
7402 // Conservatively assume the function requires the lazy-save mechanism.
7403 if (SMEAttrs(MF.getFunction()).hasZAState()) {
7404 unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
7405 FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
7406 }
7407
7408 return Chain;
7409}
7410
7411void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
7412 SelectionDAG &DAG,
7413 const SDLoc &DL,
7414 SDValue &Chain) const {
7416 MachineFrameInfo &MFI = MF.getFrameInfo();
7418 auto PtrVT = getPointerTy(DAG.getDataLayout());
7419 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
7420
7422
7424 unsigned NumGPRArgRegs = GPRArgRegs.size();
7425 if (Subtarget->isWindowsArm64EC()) {
7426 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
7427 // functions.
7428 NumGPRArgRegs = 4;
7429 }
7430 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
7431
7432 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
7433 int GPRIdx = 0;
7434 if (GPRSaveSize != 0) {
7435 if (IsWin64) {
7436 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
7437 if (GPRSaveSize & 15)
7438 // The extra size here, if triggered, will always be 8.
7439 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
7440 } else
7441 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
7442
7443 SDValue FIN;
7444 if (Subtarget->isWindowsArm64EC()) {
7445 // With the Arm64EC ABI, we reserve the save area as usual, but we
7446 // compute its address relative to x4. For a normal AArch64->AArch64
7447 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
7448 // different address.
7449 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7450 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7451 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
7452 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
7453 } else {
7454 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
7455 }
7456
7457 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
7458 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
7459 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7460 SDValue Store =
7461 DAG.getStore(Val.getValue(1), DL, Val, FIN,
7463 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
7464 : MachinePointerInfo::getStack(MF, i * 8));
7465 MemOps.push_back(Store);
7466 FIN =
7467 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
7468 }
7469 }
7470 FuncInfo->setVarArgsGPRIndex(GPRIdx);
7471 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
7472
7473 if (Subtarget->hasFPARMv8() && !IsWin64) {
7475 const unsigned NumFPRArgRegs = FPRArgRegs.size();
7476 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
7477
7478 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
7479 int FPRIdx = 0;
7480 if (FPRSaveSize != 0) {
7481 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
7482
7483 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
7484
7485 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
7486 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
7487 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
7488
7489 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
7490 MachinePointerInfo::getStack(MF, i * 16));
7491 MemOps.push_back(Store);
7492 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
7493 DAG.getConstant(16, DL, PtrVT));
7494 }
7495 }
7496 FuncInfo->setVarArgsFPRIndex(FPRIdx);
7497 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
7498 }
7499
7500 if (!MemOps.empty()) {
7501 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
7502 }
7503}
7504
7505/// LowerCallResult - Lower the result values of a call into the
7506/// appropriate copies out of appropriate physical registers.
7507SDValue AArch64TargetLowering::LowerCallResult(
7508 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
7509 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
7510 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
7511 SDValue ThisVal, bool RequiresSMChange) const {
7512 DenseMap<unsigned, SDValue> CopiedRegs;
7513 // Copy all of the result registers out of their specified physreg.
7514 for (unsigned i = 0; i != RVLocs.size(); ++i) {
7515 CCValAssign VA = RVLocs[i];
7516
7517 // Pass 'this' value directly from the argument to return value, to avoid
7518 // reg unit interference
7519 if (i == 0 && isThisReturn) {
7520 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
7521 "unexpected return calling convention register assignment");
7522 InVals.push_back(ThisVal);
7523 continue;
7524 }
7525
7526 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
7527 // allows one use of a physreg per block.
7528 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
7529 if (!Val) {
7530 Val =
7531 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
7532 Chain = Val.getValue(1);
7533 InGlue = Val.getValue(2);
7534 CopiedRegs[VA.getLocReg()] = Val;
7535 }
7536
7537 switch (VA.getLocInfo()) {
7538 default:
7539 llvm_unreachable("Unknown loc info!");
7540 case CCValAssign::Full:
7541 break;
7542 case CCValAssign::BCvt:
7543 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
7544 break;
7546 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
7547 DAG.getConstant(32, DL, VA.getLocVT()));
7548 [[fallthrough]];
7549 case CCValAssign::AExt:
7550 [[fallthrough]];
7551 case CCValAssign::ZExt:
7552 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
7553 break;
7554 }
7555
7556 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
7558 Val);
7559
7560 InVals.push_back(Val);
7561 }
7562
7563 return Chain;
7564}
7565
7566/// Return true if the calling convention is one that we can guarantee TCO for.
7567static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
7568 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
7570}
7571
7572/// Return true if we might ever do TCO for calls with this calling convention.
7574 switch (CC) {
7575 case CallingConv::C:
7579 case CallingConv::Swift:
7581 case CallingConv::Tail:
7582 case CallingConv::Fast:
7583 return true;
7584 default:
7585 return false;
7586 }
7587}
7588
7590 const AArch64Subtarget *Subtarget,
7592 CCState &CCInfo) {
7593 const SelectionDAG &DAG = CLI.DAG;
7594 CallingConv::ID CalleeCC = CLI.CallConv;
7595 bool IsVarArg = CLI.IsVarArg;
7596 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7597 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
7598
7599 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
7600 // for the shadow store.
7601 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
7602 CCInfo.AllocateStack(32, Align(16));
7603
7604 unsigned NumArgs = Outs.size();
7605 for (unsigned i = 0; i != NumArgs; ++i) {
7606 MVT ArgVT = Outs[i].VT;
7607 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
7608
7609 bool UseVarArgCC = false;
7610 if (IsVarArg) {
7611 // On Windows, the fixed arguments in a vararg call are passed in GPRs
7612 // too, so use the vararg CC to force them to integer registers.
7613 if (IsCalleeWin64) {
7614 UseVarArgCC = true;
7615 } else {
7616 UseVarArgCC = !Outs[i].IsFixed;
7617 }
7618 }
7619
7620 if (!UseVarArgCC) {
7621 // Get type of the original argument.
7622 EVT ActualVT =
7623 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
7624 /*AllowUnknown*/ true);
7625 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
7626 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7627 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7628 ArgVT = MVT::i8;
7629 else if (ActualMVT == MVT::i16)
7630 ArgVT = MVT::i16;
7631 }
7632
7633 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
7634 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
7635 assert(!Res && "Call operand has unhandled type");
7636 (void)Res;
7637 }
7638}
7639
7640bool AArch64TargetLowering::isEligibleForTailCallOptimization(
7641 const CallLoweringInfo &CLI) const {
7642 CallingConv::ID CalleeCC = CLI.CallConv;
7643 if (!mayTailCallThisCC(CalleeCC))
7644 return false;
7645
7646 SDValue Callee = CLI.Callee;
7647 bool IsVarArg = CLI.IsVarArg;
7648 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7649 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7650 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7651 const SelectionDAG &DAG = CLI.DAG;
7652 MachineFunction &MF = DAG.getMachineFunction();
7653 const Function &CallerF = MF.getFunction();
7654 CallingConv::ID CallerCC = CallerF.getCallingConv();
7655
7656 // SME Streaming functions are not eligible for TCO as they may require
7657 // the streaming mode or ZA to be restored after returning from the call.
7658 SMEAttrs CallerAttrs(MF.getFunction());
7659 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
7660 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
7661 CallerAttrs.requiresLazySave(CalleeAttrs) ||
7662 CallerAttrs.hasStreamingBody())
7663 return false;
7664
7665 // Functions using the C or Fast calling convention that have an SVE signature
7666 // preserve more registers and should assume the SVE_VectorCall CC.
7667 // The check for matching callee-saved regs will determine whether it is
7668 // eligible for TCO.
7669 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
7670 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
7671 CallerCC = CallingConv::AArch64_SVE_VectorCall;
7672
7673 bool CCMatch = CallerCC == CalleeCC;
7674
7675 // When using the Windows calling convention on a non-windows OS, we want
7676 // to back up and restore X18 in such functions; we can't do a tail call
7677 // from those functions.
7678 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
7679 CalleeCC != CallingConv::Win64)
7680 return false;
7681
7682 // Byval parameters hand the function a pointer directly into the stack area
7683 // we want to reuse during a tail call. Working around this *is* possible (see
7684 // X86) but less efficient and uglier in LowerCall.
7685 for (Function::const_arg_iterator i = CallerF.arg_begin(),
7686 e = CallerF.arg_end();
7687 i != e; ++i) {
7688 if (i->hasByValAttr())
7689 return false;
7690
7691 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
7692 // In this case, it is necessary to save/restore X0 in the callee. Tail
7693 // call opt interferes with this. So we disable tail call opt when the
7694 // caller has an argument with "inreg" attribute.
7695
7696 // FIXME: Check whether the callee also has an "inreg" argument.
7697 if (i->hasInRegAttr())
7698 return false;
7699 }
7700
7701 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
7702 return CCMatch;
7703
7704 // Externally-defined functions with weak linkage should not be
7705 // tail-called on AArch64 when the OS does not support dynamic
7706 // pre-emption of symbols, as the AAELF spec requires normal calls
7707 // to undefined weak functions to be replaced with a NOP or jump to the
7708 // next instruction. The behaviour of branch instructions in this
7709 // situation (as used for tail calls) is implementation-defined, so we
7710 // cannot rely on the linker replacing the tail call with a return.
7711 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
7712 const GlobalValue *GV = G->getGlobal();
7713 const Triple &TT = getTargetMachine().getTargetTriple();
7714 if (GV->hasExternalWeakLinkage() &&
7715 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
7716 return false;
7717 }
7718
7719 // Now we search for cases where we can use a tail call without changing the
7720 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
7721 // concept.
7722
7723 // I want anyone implementing a new calling convention to think long and hard
7724 // about this assert.
7725 assert((!IsVarArg || CalleeCC == CallingConv::C) &&
7726 "Unexpected variadic calling convention");
7727
7728 LLVMContext &C = *DAG.getContext();
7729 // Check that the call results are passed in the same way.
7730 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
7731 CCAssignFnForCall(CalleeCC, IsVarArg),
7732 CCAssignFnForCall(CallerCC, IsVarArg)))
7733 return false;
7734 // The callee has to preserve all registers the caller needs to preserve.
7735 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7736 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
7737 if (!CCMatch) {
7738 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
7739 if (Subtarget->hasCustomCallingConv()) {
7740 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
7741 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
7742 }
7743 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
7744 return false;
7745 }
7746
7747 // Nothing more to check if the callee is taking no arguments
7748 if (Outs.empty())
7749 return true;
7750
7751 SmallVector<CCValAssign, 16> ArgLocs;
7752 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
7753
7754 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7755
7756 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
7757 // For musttail calls, additional checks have already been done, so this
7757 // check can be safely skipped.
7758 // At least two cases here: if caller is fastcc then we can't have any
7759 // memory arguments (we'd be expected to clean up the stack afterwards). If
7760 // caller is C then we could potentially use its argument area.
7761
7762 // FIXME: for now we take the most conservative of these in both cases:
7763 // disallow all variadic memory operands.
7764 for (const CCValAssign &ArgLoc : ArgLocs)
7765 if (!ArgLoc.isRegLoc())
7766 return false;
7767 }
7768
7769 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7770
7771 // If any of the arguments is passed indirectly, it must be SVE, so the
7772 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
7773 // allocate space on the stack. That is why we determine explicitly here
7774 // that the call cannot be a tail call.
7775 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
7776 assert((A.getLocInfo() != CCValAssign::Indirect ||
7777 A.getValVT().isScalableVector() ||
7778 Subtarget->isWindowsArm64EC()) &&
7779 "Expected value to be scalable");
7780 return A.getLocInfo() == CCValAssign::Indirect;
7781 }))
7782 return false;
7783
7784 // If the stack arguments for this call do not fit into our own save area then
7785 // the call cannot be made tail.
7786 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
7787 return false;
7788
7789 const MachineRegisterInfo &MRI = MF.getRegInfo();
7790 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
7791 return false;
7792
7793 return true;
7794}
7795
7796SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
7797 SelectionDAG &DAG,
7798 MachineFrameInfo &MFI,
7799 int ClobberedFI) const {
7800 SmallVector<SDValue, 8> ArgChains;
7801 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
7802 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
7803
7804 // Include the original chain at the beginning of the list. When this is
7805 // used by target LowerCall hooks, this helps legalize find the
7806 // CALLSEQ_BEGIN node.
7807 ArgChains.push_back(Chain);
7808
7809 // Add a chain value for each stack argument corresponding
7810 for (SDNode *U : DAG.getEntryNode().getNode()->uses())
7811 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
7812 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
7813 if (FI->getIndex() < 0) {
7814 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
7815 int64_t InLastByte = InFirstByte;
7816 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
7817
7818 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
7819 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
7820 ArgChains.push_back(SDValue(L, 1));
7821 }
7822
7823 // Build a tokenfactor for all the chains.
7824 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
7825}
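// The overlap test above treats each stack object as a closed byte interval
// and keeps a load's chain exactly when the two intervals intersect. A
// minimal standalone sketch of that predicate (illustrative only; this helper
// is hypothetical and not used elsewhere in this file):
static bool stackBytesOverlap(int64_t FirstA, int64_t LastA, int64_t FirstB,
                              int64_t LastB) {
  // Closed intervals [FirstA, LastA] and [FirstB, LastB] intersect iff
  // neither one ends before the other begins.
  return FirstA <= LastB && FirstB <= LastA;
}
// For example, a clobbered slot covering bytes [16, 31] overlaps an incoming
// argument load covering bytes [24, 39], so that load joins the TokenFactor.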
7826
7827bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
7828 bool TailCallOpt) const {
7829 return (CallCC == CallingConv::Fast && TailCallOpt) ||
7830 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
7831}
7832
7833// Check if the value is zero-extended from i1 to i8
7834static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
7835 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
7836 if (SizeInBits < 8)
7837 return false;
7838
7839 APInt RequiredZero(SizeInBits, 0xFE);
7840 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
7841 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
7842 return ZExtBool;
7843}
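// The known-bits query above only asks that bits [7:1] of the argument be
// provably zero (the 0xFE mask), i.e. that the low byte already holds a
// zero-extended i1. A minimal standalone sketch of that mask test, assuming
// the known-zero bits were computed elsewhere (illustrative only):
static bool looksLikeZExtBool(uint64_t KnownZeroBits) {
  const uint64_t RequiredZero = 0xFE; // bits 1..7 must be known zero
  return (KnownZeroBits & RequiredZero) == RequiredZero;
}
// E.g. a value produced by 'zext i1' or by 'and x, 1' has bits [7:1] known
// zero, so LowerCall can skip the explicit trunc+zext for an i1 argument.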
7844
7845void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
7846 SDNode *Node) const {
7847 // Live-in physreg copies that are glued to SMSTART are applied as
7848 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
7849 // register allocator to pass call args in callee saved regs, without extra
7850 // copies to avoid these fake clobbers of actually-preserved GPRs.
7851 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
7852 MI.getOpcode() == AArch64::MSRpstatePseudo) {
7853 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
7854 if (MachineOperand &MO = MI.getOperand(I);
7855 MO.isReg() && MO.isImplicit() && MO.isDef() &&
7856 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
7857 AArch64::GPR64RegClass.contains(MO.getReg())))
7858 MI.removeOperand(I);
7859
7860 // The SVE vector length can change when entering/leaving streaming mode.
7861 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
7862 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
7863 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7864 /*IsImplicit=*/true));
7865 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
7866 /*IsImplicit=*/true));
7867 }
7868 }
7869
7870 // Add an implicit use of 'VG' for ADDXri/SUBXri. These instructions would
7871 // have nothing to do with VG, were it not that they are used to materialise a
7872 // frame address. If they contain a frame-index to a scalable vector, this
7873 // will likely require an ADDVL instruction to materialise the address, thus
7874 // reading VG.
7875 const MachineFunction &MF = *MI.getMF();
7876 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
7877 (MI.getOpcode() == AArch64::ADDXri ||
7878 MI.getOpcode() == AArch64::SUBXri)) {
7879 const MachineOperand &MO = MI.getOperand(1);
7880 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
7881 TargetStackID::ScalableVector)
7882 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7883 /*IsImplicit=*/true));
7884 }
7885}
7886
7887SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
7888 bool Enable, SDValue Chain,
7889 SDValue InGlue,
7890 unsigned Condition,
7891 SDValue PStateSM) const {
7892 MachineFunction &MF = DAG.getMachineFunction();
7893 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7894 FuncInfo->setHasStreamingModeChanges(true);
7895
7896 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7897 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
7898 SDValue MSROp =
7899 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
7900 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
7901 SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
7902 if (Condition != AArch64SME::Always) {
7903 assert(PStateSM && "PStateSM should be defined");
7904 Ops.push_back(PStateSM);
7905 }
7906 Ops.push_back(RegMask);
7907
7908 if (InGlue)
7909 Ops.push_back(InGlue);
7910
7911 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
7912 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
7913}
7914
7915static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
7916 const SMEAttrs &CalleeAttrs) {
7917 if (!CallerAttrs.hasStreamingCompatibleInterface() ||
7918 CallerAttrs.hasStreamingBody())
7919 return AArch64SME::Always;
7920 if (CalleeAttrs.hasNonStreamingInterface())
7921 return AArch64SME::IfCallerIsStreaming;
7922 if (CalleeAttrs.hasStreamingInterface())
7923 return AArch64SME::IfCallerIsNonStreaming;
7924
7925 llvm_unreachable("Unsupported attributes");
7926}
7927
7928/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
7929/// and add input and output parameter nodes.
7930SDValue
7931AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
7932 SmallVectorImpl<SDValue> &InVals) const {
7933 SelectionDAG &DAG = CLI.DAG;
7934 SDLoc &DL = CLI.DL;
7935 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7936 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7937 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7938 SDValue Chain = CLI.Chain;
7939 SDValue Callee = CLI.Callee;
7940 bool &IsTailCall = CLI.IsTailCall;
7941 CallingConv::ID &CallConv = CLI.CallConv;
7942 bool IsVarArg = CLI.IsVarArg;
7943
7944 MachineFunction &MF = DAG.getMachineFunction();
7945 MachineFunction::CallSiteInfo CSInfo;
7946 bool IsThisReturn = false;
7947
7948 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7949 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7950 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
7951 bool IsSibCall = false;
7952 bool GuardWithBTI = false;
7953
7954 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
7955 !Subtarget->noBTIAtReturnTwice()) {
7956 GuardWithBTI = FuncInfo->branchTargetEnforcement();
7957 }
7958
7959 // Analyze operands of the call, assigning locations to each operand.
7960 SmallVector<CCValAssign, 16> ArgLocs;
7961 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
7962
7963 if (IsVarArg) {
7964 unsigned NumArgs = Outs.size();
7965
7966 for (unsigned i = 0; i != NumArgs; ++i) {
7967 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
7968 report_fatal_error("Passing SVE types to variadic functions is "
7969 "currently not supported");
7970 }
7971 }
7972
7973 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7974
7975 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
7976 // Assign locations to each value returned by this call.
7977 SmallVector<CCValAssign, 16> RVLocs;
7978 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
7979 *DAG.getContext());
7980 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
7981
7982 // Check callee args/returns for SVE registers and set calling convention
7983 // accordingly.
7984 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
7985 auto HasSVERegLoc = [](CCValAssign &Loc) {
7986 if (!Loc.isRegLoc())
7987 return false;
7988 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
7989 AArch64::PPRRegClass.contains(Loc.getLocReg());
7990 };
7991 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
7992 CallConv = CallingConv::AArch64_SVE_VectorCall;
7993 }
7994
7995 if (IsTailCall) {
7996 // Check if it's really possible to do a tail call.
7997 IsTailCall = isEligibleForTailCallOptimization(CLI);
7998
7999 // A sibling call is one where we're under the usual C ABI and not planning
8000 // to change that but can still do a tail call:
8001 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
8002 CallConv != CallingConv::SwiftTail)
8003 IsSibCall = true;
8004
8005 if (IsTailCall)
8006 ++NumTailCalls;
8007 }
8008
8009 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
8010 report_fatal_error("failed to perform tail call elimination on a call "
8011 "site marked musttail");
8012
8013 // Get a count of how many bytes are to be pushed on the stack.
8014 unsigned NumBytes = CCInfo.getStackSize();
8015
8016 if (IsSibCall) {
8017 // Since we're not changing the ABI to make this a tail call, the memory
8018 // operands are already available in the caller's incoming argument space.
8019 NumBytes = 0;
8020 }
8021
8022 // FPDiff is the byte offset of the call's argument area from the callee's.
8023 // Stores to callee stack arguments will be placed in FixedStackSlots offset
8024 // by this amount for a tail call. In a sibling call it must be 0 because the
8025 // caller will deallocate the entire stack and the callee still expects its
8026 // arguments to begin at SP+0. Completely unused for non-tail calls.
8027 int FPDiff = 0;
8028
8029 if (IsTailCall && !IsSibCall) {
8030 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
8031
8032 // Since callee will pop argument stack as a tail call, we must keep the
8033 // popped size 16-byte aligned.
8034 NumBytes = alignTo(NumBytes, 16);
8035
8036 // FPDiff will be negative if this tail call requires more space than we
8037 // would automatically have in our incoming argument space. Positive if we
8038 // can actually shrink the stack.
8039 FPDiff = NumReusableBytes - NumBytes;
8040
8041 // Update the required reserved area if this is the tail call requiring the
8042 // most argument stack space.
8043 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
8044 FuncInfo->setTailCallReservedStack(-FPDiff);
8045
8046 // The stack pointer must be 16-byte aligned at all times it's used for a
8047 // memory operation, which in practice means at *all* times and in
8048 // particular across call boundaries. Therefore our own arguments started at
8049 // a 16-byte aligned SP and the delta applied for the tail call should
8050 // satisfy the same constraint.
8051 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
8052 }
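// A standalone sketch of the FPDiff arithmetic above (illustrative only; the
// helper is hypothetical). With 32 bytes of reusable incoming argument space
// and a tail call needing 44 bytes, NumBytes is aligned up to 48 and FPDiff
// becomes 32 - 48 = -16, so 16 extra bytes must be reserved for tail calls.
static int computeFPDiffSketch(unsigned NumReusableBytes, unsigned NumBytes) {
  NumBytes = alignTo(NumBytes, 16); // the callee pops a 16-byte aligned amount
  return static_cast<int>(NumReusableBytes) - static_cast<int>(NumBytes);
}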
8053
8054 // Determine whether we need any streaming mode changes.
8055 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
8056 if (CLI.CB)
8057 CalleeAttrs = SMEAttrs(*CLI.CB);
8058 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
8059 CalleeAttrs = SMEAttrs(ES->getSymbol());
8060
8061 auto DescribeCallsite =
8062 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
8063 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
8064 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
8065 R << ore::NV("Callee", ES->getSymbol());
8066 else if (CLI.CB && CLI.CB->getCalledFunction())
8067 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
8068 else
8069 R << "unknown callee";
8070 R << "'";
8071 return R;
8072 };
8073
8074 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
8075 if (RequiresLazySave) {
8076 unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
8077 MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
8078 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
8079 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8080 SDValue NumZaSaveSlicesAddr =
8081 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
8082 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
8083 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8084 DAG.getConstant(1, DL, MVT::i32));
8085 Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
8086 MPI, MVT::i16);
8087 Chain = DAG.getNode(
8088 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8089 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8090 TPIDR2ObjAddr);
8091 OptimizationRemarkEmitter ORE(&MF.getFunction());
8092 ORE.emit([&]() {
8093 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8094 CLI.CB)
8095 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8096 &MF.getFunction());
8097 return DescribeCallsite(R) << " sets up a lazy save for ZA";
8098 });
8099 }
8100
8101 SDValue PStateSM;
8102 bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
8103 if (RequiresSMChange) {
8104 if (CallerAttrs.hasStreamingInterfaceOrBody())
8105 PStateSM = DAG.getConstant(1, DL, MVT::i64);
8106 else if (CallerAttrs.hasNonStreamingInterface())
8107 PStateSM = DAG.getConstant(0, DL, MVT::i64);
8108 else
8109 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
8110 OptimizationRemarkEmitter ORE(&MF.getFunction());
8111 ORE.emit([&]() {
8112 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
8113 CLI.CB)
8114 : OptimizationRemarkAnalysis("sme", "SMETransition",
8115 &MF.getFunction());
8116 DescribeCallsite(R) << " requires a streaming mode transition";
8117 return R;
8118 });
8119 }
8120
8121 SDValue ZTFrameIdx;
8122 MachineFrameInfo &MFI = MF.getFrameInfo();
8123 bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs);
8124
8125 // If the caller has ZT0 state which will not be preserved by the callee,
8126 // spill ZT0 before the call.
8127 if (ShouldPreserveZT0) {
8128 unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
8129 ZTFrameIdx = DAG.getFrameIndex(
8130 ZTObj,
8131 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8132
8133 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
8134 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8135 }
8136
8137 // If the caller shares ZT0 but the callee does not share ZA, we need to stop
8138 // PSTATE.ZA before the call if there is no lazy-save active.
8139 bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs);
8140 assert((!DisableZA || !RequiresLazySave) &&
8141 "Lazy-save should have PSTATE.SM=1 on entry to the function");
8142
8143 if (DisableZA)
8144 Chain = DAG.getNode(
8145 AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
8146 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8147 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8148
8149 // Adjust the stack pointer for the new arguments...
8150 // These operations are automatically eliminated by the prolog/epilog pass
8151 if (!IsSibCall)
8152 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
8153
8154 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
8155 getPointerTy(DAG.getDataLayout()));
8156
8157 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
8158 SmallSet<unsigned, 8> RegsUsed;
8159 SmallVector<SDValue, 8> MemOpChains;
8160 auto PtrVT = getPointerTy(DAG.getDataLayout());
8161
8162 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
8163 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
8164 for (const auto &F : Forwards) {
8165 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
8166 RegsToPass.emplace_back(F.PReg, Val);
8167 }
8168 }
8169
8170 // Walk the register/memloc assignments, inserting copies/loads.
8171 unsigned ExtraArgLocs = 0;
8172 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
8173 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8174 SDValue Arg = OutVals[i];
8175 ISD::ArgFlagsTy Flags = Outs[i].Flags;
8176
8177 // Promote the value if needed.
8178 switch (VA.getLocInfo()) {
8179 default:
8180 llvm_unreachable("Unknown loc info!");
8181 case CCValAssign::Full:
8182 break;
8183 case CCValAssign::SExt:
8184 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
8185 break;
8186 case CCValAssign::ZExt:
8187 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8188 break;
8189 case CCValAssign::AExt:
8190 if (Outs[i].ArgVT == MVT::i1) {
8191 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
8192 //
8193 // Check if we actually have to do this, because the value may
8194 // already be zero-extended.
8195 //
8196 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
8197 // and rely on DAGCombiner to fold this, because the following
8198 // (anyext i32) is combined with (zext i8) in DAG.getNode:
8199 //
8200 // (ext (zext x)) -> (zext x)
8201 //
8202 // This will give us (zext i32), which we cannot remove, so
8203 // try to check this beforehand.
8204 if (!checkZExtBool(Arg, DAG)) {
8205 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8206 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
8207 }
8208 }
8209 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8210 break;
8211 case CCValAssign::AExtUpper:
8212 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8213 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8214 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8215 DAG.getConstant(32, DL, VA.getLocVT()));
8216 break;
8217 case CCValAssign::BCvt:
8218 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
8219 break;
8220 case CCValAssign::Trunc:
8221 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8222 break;
8223 case CCValAssign::FPExt:
8224 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
8225 break;
8226 case CCValAssign::Indirect:
8227 bool isScalable = VA.getValVT().isScalableVT();
8228 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
8229 "Indirect arguments should be scalable on most subtargets");
8230
8231 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
8232 uint64_t PartSize = StoreSize;
8233 unsigned NumParts = 1;
8234 if (Outs[i].Flags.isInConsecutiveRegs()) {
8235 assert(!Outs[i].Flags.isInConsecutiveRegsLast());
8236 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8237 ++NumParts;
8238 StoreSize *= NumParts;
8239 }
8240
8241 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
8242 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
8243 MachineFrameInfo &MFI = MF.getFrameInfo();
8244 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
8245 if (isScalable)
8246 MFI.setStackID(FI, TargetStackID::ScalableVector);
8247
8248 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
8249 SDValue Ptr = DAG.getFrameIndex(
8250 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8251 SDValue SpillSlot = Ptr;
8252
8253 // Ensure we generate all stores for each tuple part, whilst updating the
8254 // pointer after each store correctly using vscale.
8255 while (NumParts) {
8256 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
8257 MemOpChains.push_back(Store);
8258
8259 NumParts--;
8260 if (NumParts > 0) {
8261 SDValue BytesIncrement;
8262 if (isScalable) {
8263 BytesIncrement = DAG.getVScale(
8264 DL, Ptr.getValueType(),
8265 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8266 } else {
8267 BytesIncrement = DAG.getConstant(
8268 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8269 Ptr.getValueType());
8270 }
8271 SDNodeFlags Flags;
8272 Flags.setNoUnsignedWrap(true);
8273
8274 MPI = MachinePointerInfo(MPI.getAddrSpace());
8275 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8276 BytesIncrement, Flags);
8277 ExtraArgLocs++;
8278 i++;
8279 }
8280 }
8281
8282 Arg = SpillSlot;
8283 break;
8284 }
8285
8286 if (VA.isRegLoc()) {
8287 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
8288 Outs[0].VT == MVT::i64) {
8289 assert(VA.getLocVT() == MVT::i64 &&
8290 "unexpected calling convention register assignment");
8291 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
8292 "unexpected use of 'returned'");
8293 IsThisReturn = true;
8294 }
8295 if (RegsUsed.count(VA.getLocReg())) {
8296 // If this register has already been used then we're trying to pack
8297 // parts of an [N x i32] into an X-register. The extension type will
8298 // take care of putting the two halves in the right place but we have to
8299 // combine them.
8300 SDValue &Bits =
8301 llvm::find_if(RegsToPass,
8302 [=](const std::pair<unsigned, SDValue> &Elt) {
8303 return Elt.first == VA.getLocReg();
8304 })
8305 ->second;
8306 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8307 // Call site info is used for function's parameter entry value
8308 // tracking. For now we track only simple cases when parameter
8309 // is transferred through whole register.
8310 llvm::erase_if(CSInfo.ArgRegPairs,
8311 [&VA](MachineFunction::ArgRegPair ArgReg) {
8312 return ArgReg.Reg == VA.getLocReg();
8313 });
8314 } else {
8315 // Add an extra level of indirection for streaming mode changes by
8316 // using a pseudo copy node that cannot be rematerialised between a
8317 // smstart/smstop and the call by the simple register coalescer.
8318 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
8319 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8320 Arg.getValueType(), Arg);
8321 RegsToPass.emplace_back(VA.getLocReg(), Arg);
8322 RegsUsed.insert(VA.getLocReg());
8323 const TargetOptions &Options = DAG.getTarget().Options;
8324 if (Options.EmitCallSiteInfo)
8325 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
8326 }
8327 } else {
8328 assert(VA.isMemLoc());
8329
8330 SDValue DstAddr;
8331 MachinePointerInfo DstInfo;
8332
8333 // FIXME: This works on big-endian for composite byvals, which are the
8334 // common case. It should also work for fundamental types too.
8335 uint32_t BEAlign = 0;
8336 unsigned OpSize;
8337 if (VA.getLocInfo() == CCValAssign::Indirect ||
8338 VA.getLocInfo() == CCValAssign::Trunc)
8339 OpSize = VA.getLocVT().getFixedSizeInBits();
8340 else
8341 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
8342 : VA.getValVT().getSizeInBits();
8343 OpSize = (OpSize + 7) / 8;
8344 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
8345 !Flags.isInConsecutiveRegs()) {
8346 if (OpSize < 8)
8347 BEAlign = 8 - OpSize;
8348 }
8349 unsigned LocMemOffset = VA.getLocMemOffset();
8350 int32_t Offset = LocMemOffset + BEAlign;
8351 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8352 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8353
8354 if (IsTailCall) {
8355 Offset = Offset + FPDiff;
8356 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
8357
8358 DstAddr = DAG.getFrameIndex(FI, PtrVT);
8359 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
8360
8361 // Make sure any stack arguments overlapping with where we're storing
8362 // are loaded before this eventual operation. Otherwise they'll be
8363 // clobbered.
8364 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
8365 } else {
8366 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8367
8368 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8369 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
8370 }
8371
8372 if (Outs[i].Flags.isByVal()) {
8373 SDValue SizeNode =
8374 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
8375 SDValue Cpy = DAG.getMemcpy(
8376 Chain, DL, DstAddr, Arg, SizeNode,
8377 Outs[i].Flags.getNonZeroByValAlign(),
8378 /*isVol = */ false, /*AlwaysInline = */ false,
8379 /*isTailCall = */ false, DstInfo, MachinePointerInfo());
8380
8381 MemOpChains.push_back(Cpy);
8382 } else {
8383 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
8384 // promoted to a legal register type i32, we should truncate Arg back to
8385 // i1/i8/i16.
8386 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
8387 VA.getValVT() == MVT::i16)
8388 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
8389
8390 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
8391 MemOpChains.push_back(Store);
8392 }
8393 }
8394 }
8395
8396 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
8397 SDValue ParamPtr = StackPtr;
8398 if (IsTailCall) {
8399 // Create a dummy object at the top of the stack that can be used to get
8400 // the SP after the epilogue
8401 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
8402 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
8403 }
8404
8405 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
8406 // describing the argument list. x4 contains the address of the
8407 // first stack parameter. x5 contains the size in bytes of all parameters
8408 // passed on the stack.
8409 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
8410 RegsToPass.emplace_back(AArch64::X5,
8411 DAG.getConstant(NumBytes, DL, MVT::i64));
8412 }
8413
8414 if (!MemOpChains.empty())
8415 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
8416
8417 SDValue InGlue;
8418 if (RequiresSMChange) {
8419 SDValue NewChain = changeStreamingMode(
8420 DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
8421 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8422 Chain = NewChain.getValue(0);
8423 InGlue = NewChain.getValue(1);
8424 }
8425
8426 // Build a sequence of copy-to-reg nodes chained together with token chain
8427 // and flag operands which copy the outgoing args into the appropriate regs.
8428 for (auto &RegToPass : RegsToPass) {
8429 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
8430 RegToPass.second, InGlue);
8431 InGlue = Chain.getValue(1);
8432 }
8433
8434 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
8435 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
8436 // node so that legalize doesn't hack it.
8437 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8438 auto GV = G->getGlobal();
8439 unsigned OpFlags =
8440 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
8441 if (OpFlags & AArch64II::MO_GOT) {
8442 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8443 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8444 } else {
8445 const GlobalValue *GV = G->getGlobal();
8446 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8447 }
8448 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
8449 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
8450 Subtarget->isTargetMachO()) ||
8451 MF.getFunction().getParent()->getRtLibUseGOT();
8452 const char *Sym = S->getSymbol();
8453 if (UseGot) {
8454 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
8455 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8456 } else {
8457 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
8458 }
8459 }
8460
8461 // We don't usually want to end the call-sequence here because we would tidy
8462 // the frame up *after* the call. However, in the ABI-changing tail-call case
8463 // we've carefully laid out the parameters so that when sp is reset they'll be
8464 // in the correct location.
8465 if (IsTailCall && !IsSibCall) {
8466 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
8467 InGlue = Chain.getValue(1);
8468 }
8469
8470 std::vector<SDValue> Ops;
8471 Ops.push_back(Chain);
8472 Ops.push_back(Callee);
8473
8474 if (IsTailCall) {
8475 // Each tail call may have to adjust the stack by a different amount, so
8476 // this information must travel along with the operation for eventual
8477 // consumption by emitEpilogue.
8478 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
8479 }
8480
8481 // Add argument registers to the end of the list so that they are known live
8482 // into the call.
8483 for (auto &RegToPass : RegsToPass)
8484 Ops.push_back(DAG.getRegister(RegToPass.first,
8485 RegToPass.second.getValueType()));
8486
8487 // Add a register mask operand representing the call-preserved registers.
8488 const uint32_t *Mask;
8489 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8490 if (IsThisReturn) {
8491 // For 'this' returns, use the X0-preserving mask if applicable
8492 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
8493 if (!Mask) {
8494 IsThisReturn = false;
8495 Mask = TRI->getCallPreservedMask(MF, CallConv);
8496 }
8497 } else
8498 Mask = TRI->getCallPreservedMask(MF, CallConv);
8499
8500 if (Subtarget->hasCustomCallingConv())
8501 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
8502
8503 if (TRI->isAnyArgRegReserved(MF))
8504 TRI->emitReservedArgRegCallError(MF);
8505
8506 assert(Mask && "Missing call preserved mask for calling convention");
8507 Ops.push_back(DAG.getRegisterMask(Mask));
8508
8509 if (InGlue.getNode())
8510 Ops.push_back(InGlue);
8511
8512 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8513
8514 // If we're doing a tail call, use a TC_RETURN here rather than an
8515 // actual call instruction.
8516 if (IsTailCall) {
8517 MF.getFrameInfo().setHasTailCall();
8518 SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
8519
8520 if (IsCFICall)
8521 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8522
8523 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
8524 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
8525 return Ret;
8526 }
8527
8528 unsigned CallOpc = AArch64ISD::CALL;
8529 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
8530 // be expanded to the call, directly followed by a special marker sequence and
8531 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
8532 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
8533 assert(!IsTailCall &&
8534 "tail calls cannot be marked with clang.arc.attachedcall");
8535 CallOpc = AArch64ISD::CALL_RVMARKER;
8536
8537 // Add a target global address for the retainRV/claimRV runtime function
8538 // just before the call target.
8539 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
8540 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
8541 Ops.insert(Ops.begin() + 1, GA);
8542 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8543 CallOpc = AArch64ISD::CALL_ARM64EC_TO_X64;
8544 } else if (GuardWithBTI) {
8545 CallOpc = AArch64ISD::CALL_BTI;
8546 }
8547
8548 // Returns a chain and a flag for retval copy to use.
8549 Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
8550
8551 if (IsCFICall)
8552 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8553
8554 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
8555 InGlue = Chain.getValue(1);
8556 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
8557
8558 uint64_t CalleePopBytes =
8559 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
8560
8561 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
8562 InGlue = Chain.getValue(1);
8563
8564 // Handle result values, copying them out of physregs into vregs that we
8565 // return.
8566 SDValue Result = LowerCallResult(
8567 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
8568 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
8569
8570 if (!Ins.empty())
8571 InGlue = Result.getValue(Result->getNumValues() - 1);
8572
8573 if (RequiresSMChange) {
8574 assert(PStateSM && "Expected a PStateSM to be set");
8575 Result = changeStreamingMode(
8576 DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
8577 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8578 }
8579
8580 if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
8581 // Unconditionally resume ZA.
8582 Result = DAG.getNode(
8583 AArch64ISD::SMSTART, DL, MVT::Other, Result,
8584 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8585 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8586
8587 if (ShouldPreserveZT0)
8588 Result =
8589 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8590 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8591
8592 if (RequiresLazySave) {
8593 // Conditionally restore the lazy save using a pseudo node.
8594 unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
8595 SDValue RegMask = DAG.getRegisterMask(
8596 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
8597 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8598 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
8599 SDValue TPIDR2_EL0 = DAG.getNode(
8600 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
8601 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8602
8603 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8604 // RESTORE_ZA pseudo.
8605 SDValue Glue;
8606 SDValue TPIDR2Block = DAG.getFrameIndex(
8607 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8608 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
8609 Result =
8610 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8611 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8612 RestoreRoutine, RegMask, Result.getValue(1)});
8613
8614 // Finally reset the TPIDR2_EL0 register to 0.
8615 Result = DAG.getNode(
8616 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
8617 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8618 DAG.getConstant(0, DL, MVT::i64));
8619 }
8620
8621 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
8622 for (unsigned I = 0; I < InVals.size(); ++I) {
8623 // The smstart/smstop is chained as part of the call, but when the
8624 // resulting chain is discarded (which happens when the call is not part
8625 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
8626 // smstart/smstop is chained to the result value. We can do that by doing
8627 // a vreg -> vreg copy.
8628 Register Reg = MF.getRegInfo().createVirtualRegister(
8629 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8630 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
8631 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
8632 InVals[I].getValueType());
8633 }
8634 }
8635
8636 return Result;
8637}
8638
8639bool AArch64TargetLowering::CanLowerReturn(
8640 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
8641 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
8642 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8643 SmallVector<CCValAssign, 16> RVLocs;
8644 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
8645 return CCInfo.CheckReturn(Outs, RetCC);
8646}
8647
8648SDValue
8649AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
8650 bool isVarArg,
8651 const SmallVectorImpl<ISD::OutputArg> &Outs,
8652 const SmallVectorImpl<SDValue> &OutVals,
8653 const SDLoc &DL, SelectionDAG &DAG) const {
8654 auto &MF = DAG.getMachineFunction();
8655 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8656
8657 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8658 SmallVector<CCValAssign, 16> RVLocs;
8659 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
8660 CCInfo.AnalyzeReturn(Outs, RetCC);
8661
8662 // Copy the result values into the output registers.
8663 SDValue Glue;
8664 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
8665 SmallSet<unsigned, 4> RegsUsed;
8666 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
8667 ++i, ++realRVLocIdx) {
8668 CCValAssign &VA = RVLocs[i];
8669 assert(VA.isRegLoc() && "Can only return in registers!");
8670 SDValue Arg = OutVals[realRVLocIdx];
8671
8672 switch (VA.getLocInfo()) {
8673 default:
8674 llvm_unreachable("Unknown loc info!");
8675 case CCValAssign::Full:
8676 if (Outs[i].ArgVT == MVT::i1) {
8677 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
8678 // value. This is strictly redundant on Darwin (which uses "zeroext
8679 // i1"), but will be optimised out before ISel.
8680 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8681 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8682 }
8683 break;
8684 case CCValAssign::BCvt:
8685 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
8686 break;
8687 case CCValAssign::AExt:
8688 case CCValAssign::ZExt:
8689 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8690 break;
8691 case CCValAssign::AExtUpper:
8692 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8693 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8694 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8695 DAG.getConstant(32, DL, VA.getLocVT()));
8696 break;
8697 }
8698
8699 if (RegsUsed.count(VA.getLocReg())) {
8700 SDValue &Bits =
8701 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
8702 return Elt.first == VA.getLocReg();
8703 })->second;
8704 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8705 } else {
8706 RetVals.emplace_back(VA.getLocReg(), Arg);
8707 RegsUsed.insert(VA.getLocReg());
8708 }
8709 }
8710
8711 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8712
8713 // Emit SMSTOP before returning from a locally streaming function
8714 SMEAttrs FuncAttrs(MF.getFunction());
8715 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
8716 if (FuncAttrs.hasStreamingCompatibleInterface()) {
8717 Register Reg = FuncInfo->getPStateSMReg();
8718 assert(Reg.isValid() && "PStateSM Register is invalid");
8719 SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
8720 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8721 /*Glue*/ SDValue(),
8722 AArch64SME::IfCallerIsNonStreaming, PStateSM);
8723 } else
8724 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8725 /*Glue*/ SDValue(), AArch64SME::Always);
8726 Glue = Chain.getValue(1);
8727 }
8728
8729 SmallVector<SDValue, 4> RetOps(1, Chain);
8730 for (auto &RetVal : RetVals) {
8731 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
8732 isPassedInFPR(RetVal.second.getValueType()))
8733 RetVal.second = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8734 RetVal.second.getValueType(), RetVal.second);
8735 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
8736 Glue = Chain.getValue(1);
8737 RetOps.push_back(
8738 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
8739 }
8740
8741 // Windows AArch64 ABIs require that for returning structs by value we copy
8742 // the sret argument into X0 for the return.
8743 // We saved the argument into a virtual register in the entry block,
8744 // so now we copy the value out and into X0.
8745 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
8746 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
8747 getPointerTy(MF.getDataLayout()));
8748
8749 unsigned RetValReg = AArch64::X0;
8750 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8751 RetValReg = AArch64::X8;
8752 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
8753 Glue = Chain.getValue(1);
8754
8755 RetOps.push_back(
8756 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
8757 }
8758
8759 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
8760 if (I) {
8761 for (; *I; ++I) {
8762 if (AArch64::GPR64RegClass.contains(*I))
8763 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
8764 else if (AArch64::FPR64RegClass.contains(*I))
8765 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
8766 else
8767 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
8768 }
8769 }
8770
8771 RetOps[0] = Chain; // Update chain.
8772
8773 // Add the glue if we have it.
8774 if (Glue.getNode())
8775 RetOps.push_back(Glue);
8776
8777 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8778 // ARM64EC entry thunks use a special return sequence: instead of a regular
8779 // "ret" instruction, they need to explicitly call the emulator.
8780 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8781 SDValue Arm64ECRetDest =
8782 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
8783 Arm64ECRetDest =
8784 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
8785 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
8786 MachinePointerInfo());
8787 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
8788 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
8789 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
8790 }
8791
8792 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
8793}
8794
8795//===----------------------------------------------------------------------===//
8796// Other Lowering Code
8797//===----------------------------------------------------------------------===//
8798
8799SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
8800 SelectionDAG &DAG,
8801 unsigned Flag) const {
8802 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
8803 N->getOffset(), Flag);
8804}
8805
8806SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
8807 SelectionDAG &DAG,
8808 unsigned Flag) const {
8809 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
8810}
8811
8812SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
8813 SelectionDAG &DAG,
8814 unsigned Flag) const {
8815 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
8816 N->getOffset(), Flag);
8817}
8818
8819SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
8820 SelectionDAG &DAG,
8821 unsigned Flag) const {
8822 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
8823}
8824
8825SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
8826 SelectionDAG &DAG,
8827 unsigned Flag) const {
8828 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
8829}
8830
8831// (loadGOT sym)
8832template <class NodeTy>
8833SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
8834 unsigned Flags) const {
8835 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
8836 SDLoc DL(N);
8837 EVT Ty = getPointerTy(DAG.getDataLayout());
8838 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
8839 // FIXME: Once remat is capable of dealing with instructions with register
8840 // operands, expand this into two nodes instead of using a wrapper node.
8841 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
8842}
8843
8844// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
8845template <class NodeTy>
8846SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
8847 unsigned Flags) const {
8848 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
8849 SDLoc DL(N);
8850 EVT Ty = getPointerTy(DAG.getDataLayout());
8851 const unsigned char MO_NC = AArch64II::MO_NC;
8852 return DAG.getNode(
8853 AArch64ISD::WrapperLarge, DL, Ty,
8854 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
8855 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
8856 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
8857 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
8858}
8859
8860// (addlow (adrp %hi(sym)) %lo(sym))
8861template <class NodeTy>
8862SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
8863 unsigned Flags) const {
8864 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
8865 SDLoc DL(N);
8866 EVT Ty = getPointerTy(DAG.getDataLayout());
8867 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
8868 SDValue Lo = getTargetNode(N, Ty, DAG,
8869 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
8870 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
8871 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
8872}
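// For a global 'var' in the small code model, the ADRP/ADDlow pair built
// above is expected to materialise as something like:
//   adrp x0, var             ; 4KiB page containing var
//   add  x0, x0, :lo12:var   ; plus the low 12 bits of its page offset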
8873
8874// (adr sym)
8875template <class NodeTy>
8876SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
8877 unsigned Flags) const {
8878 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
8879 SDLoc DL(N);
8880 EVT Ty = getPointerTy(DAG.getDataLayout());
8881 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
8882 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
8883}
8884
8885SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
8886 SelectionDAG &DAG) const {
8887 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
8888 const GlobalValue *GV = GN->getGlobal();
8889 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
8890
8891 if (OpFlags != AArch64II::MO_NO_FLAG)
8892 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
8893 "unexpected offset in global node");
8894
8895 // This also catches the large code model case for Darwin, and tiny code
8896 // model with got relocations.
8897 if ((OpFlags & AArch64II::MO_GOT) != 0) {
8898 return getGOT(GN, DAG, OpFlags);
8899 }
8900
8901 SDValue Result;
8902 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8903 !getTargetMachine().isPositionIndependent()) {
8904 Result = getAddrLarge(GN, DAG, OpFlags);
8905 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
8906 Result = getAddrTiny(GN, DAG, OpFlags);
8907 } else {
8908 Result = getAddr(GN, DAG, OpFlags);
8909 }
8910 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8911 SDLoc DL(GN);
8912 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
8913 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
8914 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
8915 return Result;
8916}
8917
8918/// Convert a TLS address reference into the correct sequence of loads
8919/// and calls to compute the variable's address (for Darwin, currently) and
8920/// return an SDValue containing the final node.
8921
8922/// Darwin only has one TLS scheme which must be capable of dealing with the
8923/// fully general situation, in the worst case. This means:
8924/// + "extern __thread" declaration.
8925/// + Defined in a possibly unknown dynamic library.
8926///
8927/// The general system is that each __thread variable has a [3 x i64] descriptor
8928/// which contains information used by the runtime to calculate the address. The
8929/// only part of this the compiler needs to know about is the first xword, which
8930/// contains a function pointer that must be called with the address of the
8931/// entire descriptor in "x0".
8932///
8933/// Since this descriptor may be in a different unit, in general even the
8934/// descriptor must be accessed via an indirect load. The "ideal" code sequence
8935/// is:
8936/// adrp x0, _var@TLVPPAGE
8937/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
8938/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
8939/// ; the function pointer
8940/// blr x1 ; Uses descriptor address in x0
8941/// ; Address of _var is now in x0.
8942///
8943/// If the address of _var's descriptor *is* known to the linker, then it can
8944/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
8945/// a slight efficiency gain.
8946SDValue
8947AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
8948 SelectionDAG &DAG) const {
8949 assert(Subtarget->isTargetDarwin() &&
8950 "This function expects a Darwin target");
8951
8952 SDLoc DL(Op);
8953 MVT PtrVT = getPointerTy(DAG.getDataLayout());
8954 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
8955 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
8956
8957 SDValue TLVPAddr =
8958 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
8959 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
8960
8961 // The first entry in the descriptor is a function pointer that we must call
8962 // to obtain the address of the variable.
8963 SDValue Chain = DAG.getEntryNode();
8964 SDValue FuncTLVGet = DAG.getLoad(
8965 PtrMemVT, DL, Chain, DescAddr,
8966 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
8967 Align(PtrMemVT.getSizeInBits() / 8),
8968 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
8969 Chain = FuncTLVGet.getValue(1);
8970
8971 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
8972 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
8973
8974 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8975 MFI.setAdjustsStack(true);
8976
8977 // TLS calls preserve all registers except those that absolutely must be
8978 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
8979 // silly).
8980 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8981 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
8982 if (Subtarget->hasCustomCallingConv())
8983 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
8984
8985 // Finally, we can make the call. This is just a degenerate version of a
8986 // normal AArch64 call node: x0 takes the address of the descriptor, and
8987 // returns the address of the variable in this thread.
8988 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
8989 Chain =
8990 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
8991 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
8992 DAG.getRegisterMask(Mask), Chain.getValue(1));
8993 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
8994}
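// A source-level access such as the following (hypothetical example) reaches
// this lowering when targeting Darwin:
//   extern __thread int TLSCounter;
//   int readTLSCounter() { return TLSCounter; }
// The descriptor load, the indirect call through its first entry, and the
// X0-based argument/result all correspond to the sequence documented above.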
8995
8996/// Convert a thread-local variable reference into a sequence of instructions to
8997/// compute the variable's address for the local exec TLS model of ELF targets.
8998/// The sequence depends on the maximum TLS area size.
8999SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
9000 SDValue ThreadBase,
9001 const SDLoc &DL,
9002 SelectionDAG &DAG) const {
9003 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9004 SDValue TPOff, Addr;
9005
9006 switch (DAG.getTarget().Options.TLSSize) {
9007 default:
9008 llvm_unreachable("Unexpected TLS size");
9009
9010 case 12: {
9011 // mrs x0, TPIDR_EL0
9012 // add x0, x0, :tprel_lo12:a
9013 SDValue Var = DAG.getTargetGlobalAddress(
9014 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
9015 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
9016 Var,
9017 DAG.getTargetConstant(0, DL, MVT::i32)),
9018 0);
9019 }
9020
9021 case 24: {
9022 // mrs x0, TPIDR_EL0
9023 // add x0, x0, :tprel_hi12:a
9024 // add x0, x0, :tprel_lo12_nc:a
9025 SDValue HiVar = DAG.getTargetGlobalAddress(
9026 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9027 SDValue LoVar = DAG.getTargetGlobalAddress(
9028 GV, DL, PtrVT, 0,
9029 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9030 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
9031 HiVar,
9032 DAG.getTargetConstant(0, DL, MVT::i32)),
9033 0);
9034 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
9035 LoVar,
9036 DAG.getTargetConstant(0, DL, MVT::i32)),
9037 0);
9038 }
9039
9040 case 32: {
9041 // mrs x1, TPIDR_EL0
9042 // movz x0, #:tprel_g1:a
9043 // movk x0, #:tprel_g0_nc:a
9044 // add x0, x1, x0
9045 SDValue HiVar = DAG.getTargetGlobalAddress(
9046 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
9047 SDValue LoVar = DAG.getTargetGlobalAddress(
9048 GV, DL, PtrVT, 0,
9049 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
9050 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
9051 DAG.getTargetConstant(16, DL, MVT::i32)),
9052 0);
9053 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
9054 DAG.getTargetConstant(0, DL, MVT::i32)),
9055 0);
9056 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9057 }
9058
9059 case 48: {
9060 // mrs x1, TPIDR_EL0
9061 // movz x0, #:tprel_g2:a
9062 // movk x0, #:tprel_g1_nc:a
9063 // movk x0, #:tprel_g0_nc:a
9064 // add x0, x1, x0
9065 SDValue HiVar = DAG.getTargetGlobalAddress(
9066 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
9067 SDValue MiVar = DAG.getTargetGlobalAddress(
9068 GV, DL, PtrVT, 0,
9069 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
9070 SDValue LoVar = DAG.getTargetGlobalAddress(
9071 GV, DL, PtrVT, 0,
9072 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
9073 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
9074 DAG.getTargetConstant(32, DL, MVT::i32)),
9075 0);
9076 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
9077 DAG.getTargetConstant(16, DL, MVT::i32)),
9078 0);
9079 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
9080 DAG.getTargetConstant(0, DL, MVT::i32)),
9081 0);
9082 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9083 }
9084 }
9085}
9086
9087/// When accessing thread-local variables under either the general-dynamic or
9088/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
9089 /// have a descriptor, accessible via a PC-relative ADRP, whose first entry
9090/// is a function pointer to carry out the resolution.
9091///
9092/// The sequence is:
9093/// adrp x0, :tlsdesc:var
9094/// ldr x1, [x0, #:tlsdesc_lo12:var]
9095/// add x0, x0, #:tlsdesc_lo12:var
9096/// .tlsdesccall var
9097/// blr x1
9098/// (TPIDR_EL0 offset now in x0)
9099///
9100/// The above sequence must be produced unscheduled, to enable the linker to
9101/// optimize/relax this sequence.
9102/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
9103/// above sequence, and expanded really late in the compilation flow, to ensure
9104/// the sequence is produced as per above.
9105SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
9106 const SDLoc &DL,
9107 SelectionDAG &DAG) const {
9108 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9109
9110 SDValue Chain = DAG.getEntryNode();
9111 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
9112
9113 Chain =
9114 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
9115 SDValue Glue = Chain.getValue(1);
9116
9117 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
9118}
9119
9120SDValue
9121AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
9122 SelectionDAG &DAG) const {
9123 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
9124
9125 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9126
9127 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
9128
9129 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
9130 if (Model == TLSModel::LocalDynamic)
9131 Model = TLSModel::GeneralDynamic;
9132 }
9133
9134 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9135 Model != TLSModel::LocalExec)
9136 report_fatal_error("ELF TLS only supported in small memory model or "
9137 "in local exec TLS model");
9138 // Different choices can be made for the maximum size of the TLS area for a
9139 // module. For the small address model, the default TLS size is 16MiB and the
9140 // maximum TLS size is 4GiB.
9141 // FIXME: add tiny and large code model support for TLS access models other
9142 // than local exec. We currently generate the same code as small for tiny,
9143 // which may be larger than needed.
9144
9145 SDValue TPOff;
9146 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9147 SDLoc DL(Op);
9148 const GlobalValue *GV = GA->getGlobal();
9149
9150 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
9151
9152 if (Model == TLSModel::LocalExec) {
9153 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
9154 } else if (Model == TLSModel::InitialExec) {
9155 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9156 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
9157 } else if (Model == TLSModel::LocalDynamic) {
9158 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
9159 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
9160 // the beginning of the module's TLS region, followed by a DTPREL offset
9161 // calculation.
9162
9163 // These accesses will need deduplicating if there's more than one.
9164 AArch64FunctionInfo *MFI =
9165 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9166 MFI->incNumLocalDynamicTLSAccesses();
9167
9168 // The call needs a relocation too for linker relaxation. It doesn't make
9169 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9170 // the address.
9171 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
9172 AArch64II::MO_TLS);
9173
9174 // Now we can calculate the offset from TPIDR_EL0 to this module's
9175 // thread-local area.
9176 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9177
9178 // Now use :dtprel_whatever: operations to calculate this variable's offset
9179 // in its thread-storage area.
9180 SDValue HiVar = DAG.getTargetGlobalAddress(
9181 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9182 SDValue LoVar = DAG.getTargetGlobalAddress(
9183 GV, DL, MVT::i64, 0,
9184 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9185
9186 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
9187 DAG.getTargetConstant(0, DL, MVT::i32)),
9188 0);
9189 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
9190 DAG.getTargetConstant(0, DL, MVT::i32)),
9191 0);
9192 } else if (Model == TLSModel::GeneralDynamic) {
9193 // The call needs a relocation too for linker relaxation. It doesn't make
9194 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9195 // the address.
9196 SDValue SymAddr =
9197 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9198
9199 // Finally we can make a call to calculate the offset from tpidr_el0.
9200 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9201 } else
9202 llvm_unreachable("Unsupported ELF TLS access model");
9203
9204 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9205}
9206
9207SDValue
9208AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
9209 SelectionDAG &DAG) const {
9210 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
9211
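// The generated sequence is roughly: load the TLS array pointer from the TEB
// (at [x18, #0x58]), load _tls_index, use it (scaled by 8) to index the TLS
// array, then add the variable's offset from the start of the .tls section.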
9212 SDValue Chain = DAG.getEntryNode();
9213 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9214 SDLoc DL(Op);
9215
9216 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
9217
9218 // Load the ThreadLocalStoragePointer from the TEB
9219 // A pointer to the TLS array is located at offset 0x58 from the TEB.
9220 SDValue TLSArray =
9221 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
9222 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
9223 Chain = TLSArray.getValue(1);
9224
9225 // Load the TLS index from the C runtime;
9226 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
9227 // This also does the same as LOADgot, but using a generic i32 load,
9228 // while LOADgot only loads i64.
9229 SDValue TLSIndexHi =
9230 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
9231 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
9232 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9233 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
9234 SDValue TLSIndex =
9235 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
9236 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
9237 Chain = TLSIndex.getValue(1);
9238
9239 // The pointer to the thread's TLS data area is found by indexing the
9240 // TLSArray with the TLS index scaled by 8.
9241 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
9242 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
9243 DAG.getConstant(3, DL, PtrVT));
9244 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
9245 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
9246 MachinePointerInfo());
9247 Chain = TLS.getValue(1);
9248
9249 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9250 const GlobalValue *GV = GA->getGlobal();
9251 SDValue TGAHi = DAG.getTargetGlobalAddress(
9252 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9253 SDValue TGALo = DAG.getTargetGlobalAddress(
9254 GV, DL, PtrVT, 0,
9255 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9256
9257 // Add the offset from the start of the .tls section (section base).
9258 SDValue Addr =
9259 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
9260 DAG.getTargetConstant(0, DL, MVT::i32)),
9261 0);
9262 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
9263 return Addr;
9264}
9265
9266SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
9267 SelectionDAG &DAG) const {
9268 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9269 if (DAG.getTarget().useEmulatedTLS())
9270 return LowerToTLSEmulatedModel(GA, DAG);
9271
9272 if (Subtarget->isTargetDarwin())
9273 return LowerDarwinGlobalTLSAddress(Op, DAG);
9274 if (Subtarget->isTargetELF())
9275 return LowerELFGlobalTLSAddress(Op, DAG);
9276 if (Subtarget->isTargetWindows())
9277 return LowerWindowsGlobalTLSAddress(Op, DAG);
9278
9279 llvm_unreachable("Unexpected platform trying to use TLS");
9280}
9281
9282// Looks through \param Val to determine the bit that can be used to
9283// check the sign of the value. It returns the unextended value and
9284// the sign bit position.
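// For example, looking through (sign_extend_inreg x, i8) yields {x, 7}, while
// a plain i64 value yields {value, 63}.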
9285std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
9286 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
9287 return {Val.getOperand(0),
9288 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
9289 1};
9290
9291 if (Val.getOpcode() == ISD::SIGN_EXTEND)
9292 return {Val.getOperand(0),
9293 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
9294
9295 return {Val, Val.getValueSizeInBits() - 1};
9296}
9297
9298SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
9299 SDValue Chain = Op.getOperand(0);
9300 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
9301 SDValue LHS = Op.getOperand(2);
9302 SDValue RHS = Op.getOperand(3);
9303 SDValue Dest = Op.getOperand(4);
9304 SDLoc dl(Op);
9305
9306 MachineFunction &MF = DAG.getMachineFunction();
9307 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
9308 // will not be produced, as they are conditional branch instructions that do
9309 // not set flags.
9310 bool ProduceNonFlagSettingCondBr =
9311 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
9312
9313 // Handle f128 first, since lowering it will result in comparing the return
9314 // value of a libcall against zero, which is just what the rest of LowerBR_CC
9315 // is expecting to deal with.
9316 if (LHS.getValueType() == MVT::f128) {
9317 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9318
9319 // If softenSetCCOperands returned a scalar, we need to compare the result
9320 // against zero to select between true and false values.
9321 if (!RHS.getNode()) {
9322 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9323 CC = ISD::SETNE;
9324 }
9325 }
9326
9327 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
9328 // instruction.
9329 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
9330 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9331 // Only lower legal XALUO ops.
9332 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
9333 return SDValue();
9334
9335 // The actual operation with overflow check.
9336 AArch64CC::CondCode OFCC;
9337 SDValue Value, Overflow;
9338 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
9339
9340 if (CC == ISD::SETNE)
9341 OFCC = getInvertedCondCode(OFCC);
9342 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
9343
9344 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9345 Overflow);
9346 }
9347
9348 if (LHS.getValueType().isInteger()) {
9349 assert((LHS.getValueType() == RHS.getValueType()) &&
9350 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9351
9352 // If the RHS of the comparison is zero, we can potentially fold this
9353 // to a specialized branch.
9354 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9355 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
9356 if (CC == ISD::SETEQ) {
9357 // See if we can use a TBZ to fold in an AND as well.
9358 // TBZ has a smaller branch displacement than CBZ. If the offset is
9359 // out of bounds, a late MI-layer pass rewrites branches.
9360 // 403.gcc is an example that hits this case.
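// For example, (brcond (seteq (and x, 8), 0), dest) becomes (tbz x, #3, dest).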
9361 if (LHS.getOpcode() == ISD::AND &&
9362 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9363 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9364 SDValue Test = LHS.getOperand(0);
9365 uint64_t Mask = LHS.getConstantOperandVal(1);
9366 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
9367 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9368 Dest);
9369 }
9370
9371 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
9372 } else if (CC == ISD::SETNE) {
9373 // See if we can use a TBZ to fold in an AND as well.
9374 // TBZ has a smaller branch displacement than CBZ. If the offset is
9375 // out of bounds, a late MI-layer pass rewrites branches.
9376 // 403.gcc is an example that hits this case.
9377 if (LHS.getOpcode() == ISD::AND &&
9378 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9379 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9380 SDValue Test = LHS.getOperand(0);
9381 uint64_t Mask = LHS.getConstantOperandVal(1);
9382 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
9383 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9384 Dest);
9385 }
9386
9387 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
9388 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
9389 // Don't combine AND since emitComparison converts the AND to an ANDS
9390 // (a.k.a. TST) and the test in the test bit and branch instruction
9391 // becomes redundant. This would also increase register pressure.
9392 uint64_t SignBitPos;
9393 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9394 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
9395 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9396 }
9397 }
9398 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
9399 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
9400 // Don't combine AND since emitComparison converts the AND to an ANDS
9401 // (a.k.a. TST) and the test in the test bit and branch instruction
9402 // becomes redundant. This would also increase register pressure.
9403 uint64_t SignBitPos;
9404 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9405 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
9406 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9407 }
9408
9409 SDValue CCVal;
9410 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
9411 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9412 Cmp);
9413 }
9414
9415 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
9416 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9417
9418 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9419 // clean. Some of them require two branches to implement.
9420 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9421 AArch64CC::CondCode CC1, CC2;
9422 changeFPCCToAArch64CC(CC, CC1, CC2);
9423 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9424 SDValue BR1 =
9425 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
9426 if (CC2 != AArch64CC::AL) {
9427 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9428 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
9429 Cmp);
9430 }
9431
9432 return BR1;
9433}
9434
9435SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
9436 SelectionDAG &DAG) const {
9437 if (!Subtarget->hasNEON())
9438 return SDValue();
9439
9440 EVT VT = Op.getValueType();
9441 EVT IntVT = VT.changeTypeToInteger();
9442 SDLoc DL(Op);
9443
9444 SDValue In1 = Op.getOperand(0);
9445 SDValue In2 = Op.getOperand(1);
9446 EVT SrcVT = In2.getValueType();
9447
9448 if (!SrcVT.bitsEq(VT))
9449 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
9450
9451 if (VT.isScalableVector())
9452 IntVT =
9453 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
9454
9455 if (VT.isFixedLengthVector() &&
9456 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
9457 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
9458
9459 In1 = convertToScalableVector(DAG, ContainerVT, In1);
9460 In2 = convertToScalableVector(DAG, ContainerVT, In2);
9461
9462 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
9463 return convertFromScalableVector(DAG, VT, Res);
9464 }
9465
9466 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
9467 if (VT.isScalableVector())
9468 return getSVESafeBitCast(VT, Op, DAG);
9469
9470 return DAG.getBitcast(VT, Op);
9471 };
9472
9473 SDValue VecVal1, VecVal2;
9474 EVT VecVT;
9475 auto SetVecVal = [&](int Idx = -1) {
9476 if (!VT.isVector()) {
9477 VecVal1 =
9478 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
9479 VecVal2 =
9480 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
9481 } else {
9482 VecVal1 = BitCast(VecVT, In1, DAG);
9483 VecVal2 = BitCast(VecVT, In2, DAG);
9484 }
9485 };
9486 if (VT.isVector()) {
9487 VecVT = IntVT;
9488 SetVecVal();
9489 } else if (VT == MVT::f64) {
9490 VecVT = MVT::v2i64;
9491 SetVecVal(AArch64::dsub);
9492 } else if (VT == MVT::f32) {
9493 VecVT = MVT::v4i32;
9494 SetVecVal(AArch64::ssub);
9495 } else if (VT == MVT::f16 || VT == MVT::bf16) {
9496 VecVT = MVT::v8i16;
9497 SetVecVal(AArch64::hsub);
9498 } else {
9499 llvm_unreachable("Invalid type for copysign!");
9500 }
9501
9502 unsigned BitWidth = In1.getScalarValueSizeInBits();
9503 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
9504
9505 // We want to materialize a mask with every bit but the high bit set, but the
9506 // AdvSIMD immediate moves cannot materialize that in a single instruction for
9507 // 64-bit elements. Instead, materialize all bits set and then negate that.
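// For example, for 64-bit elements the desired mask is 0x7fffffffffffffff.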
9508 if (VT == MVT::f64 || VT == MVT::v2f64) {
9509 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
9510 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
9511 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
9512 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
9513 }
9514
9515 SDValue BSP =
9516 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
9517 if (VT == MVT::f16 || VT == MVT::bf16)
9518 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
9519 if (VT == MVT::f32)
9520 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
9521 if (VT == MVT::f64)
9522 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
9523
9524 return BitCast(VT, BSP, DAG);
9525}
9526
9527SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
9528 SelectionDAG &DAG) const {
9529 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
9530 Attribute::NoImplicitFloat))
9531 return SDValue();
9532
9533 if (!Subtarget->hasNEON())
9534 return SDValue();
9535
9536 bool IsParity = Op.getOpcode() == ISD::PARITY;
9537 SDValue Val = Op.getOperand(0);
9538 SDLoc DL(Op);
9539 EVT VT = Op.getValueType();
9540
9541 // For i32, a general parity computation using EORs is more efficient than
9542 // using floating point.
9543 if (VT == MVT::i32 && IsParity)
9544 return SDValue();
9545
9546 // If there is no CNT instruction available, GPR popcount can
9547 // be more efficiently lowered to the following sequence that uses
9548 // AdvSIMD registers/instructions as long as the copies to/from
9549 // the AdvSIMD registers are cheap.
9550 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
9551 // CNT V0.8B, V0.8B // 8xbyte pop-counts
9552 // ADDV B0, V0.8B // sum 8xbyte pop-counts
9553 // UMOV X0, V0.B[0] // copy byte result back to integer reg
9554 if (VT == MVT::i32 || VT == MVT::i64) {
9555 if (VT == MVT::i32)
9556 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
9557 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
9558
9559 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
9560 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9561 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9562 DAG.getConstant(0, DL, MVT::i64));
9563
9564 if (IsParity)
9565 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9566 DAG.getConstant(1, DL, MVT::i32));
9567
9568 if (VT == MVT::i64)
9569 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
9570 return UaddLV;
9571 } else if (VT == MVT::i128) {
9572 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
9573
9574 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
9575 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9576 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9577 DAG.getConstant(0, DL, MVT::i64));
9578
9579 if (IsParity)
9580 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9581 DAG.getConstant(1, DL, MVT::i32));
9582
9583 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
9584 }
9585
9586 assert(!IsParity && "ISD::PARITY of vector types not supported");
9587
9588 if (VT.isScalableVector() ||
9589 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
9590 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
9591
9592 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
9593 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
9594 "Unexpected type for custom ctpop lowering");
9595
9596 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
9597 Val = DAG.getBitcast(VT8Bit, Val);
9598 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
9599
9600 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
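// For example, a v4i32 result is produced from the v16i8 CTPOP by two UADDLP
// steps: v16i8 -> v8i16 -> v4i32.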
9601 unsigned EltSize = 8;
9602 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
9603 while (EltSize != VT.getScalarSizeInBits()) {
9604 EltSize *= 2;
9605 NumElts /= 2;
9606 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
9607 Val = DAG.getNode(
9608 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
9609 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
9610 }
9611
9612 return Val;
9613}
9614
9615SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
9616 EVT VT = Op.getValueType();
9617 assert(VT.isScalableVector() ||
9618 useSVEForFixedLengthVectorVT(
9619 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
9620
9621 SDLoc DL(Op);
9622 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
9623 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
9624}
9625
9626SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
9627 SelectionDAG &DAG) const {
9628
9629 EVT VT = Op.getValueType();
9630 SDLoc DL(Op);
9631 unsigned Opcode = Op.getOpcode();
9632 ISD::CondCode CC;
9633 switch (Opcode) {
9634 default:
9635 llvm_unreachable("Wrong instruction");
9636 case ISD::SMAX:
9637 CC = ISD::SETGT;
9638 break;
9639 case ISD::SMIN:
9640 CC = ISD::SETLT;
9641 break;
9642 case ISD::UMAX:
9643 CC = ISD::SETUGT;
9644 break;
9645 case ISD::UMIN:
9646 CC = ISD::SETULT;
9647 break;
9648 }
9649
9650 if (VT.isScalableVector() ||
9651 useSVEForFixedLengthVectorVT(
9652 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
9653 switch (Opcode) {
9654 default:
9655 llvm_unreachable("Wrong instruction");
9656 case ISD::SMAX:
9657 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
9658 case ISD::SMIN:
9659 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
9660 case ISD::UMAX:
9661 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
9662 case ISD::UMIN:
9663 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
9664 }
9665 }
9666
9667 SDValue Op0 = Op.getOperand(0);
9668 SDValue Op1 = Op.getOperand(1);
9669 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
9670 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
9671}
9672
9673SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
9674 SelectionDAG &DAG) const {
9675 EVT VT = Op.getValueType();
9676
9677 if (VT.isScalableVector() ||
9678 useSVEForFixedLengthVectorVT(
9679 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
9680 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
9681
9682 SDLoc DL(Op);
9683 SDValue REVB;
9684 MVT VST;
9685
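// An element-wise bitreverse is implemented as a byte swap within each element
// (REV32/REV64) followed by a byte-wise BITREVERSE on the v8i8/v16i8 view,
// then a cast back to the original type.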
9686 switch (VT.getSimpleVT().SimpleTy) {
9687 default:
9688 llvm_unreachable("Invalid type for bitreverse!");
9689
9690 case MVT::v2i32: {
9691 VST = MVT::v8i8;
9692 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
9693
9694 break;
9695 }
9696
9697 case MVT::v4i32: {
9698 VST = MVT::v16i8;
9699 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
9700
9701 break;
9702 }
9703
9704 case MVT::v1i64: {
9705 VST = MVT::v8i8;
9706 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
9707
9708 break;
9709 }
9710
9711 case MVT::v2i64: {
9712 VST = MVT::v16i8;
9713 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
9714
9715 break;
9716 }
9717 }
9718
9719 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
9720 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
9721}
9722
9723 // Check whether N forms a continuous comparison sequence: a chain of ORs whose leaf nodes are XORs.
9724static bool
9725isOrXorChain(SDValue N, unsigned &Num,
9726 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
9727 if (Num == MaxXors)
9728 return false;
9729
9730 // Skip the one-use zext
9731 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
9732 N = N->getOperand(0);
9733
9734 // The leaf node must be XOR
9735 if (N->getOpcode() == ISD::XOR) {
9736 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
9737 Num++;
9738 return true;
9739 }
9740
9741 // All the non-leaf nodes must be OR.
9742 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
9743 return false;
9744
9745 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
9746 isOrXorChain(N->getOperand(1), Num, WorkList))
9747 return true;
9748 return false;
9749}
9750
9751 // Transform chains of ORs and XORs, which are usually outlined by memcmp/bcmp.
9752 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
9753 SDValue LHS = N->getOperand(0);
9754 SDValue RHS = N->getOperand(1);
9755 SDLoc DL(N);
9756 EVT VT = N->getValueType(0);
9757 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
9758
9759 // Only handle integer compares.
9760 if (N->getOpcode() != ISD::SETCC)
9761 return SDValue();
9762
9763 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
9764 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
9765 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
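// For example, (setcc (or (xor a, b), (xor c, d)), 0, eq) is rewritten below
// as (and (setcc a, b, eq), (setcc c, d, eq)).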
9766 unsigned NumXors = 0;
9767 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
9768 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
9769 isOrXorChain(LHS, NumXors, WorkList)) {
9770 SDValue XOR0, XOR1;
9771 std::tie(XOR0, XOR1) = WorkList[0];
9772 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
9773 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9774 for (unsigned I = 1; I < WorkList.size(); I++) {
9775 std::tie(XOR0, XOR1) = WorkList[I];
9776 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9777 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
9778 }
9779
9780 // Exit early by inverting the condition, which helps reduce indentation.
9781 return Cmp;
9782 }
9783
9784 return SDValue();
9785}
9786
9787SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
9788
9789 if (Op.getValueType().isVector())
9790 return LowerVSETCC(Op, DAG);
9791
9792 bool IsStrict = Op->isStrictFPOpcode();
9793 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
9794 unsigned OpNo = IsStrict ? 1 : 0;
9795 SDValue Chain;
9796 if (IsStrict)
9797 Chain = Op.getOperand(0);
9798 SDValue LHS = Op.getOperand(OpNo + 0);
9799 SDValue RHS = Op.getOperand(OpNo + 1);
9800 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
9801 SDLoc dl(Op);
9802
9803 // We chose ZeroOrOneBooleanContents, so use zero and one.
9804 EVT VT = Op.getValueType();
9805 SDValue TVal = DAG.getConstant(1, dl, VT);
9806 SDValue FVal = DAG.getConstant(0, dl, VT);
9807
9808 // Handle f128 first, since one possible outcome is a normal integer
9809 // comparison which gets picked up by the next if statement.
9810 if (LHS.getValueType() == MVT::f128) {
9811 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
9812 IsSignaling);
9813
9814 // If softenSetCCOperands returned a scalar, use it.
9815 if (!RHS.getNode()) {
9816 assert(LHS.getValueType() == Op.getValueType() &&
9817 "Unexpected setcc expansion!");
9818 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
9819 }
9820 }
9821
9822 if (LHS.getValueType().isInteger()) {
9823 SDValue CCVal;
9824 SDValue Cmp = getAArch64Cmp(
9825 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
9826
9827 // Note that we inverted the condition above, so we reverse the order of
9828 // the true and false operands here. This will allow the setcc to be
9829 // matched to a single CSINC instruction.
9830 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
9831 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
9832 }
9833
9834 // Now we know we're dealing with FP values.
9835 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
9836 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9837
9838 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
9839 // and do the comparison.
9840 SDValue Cmp;
9841 if (IsStrict)
9842 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
9843 else
9844 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9845
9846 AArch64CC::CondCode CC1, CC2;
9847 changeFPCCToAArch64CC(CC, CC1, CC2);
9848 SDValue Res;
9849 if (CC2 == AArch64CC::AL) {
9850 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
9851 CC2);
9852 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9853
9854 // Note that we inverted the condition above, so we reverse the order of
9855 // the true and false operands here. This will allow the setcc to be
9856 // matched to a single CSINC instruction.
9857 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
9858 } else {
9859 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
9860 // totally clean. Some of them require two CSELs to implement. As is in
9861 // this case, we emit the first CSEL and then emit a second using the output
9862 // of the first as the RHS. We're effectively OR'ing the two CC's together.
9863
9864 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
9865 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9866 SDValue CS1 =
9867 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
9868
9869 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9870 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
9871 }
9872 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
9873}
9874
9875SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
9876 SelectionDAG &DAG) const {
9877
9878 SDValue LHS = Op.getOperand(0);
9879 SDValue RHS = Op.getOperand(1);
9880 EVT VT = LHS.getValueType();
9881 if (VT != MVT::i32 && VT != MVT::i64)
9882 return SDValue();
9883
9884 SDLoc DL(Op);
9885 SDValue Carry = Op.getOperand(2);
9886 // SBCS uses a carry, not a borrow, so the carry flag should be inverted first.
9887 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
9888 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
9889 LHS, RHS, InvCarry);
9890
9891 EVT OpVT = Op.getValueType();
9892 SDValue TVal = DAG.getConstant(1, DL, OpVT);
9893 SDValue FVal = DAG.getConstant(0, DL, OpVT);
9894
9895 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
9896 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
9897 SDValue CCVal =
9898 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
9899 // Inputs are swapped because the condition is inverted. This will allow
9900 // matching with a single CSINC instruction.
9901 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
9902 Cmp.getValue(1));
9903}
9904
9905SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
9906 SDValue RHS, SDValue TVal,
9907 SDValue FVal, const SDLoc &dl,
9908 SelectionDAG &DAG) const {
9909 // Handle f128 first, because it will result in a comparison of some RTLIB
9910 // call result against zero.
9911 if (LHS.getValueType() == MVT::f128) {
9912 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9913
9914 // If softenSetCCOperands returned a scalar, we need to compare the result
9915 // against zero to select between true and false values.
9916 if (!RHS.getNode()) {
9917 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9918 CC = ISD::SETNE;
9919 }
9920 }
9921
9922 // Also handle f16, for which we need to do a f32 comparison.
9923 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
9924 LHS.getValueType() == MVT::bf16) {
9925 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
9926 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
9927 }
9928
9929 // Next, handle integers.
9930 if (LHS.getValueType().isInteger()) {
9931 assert((LHS.getValueType() == RHS.getValueType()) &&
9932 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9933
9934 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
9935 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
9936 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9937 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
9938 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
9939 // supported types.
9940 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
9941 CTVal->isOne() && CFVal->isAllOnes() &&
9942 LHS.getValueType() == TVal.getValueType()) {
9943 EVT VT = LHS.getValueType();
9944 SDValue Shift =
9945 DAG.getNode(ISD::SRA, dl, VT, LHS,
9946 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9947 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
9948 }
9949
9950 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
9951 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
9952 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
9953 // Both require less instructions than compare and conditional select.
9954 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
9955 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
9956 LHS.getValueType() == RHS.getValueType()) {
9957 EVT VT = LHS.getValueType();
9958 SDValue Shift =
9959 DAG.getNode(ISD::SRA, dl, VT, LHS,
9960 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9961
9962 if (CC == ISD::SETGT)
9963 Shift = DAG.getNOT(dl, Shift, VT);
9964
9965 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
9966 }
9967
9968 unsigned Opcode = AArch64ISD::CSEL;
9969
9970 // If both the TVal and the FVal are constants, see if we can swap them in
9971 // order to form a CSINV or CSINC out of them.
9972 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
9973 std::swap(TVal, FVal);
9974 std::swap(CTVal, CFVal);
9975 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9976 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
9977 std::swap(TVal, FVal);
9978 std::swap(CTVal, CFVal);
9979 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9980 } else if (TVal.getOpcode() == ISD::XOR) {
9981 // If TVal is a NOT we want to swap TVal and FVal so that we can match
9982 // with a CSINV rather than a CSEL.
9983 if (isAllOnesConstant(TVal.getOperand(1))) {
9984 std::swap(TVal, FVal);
9985 std::swap(CTVal, CFVal);
9986 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9987 }
9988 } else if (TVal.getOpcode() == ISD::SUB) {
9989 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
9990 // that we can match with a CSNEG rather than a CSEL.
9991 if (isNullConstant(TVal.getOperand(0))) {
9992 std::swap(TVal, FVal);
9993 std::swap(CTVal, CFVal);
9994 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9995 }
9996 } else if (CTVal && CFVal) {
9997 const int64_t TrueVal = CTVal->getSExtValue();
9998 const int64_t FalseVal = CFVal->getSExtValue();
9999 bool Swap = false;
10000
10001 // If both TVal and FVal are constants, see if FVal is the
10002 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
10003 // instead of a CSEL in that case.
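// For example, TVal=7/FVal=-8 can use CSINV, TVal=7/FVal=-7 can use CSNEG,
// and TVal=7/FVal=6 can use CSINC.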
10004 if (TrueVal == ~FalseVal) {
10005 Opcode = AArch64ISD::CSINV;
10006 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
10007 TrueVal == -FalseVal) {
10008 Opcode = AArch64ISD::CSNEG;
10009 } else if (TVal.getValueType() == MVT::i32) {
10010 // If our operands are only 32-bit wide, make sure we use 32-bit
10011 // arithmetic for the check whether we can use CSINC. This ensures that
10012 // the addition in the check will wrap around properly in case there is
10013 // an overflow (which would not be the case if we do the check with
10014 // 64-bit arithmetic).
10015 const uint32_t TrueVal32 = CTVal->getZExtValue();
10016 const uint32_t FalseVal32 = CFVal->getZExtValue();
10017
10018 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
10019 Opcode = AArch64ISD::CSINC;
10020
10021 if (TrueVal32 > FalseVal32) {
10022 Swap = true;
10023 }
10024 }
10025 } else {
10026 // 64-bit check whether we can use CSINC.
10027 const uint64_t TrueVal64 = TrueVal;
10028 const uint64_t FalseVal64 = FalseVal;
10029
10030 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
10031 Opcode = AArch64ISD::CSINC;
10032
10033 if (TrueVal > FalseVal) {
10034 Swap = true;
10035 }
10036 }
10037 }
10038
10039 // Swap TVal and FVal if necessary.
10040 if (Swap) {
10041 std::swap(TVal, FVal);
10042 std::swap(CTVal, CFVal);
10043 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
10044 }
10045
10046 if (Opcode != AArch64ISD::CSEL) {
10047 // Drop FVal since we can get its value by simply inverting/negating
10048 // TVal.
10049 FVal = TVal;
10050 }
10051 }
10052
10053 // Avoid materializing a constant when possible by reusing a known value in
10054 // a register. However, don't perform this optimization if the known value
10055 // is one, zero or negative one in the case of a CSEL. We can always
10056 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
10057 // FVal, respectively.
10058 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
10059 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
10060 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
10061 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
10062 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
10063 // "a != C ? x : a" to avoid materializing C.
10064 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
10065 TVal = LHS;
10066 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
10067 FVal = LHS;
10068 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
10069 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
10070 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
10071 // avoid materializing C.
10072 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
10073 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
10074 Opcode = AArch64ISD::CSINV;
10075 TVal = LHS;
10076 FVal = DAG.getConstant(0, dl, FVal.getValueType());
10077 }
10078 }
10079
10080 SDValue CCVal;
10081 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
10082 EVT VT = TVal.getValueType();
10083 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
10084 }
10085
10086 // Now we know we're dealing with FP values.
10087 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
10088 LHS.getValueType() == MVT::f64);
10089 assert(LHS.getValueType() == RHS.getValueType());
10090 EVT VT = TVal.getValueType();
10091 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
10092
10093 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10094 // clean. Some of them require two CSELs to implement.
10095 AArch64CC::CondCode CC1, CC2;
10096 changeFPCCToAArch64CC(CC, CC1, CC2);
10097
10098 if (DAG.getTarget().Options.UnsafeFPMath) {
10099 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
10100 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
10101 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
10102 if (RHSVal && RHSVal->isZero()) {
10103 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
10104 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
10105
10106 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
10107 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
10108 TVal = LHS;
10109 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
10110 CFVal && CFVal->isZero() &&
10111 FVal.getValueType() == LHS.getValueType())
10112 FVal = LHS;
10113 }
10114 }
10115
10116 // Emit first, and possibly only, CSEL.
10117 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10118 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
10119
10120 // If we need a second CSEL, emit it, using the output of the first as the
10121 // RHS. We're effectively OR'ing the two CC's together.
10122 if (CC2 != AArch64CC::AL) {
10123 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
10124 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
10125 }
10126
10127 // Otherwise, return the output of the first CSEL.
10128 return CS1;
10129}
10130
10131SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
10132 SelectionDAG &DAG) const {
10133 EVT Ty = Op.getValueType();
10134 auto Idx = Op.getConstantOperandAPInt(2);
10135 int64_t IdxVal = Idx.getSExtValue();
10136 assert(Ty.isScalableVector() &&
10137 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
10138
10139 // We can use the splice instruction for certain index values where we are
10140 // able to efficiently generate the correct predicate. The index will be
10141 // inverted and used directly as the input to the ptrue instruction, i.e.
10142 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
10143 // splice predicate. However, we can only do this if we can guarantee that
10144 // there are enough elements in the vector, hence we check the index <= min
10145 // number of elements.
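// For example, with an index of -2 a ptrue of VL2 is reversed so that only the
// last two elements are active, and the splice then produces the last two
// elements of the first operand followed by elements of the second.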
10146 std::optional<unsigned> PredPattern;
10147 if (Ty.isScalableVector() && IdxVal < 0 &&
10148 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
10149 std::nullopt) {
10150 SDLoc DL(Op);
10151
10152 // Create a predicate where all but the last -IdxVal elements are false.
10153 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
10154 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
10155 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
10156
10157 // Now splice the two inputs together using the predicate.
10158 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
10159 Op.getOperand(1));
10160 }
10161
10162 // We can select to an EXT instruction when indexing the first 256 bytes.
10163 unsigned BlockSize = AArch64::SVEBitsPerBlock / Ty.getVectorMinNumElements();
10164 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
10165 return Op;
10166
10167 return SDValue();
10168}
10169
10170SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
10171 SelectionDAG &DAG) const {
10172 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
10173 SDValue LHS = Op.getOperand(0);
10174 SDValue RHS = Op.getOperand(1);
10175 SDValue TVal = Op.getOperand(2);
10176 SDValue FVal = Op.getOperand(3);
10177 SDLoc DL(Op);
10178 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10179}
10180
10181SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
10182 SelectionDAG &DAG) const {
10183 SDValue CCVal = Op->getOperand(0);
10184 SDValue TVal = Op->getOperand(1);
10185 SDValue FVal = Op->getOperand(2);
10186 SDLoc DL(Op);
10187
10188 EVT Ty = Op.getValueType();
10189 if (Ty == MVT::aarch64svcount) {
10190 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
10191 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
10192 SDValue Sel =
10193 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
10194 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
10195 }
10196
10197 if (Ty.isScalableVector()) {
10198 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
10199 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
10200 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10201 }
10202
10203 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
10204 // FIXME: Ideally this would be the same as above using i1 types, however
10205 // for the moment we can't deal with fixed i1 vector types properly, so
10206 // instead extend the predicate to a result type sized integer vector.
10207 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
10208 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
10209 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
10210 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
10211 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10212 }
10213
10214 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
10215 // instruction.
10216 if (ISD::isOverflowIntrOpRes(CCVal)) {
10217 // Only lower legal XALUO ops.
10218 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
10219 return SDValue();
10220
10221 AArch64CC::CondCode OFCC;
10222 SDValue Value, Overflow;
10223 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
10224 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
10225
10226 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
10227 CCVal, Overflow);
10228 }
10229
10230 // Lower it the same way as we would lower a SELECT_CC node.
10231 ISD::CondCode CC;
10232 SDValue LHS, RHS;
10233 if (CCVal.getOpcode() == ISD::SETCC) {
10234 LHS = CCVal.getOperand(0);
10235 RHS = CCVal.getOperand(1);
10236 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
10237 } else {
10238 LHS = CCVal;
10239 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
10240 CC = ISD::SETNE;
10241 }
10242
10243 // If we are lowering an f16 and we do not have FullFP16, convert to an f32 in
10244 // order to use FCSELSrrr.
10245 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10246 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10247 DAG.getUNDEF(MVT::f32), TVal);
10248 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10249 DAG.getUNDEF(MVT::f32), FVal);
10250 }
10251
10252 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10253
10254 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10255 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
10256 }
10257
10258 return Res;
10259}
10260
10261SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
10262 SelectionDAG &DAG) const {
10263 // Jump table entries are PC-relative offsets. No additional tweaking
10264 // is necessary here. Just get the address of the jump table.
10265 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
10266
10267 CodeModel::Model CM = getTargetMachine().getCodeModel();
10268 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
10269 !Subtarget->isTargetMachO())
10270 return getAddrLarge(JT, DAG);
10271 if (CM == CodeModel::Tiny)
10272 return getAddrTiny(JT, DAG);
10273 return getAddr(JT, DAG);
10274}
10275
10276SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
10277 SelectionDAG &DAG) const {
10278 // Jump table entries are PC-relative offsets. No additional tweaking
10279 // is necessary here. Just get the address of the jump table.
10280 SDLoc DL(Op);
10281 SDValue JT = Op.getOperand(1);
10282 SDValue Entry = Op.getOperand(2);
10283 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
10284
10285 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10286 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
10287
10288 SDNode *Dest =
10289 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
10290 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
10291 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
10292 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
10293}
10294
10295SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
10296 SelectionDAG &DAG) const {
10297 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
10298 CodeModel::Model CM = getTargetMachine().getCodeModel();
10299 if (CM == CodeModel::Large) {
10300 // Use the GOT for the large code model on iOS.
10301 if (Subtarget->isTargetMachO()) {
10302 return getGOT(CP, DAG);
10303 }
10304 if (!getTargetMachine().isPositionIndependent())
10305 return getAddrLarge(CP, DAG);
10306 } else if (CM == CodeModel::Tiny) {
10307 return getAddrTiny(CP, DAG);
10308 }
10309 return getAddr(CP, DAG);
10310}
10311
10312SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
10313 SelectionDAG &DAG) const {
10314 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
10315 CodeModel::Model CM = getTargetMachine().getCodeModel();
10316 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
10317 if (!getTargetMachine().isPositionIndependent())
10318 return getAddrLarge(BA, DAG);
10319 } else if (CM == CodeModel::Tiny) {
10320 return getAddrTiny(BA, DAG);
10321 }
10322 return getAddr(BA, DAG);
10323}
10324
10325SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
10326 SelectionDAG &DAG) const {
10327 AArch64FunctionInfo *FuncInfo =
10328 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10329
10330 SDLoc DL(Op);
10331 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
10332 getPointerTy(DAG.getDataLayout()));
10333 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
10334 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10335 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10336 MachinePointerInfo(SV));
10337}
10338
10339SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
10340 SelectionDAG &DAG) const {
10341 MachineFunction &MF = DAG.getMachineFunction();
10342 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10343
10344 SDLoc DL(Op);
10345 SDValue FR;
10346 if (Subtarget->isWindowsArm64EC()) {
10347 // With the Arm64EC ABI, we compute the address of the varargs save area
10348 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
10349 // but calls from an entry thunk can pass in a different address.
10350 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
10351 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
10352 uint64_t StackOffset;
10353 if (FuncInfo->getVarArgsGPRSize() > 0)
10354 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
10355 else
10356 StackOffset = FuncInfo->getVarArgsStackOffset();
10357 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
10358 DAG.getConstant(StackOffset, DL, MVT::i64));
10359 } else {
10360 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
10361 ? FuncInfo->getVarArgsGPRIndex()
10362 : FuncInfo->getVarArgsStackIndex(),
10363 getPointerTy(DAG.getDataLayout()));
10364 }
10365 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10366 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10367 MachinePointerInfo(SV));
10368}
10369
10370SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
10371 SelectionDAG &DAG) const {
10372 // The layout of the va_list struct is specified in the AArch64 Procedure Call
10373 // Standard, section B.3.
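// The layout written below is effectively:
//   struct va_list {
//     void *__stack;   // offset 0
//     void *__gr_top;  // offset 8  (4 on ILP32)
//     void *__vr_top;  // offset 16 (8 on ILP32)
//     int   __gr_offs; // offset 24 (12 on ILP32)
//     int   __vr_offs; // offset 28 (16 on ILP32)
//   };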
10374 MachineFunction &MF = DAG.getMachineFunction();
10375 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10376 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10377 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10378 auto PtrVT = getPointerTy(DAG.getDataLayout());
10379 SDLoc DL(Op);
10380
10381 SDValue Chain = Op.getOperand(0);
10382 SDValue VAList = Op.getOperand(1);
10383 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10384 SmallVector<SDValue, 4> MemOps;
10385
10386 // void *__stack at offset 0
10387 unsigned Offset = 0;
10388 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
10389 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
10390 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
10391 MachinePointerInfo(SV), Align(PtrSize)));
10392
10393 // void *__gr_top at offset 8 (4 on ILP32)
10394 Offset += PtrSize;
10395 int GPRSize = FuncInfo->getVarArgsGPRSize();
10396 if (GPRSize > 0) {
10397 SDValue GRTop, GRTopAddr;
10398
10399 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10400 DAG.getConstant(Offset, DL, PtrVT));
10401
10402 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
10403 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
10404 DAG.getConstant(GPRSize, DL, PtrVT));
10405 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
10406
10407 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
10408 MachinePointerInfo(SV, Offset),
10409 Align(PtrSize)));
10410 }
10411
10412 // void *__vr_top at offset 16 (8 on ILP32)
10413 Offset += PtrSize;
10414 int FPRSize = FuncInfo->getVarArgsFPRSize();
10415 if (FPRSize > 0) {
10416 SDValue VRTop, VRTopAddr;
10417 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10418 DAG.getConstant(Offset, DL, PtrVT));
10419
10420 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
10421 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
10422 DAG.getConstant(FPRSize, DL, PtrVT));
10423 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
10424
10425 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
10426 MachinePointerInfo(SV, Offset),
10427 Align(PtrSize)));
10428 }
10429
10430 // int __gr_offs at offset 24 (12 on ILP32)
10431 Offset += PtrSize;
10432 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10433 DAG.getConstant(Offset, DL, PtrVT));
10434 MemOps.push_back(
10435 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
10436 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10437
10438 // int __vr_offs at offset 28 (16 on ILP32)
10439 Offset += 4;
10440 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10441 DAG.getConstant(Offset, DL, PtrVT));
10442 MemOps.push_back(
10443 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
10444 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10445
10446 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
10447}
10448
10449SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
10450 SelectionDAG &DAG) const {
10451 MachineFunction &MF = DAG.getMachineFunction();
10452
10453 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
10454 return LowerWin64_VASTART(Op, DAG);
10455 else if (Subtarget->isTargetDarwin())
10456 return LowerDarwin_VASTART(Op, DAG);
10457 else
10458 return LowerAAPCS_VASTART(Op, DAG);
10459}
10460
10461SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
10462 SelectionDAG &DAG) const {
10463 // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single
10464 // pointer.
10465 SDLoc DL(Op);
10466 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10467 unsigned VaListSize =
10468 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
10469 ? PtrSize
10470 : Subtarget->isTargetILP32() ? 20 : 32;
10471 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
10472 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10473
10474 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
10475 DAG.getConstant(VaListSize, DL, MVT::i32),
10476 Align(PtrSize), false, false, false,
10477 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
10478}
10479
10480SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
10481 assert(Subtarget->isTargetDarwin() &&
10482 "automatic va_arg instruction only works on Darwin");
10483
10484 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10485 EVT VT = Op.getValueType();
10486 SDLoc DL(Op);
10487 SDValue Chain = Op.getOperand(0);
10488 SDValue Addr = Op.getOperand(1);
10489 MaybeAlign Align(Op.getConstantOperandVal(3));
10490 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
10491 auto PtrVT = getPointerTy(DAG.getDataLayout());
10492 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10493 SDValue VAList =
10494 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
10495 Chain = VAList.getValue(1);
10496 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
10497
10498 if (VT.isScalableVector())
10499 report_fatal_error("Passing SVE types to variadic functions is "
10500 "currently not supported");
10501
10502 if (Align && *Align > MinSlotSize) {
10503 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10504 DAG.getConstant(Align->value() - 1, DL, PtrVT));
10505 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
10506 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
10507 }
10508
10509 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
10510 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
10511
10512 // Scalar integer and FP values smaller than 64 bits are implicitly extended
10513 // up to 64 bits. At the very least, we have to increase the striding of the
10514 // vaargs list to match this, and for FP values we need to introduce
10515 // FP_ROUND nodes as well.
10516 if (VT.isInteger() && !VT.isVector())
10517 ArgSize = std::max(ArgSize, MinSlotSize);
10518 bool NeedFPTrunc = false;
10519 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
10520 ArgSize = 8;
10521 NeedFPTrunc = true;
10522 }
10523
10524 // Increment the pointer, VAList, to the next vaarg
10525 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10526 DAG.getConstant(ArgSize, DL, PtrVT));
10527 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
10528
10529 // Store the incremented VAList to the legalized pointer
10530 SDValue APStore =
10531 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
10532
10533 // Load the actual argument out of the pointer VAList
10534 if (NeedFPTrunc) {
10535 // Load the value as an f64.
10536 SDValue WideFP =
10537 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
10538 // Round the value down to an f32.
10539 SDValue NarrowFP =
10540 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
10541 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
10542 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
10543 // Merge the rounded value with the chain output of the load.
10544 return DAG.getMergeValues(Ops, DL);
10545 }
10546
10547 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
10548}
10549
10550SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
10551 SelectionDAG &DAG) const {
10552 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10553 MFI.setFrameAddressIsTaken(true);
10554
10555 EVT VT = Op.getValueType();
10556 SDLoc DL(Op);
10557 unsigned Depth = Op.getConstantOperandVal(0);
10558 SDValue FrameAddr =
10559 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
10560 while (Depth--)
10561 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
10562 MachinePointerInfo());
10563
10564 if (Subtarget->isTargetILP32())
10565 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
10566 DAG.getValueType(VT));
10567
10568 return FrameAddr;
10569}
10570
10571SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
10572 SelectionDAG &DAG) const {
10573 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10574
10575 EVT VT = getPointerTy(DAG.getDataLayout());
10576 SDLoc DL(Op);
10577 int FI = MFI.CreateFixedObject(4, 0, false);
10578 return DAG.getFrameIndex(FI, VT);
10579}
10580
10581#define GET_REGISTER_MATCHER
10582#include "AArch64GenAsmMatcher.inc"
10583
10584// FIXME? Maybe this could be a TableGen attribute on some registers and
10585// this table could be generated automatically from RegInfo.
10586Register AArch64TargetLowering::
10587getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
10588 Register Reg = MatchRegisterName(RegName);
10589 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
10590 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
10591 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
10592 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
10593 !MRI->isReservedReg(MF, Reg))
10594 Reg = 0;
10595 }
10596 if (Reg)
10597 return Reg;
10598 report_fatal_error(Twine("Invalid register name \""
10599 + StringRef(RegName) + "\"."));
10600}
10601
10602SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
10603 SelectionDAG &DAG) const {
10604 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
10605
10606 EVT VT = Op.getValueType();
10607 SDLoc DL(Op);
10608
10609 SDValue FrameAddr =
10610 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
10611 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10612
10613 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
10614}
10615
10616SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
10617 SelectionDAG &DAG) const {
10618 MachineFunction &MF = DAG.getMachineFunction();
10619 MachineFrameInfo &MFI = MF.getFrameInfo();
10620 MFI.setReturnAddressIsTaken(true);
10621
10622 EVT VT = Op.getValueType();
10623 SDLoc DL(Op);
10624 unsigned Depth = Op.getConstantOperandVal(0);
10625 SDValue ReturnAddress;
10626 if (Depth) {
10627 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
10628 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10629 ReturnAddress = DAG.getLoad(
10630 VT, DL, DAG.getEntryNode(),
10631 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
10632 } else {
10633 // Return LR, which contains the return address. Mark it an implicit
10634 // live-in.
10635 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
10636 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
10637 }
10638
10639 // The XPACLRI instruction assembles to a hint-space instruction before
10640 // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A architecture.
10641 // On Armv8.3-A and onwards XPACI is available, so use
10642 // that instead.
10643 SDNode *St;
10644 if (Subtarget->hasPAuth()) {
10645 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
10646 } else {
10647 // XPACLRI operates on LR therefore we must move the operand accordingly.
10648 SDValue Chain =
10649 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
10650 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
10651 }
10652 return SDValue(St, 0);
10653}
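// As a rough illustration (assumed example), __builtin_return_address(0)
// comes through the path above as:
//
//   long caller_pc(void) { return (long)__builtin_return_address(0); }
//
//   // ReturnAddress = CopyFromReg LR
//   // with PAuth:    XPACI ReturnAddress     ; strip PAC bits directly
//   // without PAuth: CopyToReg LR; XPACLRI   ; hint-space encoding, behaves
//   //                                          as a NOP on older cores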
10654
10655 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
10656/// i32 values and take a 2 x i32 value to shift plus a shift amount.
10657SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
10658 SelectionDAG &DAG) const {
10659 SDValue Lo, Hi;
10660 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
10661 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
10662}
10663
10664 bool AArch64TargetLowering::isOffsetFoldingLegal(
10665 const GlobalAddressSDNode *GA) const {
10666 // Offsets are folded in the DAG combine rather than here so that we can
10667 // intelligently choose an offset based on the uses.
10668 return false;
10669}
10670
10671 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
10672 bool OptForSize) const {
10673 bool IsLegal = false;
10674 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
10675 // 16-bit case when target has full fp16 support.
10676 // We encode bf16 bit patterns as if they were fp16. This results in very
10677 // strange looking assembly but should populate the register with appropriate
10678 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
10679 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
10680 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
10681 // FIXME: We should be able to handle f128 as well with a clever lowering.
10682 const APInt ImmInt = Imm.bitcastToAPInt();
10683 if (VT == MVT::f64)
10684 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
10685 else if (VT == MVT::f32)
10686 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
10687 else if (VT == MVT::f16 || VT == MVT::bf16)
10688 IsLegal =
10689 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
10690 Imm.isPosZero();
10691
10692 // If we cannot materialize the value in the fmov immediate field, check if the
10693 // value can be encoded as the immediate operand of a logical instruction.
10694 // The immediate value will be created with either MOVZ, MOVN, or ORR.
10695 // TODO: fmov h0, w0 is also legal; however we don't have an isel pattern to
10696 // generate that fmov.
10697 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
10698 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
10699 // however the mov+fmov sequence is always better because of the reduced
10700 // cache pressure. The timings are still the same if you consider
10701 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
10702 // movw+movk is fused). So we limit to at most 2 instructions.
10703 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
10704 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
10705 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
10706 IsLegal = Insn.size() <= Limit;
10707 }
10708
10709 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
10710 << " imm value: "; Imm.dump(););
10711 return IsLegal;
10712}
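// A few illustrative data points for the checks above (assuming the usual
// IEEE-754 encodings):
//   0.0, 1.0, 0.5, 2.5, 31.0 (f32/f64)  -> valid FMOV imm8, IsLegal = true
//   65536.0f (0x47800000)               -> not an imm8, but a single
//                                          MOVZ #0x4780, lsl #16 plus FMOV,
//                                          so legal under the 2-insn limit
//   1.1 (f64, 0x3FF199999999999A)       -> needs 4 MOV/MOVK steps, so it is
//                                          left to the constant pool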
10713
10714//===----------------------------------------------------------------------===//
10715// AArch64 Optimization Hooks
10716//===----------------------------------------------------------------------===//
10717
10718static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
10719 SDValue Operand, SelectionDAG &DAG,
10720 int &ExtraSteps) {
10721 EVT VT = Operand.getValueType();
10722 if ((ST->hasNEON() &&
10723 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
10724 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
10725 VT == MVT::v4f32)) ||
10726 (ST->hasSVE() &&
10727 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
10728 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
10729 // For the reciprocal estimates, convergence is quadratic, so the number
10730 // of digits is doubled after each iteration. In ARMv8, the accuracy of
10731 // the initial estimate is 2^-8. Thus the number of extra steps to refine
10732 // the result for float (23 mantissa bits) is 2 and for double (52
10733 // mantissa bits) is 3.
10734 constexpr unsigned AccurateBits = 8;
10735 unsigned DesiredBits =
10736 APFloat::semanticsPrecision(DAG.EVTToAPFloatSemantics(VT));
10737 ExtraSteps = DesiredBits <= AccurateBits
10738 ? 0
10739 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
10740 }
10741
10742 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
10743 }
10744
10745 return SDValue();
10746}
10747
10748SDValue
10749AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
10750 const DenormalMode &Mode) const {
10751 SDLoc DL(Op);
10752 EVT VT = Op.getValueType();
10753 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
10754 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
10755 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
10756}
10757
10758SDValue
10759AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
10760 SelectionDAG &DAG) const {
10761 return Op;
10762}
10763
10764SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
10765 SelectionDAG &DAG, int Enabled,
10766 int &ExtraSteps,
10767 bool &UseOneConst,
10768 bool Reciprocal) const {
10769 if (Enabled == ReciprocalEstimate::Enabled ||
10770 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
10771 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
10772 DAG, ExtraSteps)) {
10773 SDLoc DL(Operand);
10774 EVT VT = Operand.getValueType();
10775
10776 SDNodeFlags Flags;
10777 Flags.setAllowReassociation(true);
10778
10779 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
10780 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
10781 for (int i = ExtraSteps; i > 0; --i) {
10782 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
10783 Flags);
10784 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
10785 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10786 }
10787 if (!Reciprocal)
10788 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
10789
10790 ExtraSteps = 0;
10791 return Estimate;
10792 }
10793
10794 return SDValue();
10795}
10796
10797SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
10798 SelectionDAG &DAG, int Enabled,
10799 int &ExtraSteps) const {
10800 if (Enabled == ReciprocalEstimate::Enabled)
10801 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
10802 DAG, ExtraSteps)) {
10803 SDLoc DL(Operand);
10804 EVT VT = Operand.getValueType();
10805
10806 SDNodeFlags Flags;
10807 Flags.setAllowReassociation(true);
10808
10809 // Newton reciprocal iteration: E * (2 - X * E)
10810 // AArch64 reciprocal iteration instruction: (2 - M * N)
10811 for (int i = ExtraSteps; i > 0; --i) {
10812 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
10813 Estimate, Flags);
10814 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10815 }
10816
10817 ExtraSteps = 0;
10818 return Estimate;
10819 }
10820
10821 return SDValue();
10822}
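// Worked example for the step counts chosen in getEstimate() above (a sketch
// based on IEEE-754 precisions): the initial FRECPE/FRSQRTE estimate is good
// to roughly 2^-8 and each Newton step doubles the number of correct bits, so
//   float  (24-bit significand): Log2Ceil(24) - Log2Ceil(8) = 5 - 3 = 2 steps
//   double (53-bit significand): Log2Ceil(53) - Log2Ceil(8) = 6 - 3 = 3 steps
// For 1/sqrt(x) each step refines E as E' = E * 0.5 * (3 - x * E * E), where
// FRSQRTS computes the 0.5 * (3 - M * N) part; for 1/x each step is
// E' = E * (2 - x * E), with FRECPS computing (2 - M * N).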
10823
10824//===----------------------------------------------------------------------===//
10825// AArch64 Inline Assembly Support
10826//===----------------------------------------------------------------------===//
10827
10828// Table of Constraints
10829// TODO: This is the current set of constraints supported by ARM for the
10830 // compiler; not all of them may make sense.
10831//
10832// r - A general register
10833// w - An FP/SIMD register of some size in the range v0-v31
10834// x - An FP/SIMD register of some size in the range v0-v15
10835// I - Constant that can be used with an ADD instruction
10836// J - Constant that can be used with a SUB instruction
10837// K - Constant that can be used with a 32-bit logical instruction
10838// L - Constant that can be used with a 64-bit logical instruction
10839// M - Constant that can be used as a 32-bit MOV immediate
10840// N - Constant that can be used as a 64-bit MOV immediate
10841// Q - A memory reference with base register and no offset
10842// S - A symbolic address
10843// Y - Floating point constant zero
10844// Z - Integer constant zero
10845//
10846// Note that general register operands will be output using their 64-bit x
10847// register name, whatever the size of the variable, unless the asm operand
10848// is prefixed by the %w modifier. Floating-point and SIMD register operands
10849// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
10850// %q modifier.
10851const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
10852 // At this point, we have to lower this constraint to something else, so we
10853 // lower it to an "r" or "w". However, by doing this we will force the result
10854 // to be in register, while the X constraint is much more permissive.
10855 //
10856 // Although we are correct (we are free to emit anything, without
10857 // constraints), we might break use cases that would expect us to be more
10858 // efficient and emit something else.
10859 if (!Subtarget->hasFPARMv8())
10860 return "r";
10861
10862 if (ConstraintVT.isFloatingPoint())
10863 return "w";
10864
10865 if (ConstraintVT.isVector() &&
10866 (ConstraintVT.getSizeInBits() == 64 ||
10867 ConstraintVT.getSizeInBits() == 128))
10868 return "w";
10869
10870 return "r";
10871}
10872
10873 enum class PredicateConstraint { Uph, Upl, Upa };
10874
10875 static std::optional<PredicateConstraint>
10876 parsePredicateConstraint(StringRef Constraint) {
10877 return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
10878 .Case("Uph", PredicateConstraint::Uph)
10879 .Case("Upl", PredicateConstraint::Upl)
10880 .Case("Upa", PredicateConstraint::Upa)
10881 .Default(std::nullopt);
10882}
10883
10884static const TargetRegisterClass *
10885 getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
10886 if (VT != MVT::aarch64svcount &&
10887 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
10888 return nullptr;
10889
10890 switch (Constraint) {
10891 case PredicateConstraint::Uph:
10892 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
10893 : &AArch64::PPR_p8to15RegClass;
10894 case PredicateConstraint::Upl:
10895 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
10896 : &AArch64::PPR_3bRegClass;
10897 case PredicateConstraint::Upa:
10898 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
10899 : &AArch64::PPRRegClass;
10900 }
10901
10902 llvm_unreachable("Missing PredicateConstraint!");
10903}
10904
10905 enum class ReducedGprConstraint { Uci, Ucj };
10906
10907 static std::optional<ReducedGprConstraint>
10908 parseReducedGprConstraint(StringRef Constraint) {
10909 return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
10910 .Case("Uci", ReducedGprConstraint::Uci)
10911 .Case("Ucj", ReducedGprConstraint::Ucj)
10912 .Default(std::nullopt);
10913}
10914
10915static const TargetRegisterClass *
10916 getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
10917 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
10918 return nullptr;
10919
10920 switch (Constraint) {
10921 case ReducedGprConstraint::Uci:
10922 return &AArch64::MatrixIndexGPR32_8_11RegClass;
10923 case ReducedGprConstraint::Ucj:
10924 return &AArch64::MatrixIndexGPR32_12_15RegClass;
10925 }
10926
10927 llvm_unreachable("Missing ReducedGprConstraint!");
10928}
10929
10930// The set of cc code supported is from
10931// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
10932 static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
10933 AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
10934 .Case("{@cchi}", AArch64CC::HI)
10935 .Case("{@cccs}", AArch64CC::HS)
10936 .Case("{@cclo}", AArch64CC::LO)
10937 .Case("{@ccls}", AArch64CC::LS)
10938 .Case("{@cccc}", AArch64CC::LO)
10939 .Case("{@cceq}", AArch64CC::EQ)
10940 .Case("{@ccgt}", AArch64CC::GT)
10941 .Case("{@ccge}", AArch64CC::GE)
10942 .Case("{@cclt}", AArch64CC::LT)
10943 .Case("{@ccle}", AArch64CC::LE)
10944 .Case("{@cchs}", AArch64CC::HS)
10945 .Case("{@ccne}", AArch64CC::NE)
10946 .Case("{@ccvc}", AArch64CC::VC)
10947 .Case("{@ccpl}", AArch64CC::PL)
10948 .Case("{@ccvs}", AArch64CC::VS)
10949 .Case("{@ccmi}", AArch64CC::MI)
10950 .Default(AArch64CC::Invalid);
10951 return Cond;
10952}
10953
10954/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
10955/// WZR, invert(<cond>)'.
10956 static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
10957 SelectionDAG &DAG) {
10958 return DAG.getNode(
10959 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
10960 DAG.getConstant(0, DL, MVT::i32),
10961 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
10962}
10963
10964// Lower @cc flag output via getSETCC.
10965SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
10966 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
10967 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
10968 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
10969 if (Cond == AArch64CC::Invalid)
10970 return SDValue();
10971 // The output variable should be a scalar integer.
10972 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
10973 OpInfo.ConstraintVT.getSizeInBits() < 8)
10974 report_fatal_error("Flag output operand is of invalid type");
10975
10976 // Get NZCV register. Only update chain when copyfrom is glued.
10977 if (Glue.getNode()) {
10978 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
10979 Chain = Glue.getValue(1);
10980 } else
10981 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
10982 // Extract CC code.
10983 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
10984
10985 SDValue Result;
10986
10987 // Truncate or ZERO_EXTEND based on value types.
10988 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
10989 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
10990 else
10991 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
10992
10993 return Result;
10994}
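// Minimal (assumed) user-level example of a flag output operand handled by
// LowerAsmOutputForConstraint above; the "=@cceq" output is materialized by
// the CSINC/CSET sequence built in getSETCC():
//
//   int is_equal(long a, long b) {
//     int eq;
//     asm("cmp %1, %2" : "=@cceq"(eq) : "r"(a), "r"(b));
//     return eq;
//   }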
10995
10996/// getConstraintType - Given a constraint letter, return the type of
10997/// constraint it is for this target.
10998 AArch64TargetLowering::ConstraintType
10999 AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
11000 if (Constraint.size() == 1) {
11001 switch (Constraint[0]) {
11002 default:
11003 break;
11004 case 'x':
11005 case 'w':
11006 case 'y':
11007 return C_RegisterClass;
11008 // An address with a single base register. Due to the way we
11009 // currently handle addresses it is the same as 'r'.
11010 case 'Q':
11011 return C_Memory;
11012 case 'I':
11013 case 'J':
11014 case 'K':
11015 case 'L':
11016 case 'M':
11017 case 'N':
11018 case 'Y':
11019 case 'Z':
11020 return C_Immediate;
11021 case 'z':
11022 case 'S': // A symbol or label reference with a constant offset
11023 return C_Other;
11024 }
11025 } else if (parsePredicateConstraint(Constraint))
11026 return C_RegisterClass;
11027 else if (parseReducedGprConstraint(Constraint))
11028 return C_RegisterClass;
11029 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
11030 return C_Other;
11031 return TargetLowering::getConstraintType(Constraint);
11032}
11033
11034/// Examine constraint type and operand type and determine a weight value.
11035/// This object must already have been set up with the operand type
11036/// and the current alternative constraint selected.
11037 TargetLowering::ConstraintWeight
11038 AArch64TargetLowering::getSingleConstraintMatchWeight(
11039 AsmOperandInfo &info, const char *constraint) const {
11040 ConstraintWeight weight = CW_Invalid;
11041 Value *CallOperandVal = info.CallOperandVal;
11042 // If we don't have a value, we can't do a match,
11043 // but allow it at the lowest weight.
11044 if (!CallOperandVal)
11045 return CW_Default;
11046 Type *type = CallOperandVal->getType();
11047 // Look at the constraint type.
11048 switch (*constraint) {
11049 default:
11050 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
11051 break;
11052 case 'x':
11053 case 'w':
11054 case 'y':
11055 if (type->isFloatingPointTy() || type->isVectorTy())
11056 weight = CW_Register;
11057 break;
11058 case 'z':
11059 weight = CW_Constant;
11060 break;
11061 case 'U':
11062 if (parsePredicateConstraint(constraint) ||
11063 parseReducedGprConstraint(constraint))
11064 weight = CW_Register;
11065 break;
11066 }
11067 return weight;
11068}
11069
11070std::pair<unsigned, const TargetRegisterClass *>
11071AArch64TargetLowering::getRegForInlineAsmConstraint(
11072 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
11073 if (Constraint.size() == 1) {
11074 switch (Constraint[0]) {
11075 case 'r':
11076 if (VT.isScalableVector())
11077 return std::make_pair(0U, nullptr);
11078 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
11079 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
11080 if (VT.getFixedSizeInBits() == 64)
11081 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
11082 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
11083 case 'w': {
11084 if (!Subtarget->hasFPARMv8())
11085 break;
11086 if (VT.isScalableVector()) {
11087 if (VT.getVectorElementType() != MVT::i1)
11088 return std::make_pair(0U, &AArch64::ZPRRegClass);
11089 return std::make_pair(0U, nullptr);
11090 }
11091 uint64_t VTSize = VT.getFixedSizeInBits();
11092 if (VTSize == 16)
11093 return std::make_pair(0U, &AArch64::FPR16RegClass);
11094 if (VTSize == 32)
11095 return std::make_pair(0U, &AArch64::FPR32RegClass);
11096 if (VTSize == 64)
11097 return std::make_pair(0U, &AArch64::FPR64RegClass);
11098 if (VTSize == 128)
11099 return std::make_pair(0U, &AArch64::FPR128RegClass);
11100 break;
11101 }
11102 // The instructions that this constraint is designed for can
11103 // only take 128-bit registers so just use that regclass.
11104 case 'x':
11105 if (!Subtarget->hasFPARMv8())
11106 break;
11107 if (VT.isScalableVector())
11108 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
11109 if (VT.getSizeInBits() == 128)
11110 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
11111 break;
11112 case 'y':
11113 if (!Subtarget->hasFPARMv8())
11114 break;
11115 if (VT.isScalableVector())
11116 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
11117 break;
11118 }
11119 } else {
11120 if (const auto PC = parsePredicateConstraint(Constraint))
11121 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
11122 return std::make_pair(0U, RegClass);
11123
11124 if (const auto RGC = parseReducedGprConstraint(Constraint))
11125 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
11126 return std::make_pair(0U, RegClass);
11127 }
11128 if (StringRef("{cc}").equals_insensitive(Constraint) ||
11129 parseConstraintCode(Constraint) != AArch64CC::Invalid)
11130 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
11131
11132 if (Constraint == "{za}") {
11133 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
11134 }
11135
11136 if (Constraint == "{zt0}") {
11137 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
11138 }
11139
11140 // Use the default implementation in TargetLowering to convert the register
11141 // constraint into a member of a register class.
11142 std::pair<unsigned, const TargetRegisterClass *> Res;
11143 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
11144
11145 // Not found as a standard register?
11146 if (!Res.second) {
11147 unsigned Size = Constraint.size();
11148 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
11149 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
11150 int RegNo;
11151 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
11152 if (!Failed && RegNo >= 0 && RegNo <= 31) {
11153 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
11154 // By default we'll emit v0-v31 for this unless there's a modifier where
11155 // we'll emit the correct register as well.
11156 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
11157 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
11158 Res.second = &AArch64::FPR64RegClass;
11159 } else {
11160 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
11161 Res.second = &AArch64::FPR128RegClass;
11162 }
11163 }
11164 }
11165 }
11166
11167 if (Res.second && !Subtarget->hasFPARMv8() &&
11168 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
11169 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
11170 return std::make_pair(0U, nullptr);
11171
11172 return Res;
11173}
11174
11175 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
11176 llvm::Type *Ty,
11177 bool AllowUnknown) const {
11178 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
11179 return EVT(MVT::i64x8);
11180
11181 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
11182}
11183
11184/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
11185/// vector. If it is invalid, don't add anything to Ops.
11186void AArch64TargetLowering::LowerAsmOperandForConstraint(
11187 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
11188 SelectionDAG &DAG) const {
11189 SDValue Result;
11190
11191 // Currently only support length 1 constraints.
11192 if (Constraint.size() != 1)
11193 return;
11194
11195 char ConstraintLetter = Constraint[0];
11196 switch (ConstraintLetter) {
11197 default:
11198 break;
11199
11200 // This set of constraints deals with valid constants for various instructions.
11201 // Validate and return a target constant for them if we can.
11202 case 'z': {
11203 // 'z' maps to xzr or wzr so it needs an input of 0.
11204 if (!isNullConstant(Op))
11205 return;
11206
11207 if (Op.getValueType() == MVT::i64)
11208 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
11209 else
11210 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
11211 break;
11212 }
11213 case 'S':
11214 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
11215 // supported for PIC while "s" isn't, making "s" less useful. We implement
11216 // "S" but not "s".
11217 TargetLowering::LowerAsmOperandForConstraint(Op, "s", Ops, DAG);
11218 break;
11219
11220 case 'I':
11221 case 'J':
11222 case 'K':
11223 case 'L':
11224 case 'M':
11225 case 'N':
11226 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
11227 if (!C)
11228 return;
11229
11230 // Grab the value and do some validation.
11231 uint64_t CVal = C->getZExtValue();
11232 switch (ConstraintLetter) {
11233 // The I constraint applies only to simple ADD or SUB immediate operands:
11234 // i.e. 0 to 4095 with optional shift by 12
11235 // The J constraint applies only to ADD or SUB immediates that would be
11236 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
11237 // instruction [or vice versa], in other words -1 to -4095 with optional
11238 // left shift by 12.
11239 case 'I':
11240 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
11241 break;
11242 return;
11243 case 'J': {
11244 uint64_t NVal = -C->getSExtValue();
11245 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
11246 CVal = C->getSExtValue();
11247 break;
11248 }
11249 return;
11250 }
11251 // The K and L constraints apply *only* to logical immediates, including
11252 // what used to be the MOVI alias for ORR (though the MOVI alias has now
11253 // been removed and MOV should be used). So these constraints have to
11254 // distinguish between bit patterns that are valid 32-bit or 64-bit
11255 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
11256 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
11257 // versa.
11258 case 'K':
11259 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11260 break;
11261 return;
11262 case 'L':
11263 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11264 break;
11265 return;
11266 // The M and N constraints are a superset of K and L respectively, for use
11267 // with the MOV (immediate) alias. As well as the logical immediates they
11268 // also match 32 or 64-bit immediates that can be loaded either using a
11269 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
11270 // (M) or 64-bit 0x1234000000000000 (N) etc.
11271 // As a note some of this code is liberally stolen from the asm parser.
11272 case 'M': {
11273 if (!isUInt<32>(CVal))
11274 return;
11275 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11276 break;
11277 if ((CVal & 0xFFFF) == CVal)
11278 break;
11279 if ((CVal & 0xFFFF0000ULL) == CVal)
11280 break;
11281 uint64_t NCVal = ~(uint32_t)CVal;
11282 if ((NCVal & 0xFFFFULL) == NCVal)
11283 break;
11284 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11285 break;
11286 return;
11287 }
11288 case 'N': {
11289 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11290 break;
11291 if ((CVal & 0xFFFFULL) == CVal)
11292 break;
11293 if ((CVal & 0xFFFF0000ULL) == CVal)
11294 break;
11295 if ((CVal & 0xFFFF00000000ULL) == CVal)
11296 break;
11297 if ((CVal & 0xFFFF000000000000ULL) == CVal)
11298 break;
11299 uint64_t NCVal = ~CVal;
11300 if ((NCVal & 0xFFFFULL) == NCVal)
11301 break;
11302 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11303 break;
11304 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
11305 break;
11306 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
11307 break;
11308 return;
11309 }
11310 default:
11311 return;
11312 }
11313
11314 // All assembler immediates are 64-bit integers.
11315 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
11316 break;
11317 }
11318
11319 if (Result.getNode()) {
11320 Ops.push_back(Result);
11321 return;
11322 }
11323
11324 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
11325}
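// Assumed user-level illustrations of the immediate constraints validated
// above (the listed values pass the corresponding checks):
//
//   'I'  add/sub immediate:        asm("add %0, %1, %2" : "=r"(r) : "r"(x), "I"(4095));
//   'K'  32-bit logical immediate: 0xaaaaaaaa         (valid bimm32)
//   'L'  64-bit logical immediate: 0xaaaaaaaaaaaaaaaa (valid bimm64)
//   'M'  32-bit MOV immediate:     0x12340000         (single MOVZ)
//   'N'  64-bit MOV immediate:     0x1234000000000000 (single MOVZ)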
11326
11327//===----------------------------------------------------------------------===//
11328// AArch64 Advanced SIMD Support
11329//===----------------------------------------------------------------------===//
11330
11331/// WidenVector - Given a value in the V64 register class, produce the
11332/// equivalent value in the V128 register class.
11333 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
11334 EVT VT = V64Reg.getValueType();
11335 unsigned NarrowSize = VT.getVectorNumElements();
11336 MVT EltTy = VT.getVectorElementType().getSimpleVT();
11337 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
11338 SDLoc DL(V64Reg);
11339
11340 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
11341 V64Reg, DAG.getConstant(0, DL, MVT::i64));
11342}
11343
11344/// getExtFactor - Determine the adjustment factor for the position when
11345/// generating an "extract from vector registers" instruction.
11346static unsigned getExtFactor(SDValue &V) {
11347 EVT EltType = V.getValueType().getVectorElementType();
11348 return EltType.getSizeInBits() / 8;
11349}
11350
11351// Check if a vector is built from one vector via extracted elements of
11352// another together with an AND mask, ensuring that all elements fit
11353// within range. This can be reconstructed using AND and NEON's TBL1.
11354 static SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
11355 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11356 SDLoc dl(Op);
11357 EVT VT = Op.getValueType();
11358 assert(!VT.isScalableVector() &&
11359 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11360
11361 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
11362 // directly to TBL1.
11363 if (VT != MVT::v16i8 && VT != MVT::v8i8)
11364 return SDValue();
11365
11366 unsigned NumElts = VT.getVectorNumElements();
11367 assert((NumElts == 8 || NumElts == 16) &&
11368 "Need to have exactly 8 or 16 elements in vector.");
11369
11370 SDValue SourceVec;
11371 SDValue MaskSourceVec;
11372 SmallVector<SDValue, 16> AndMaskConstants;
11373
11374 for (unsigned i = 0; i < NumElts; ++i) {
11375 SDValue V = Op.getOperand(i);
11376 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11377 return SDValue();
11378
11379 SDValue OperandSourceVec = V.getOperand(0);
11380 if (!SourceVec)
11381 SourceVec = OperandSourceVec;
11382 else if (SourceVec != OperandSourceVec)
11383 return SDValue();
11384
11385 // This only looks at shuffles with elements that are
11386 // a) truncated by a constant AND mask extracted from a mask vector, or
11387 // b) extracted directly from a mask vector.
11388 SDValue MaskSource = V.getOperand(1);
11389 if (MaskSource.getOpcode() == ISD::AND) {
11390 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
11391 return SDValue();
11392
11393 AndMaskConstants.push_back(MaskSource.getOperand(1));
11394 MaskSource = MaskSource->getOperand(0);
11395 } else if (!AndMaskConstants.empty()) {
11396 // Either all or no operands should have an AND mask.
11397 return SDValue();
11398 }
11399
11400 // An ANY_EXTEND may be inserted between the AND and the source vector
11401 // extraction. We don't care about that, so we can just skip it.
11402 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
11403 MaskSource = MaskSource.getOperand(0);
11404
11405 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11406 return SDValue();
11407
11408 SDValue MaskIdx = MaskSource.getOperand(1);
11409 if (!isa<ConstantSDNode>(MaskIdx) ||
11410 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
11411 return SDValue();
11412
11413 // We only apply this if all elements come from the same vector with the
11414 // same vector type.
11415 if (!MaskSourceVec) {
11416 MaskSourceVec = MaskSource->getOperand(0);
11417 if (MaskSourceVec.getValueType() != VT)
11418 return SDValue();
11419 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
11420 return SDValue();
11421 }
11422 }
11423
11424 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
11425 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
11426 // insert, we know that the index in the mask must be smaller than the number
11427 // of elements in the source, or we would have an out-of-bounds access.
11428 if (NumElts == 8)
11429 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
11430 DAG.getUNDEF(VT));
11431
11432 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
11433 if (!AndMaskConstants.empty())
11434 MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
11435 DAG.getBuildVector(VT, dl, AndMaskConstants));
11436
11437 return DAG.getNode(
11438 ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
11439 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
11440 MaskSourceVec);
11441}
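// Sketch of the pattern the routine above recognizes (hypothetical IR shape,
// indices abbreviated): a v16i8 build_vector whose lane i is
//   extractelement(Src, and(extractelement(MaskVec, i), 15))
// for i = 0..15 is rebuilt as roughly
//   %m = and <16 x i8> %MaskVec, splat(i8 15)
//   %r = call <16 x i8> @llvm.aarch64.neon.tbl1(<16 x i8> %Src, <16 x i8> %m)
// i.e. one AND plus a single TBL1 instead of 16 scalar extracts and inserts.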
11442
11443// Gather data to see if the operation can be modelled as a
11444// shuffle in combination with VEXTs.
11445 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
11446 SelectionDAG &DAG) const {
11447 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11448 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
11449 SDLoc dl(Op);
11450 EVT VT = Op.getValueType();
11451 assert(!VT.isScalableVector() &&
11452 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11453 unsigned NumElts = VT.getVectorNumElements();
11454
11455 struct ShuffleSourceInfo {
11456 SDValue Vec;
11457 unsigned MinElt;
11458 unsigned MaxElt;
11459
11460 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
11461 // be compatible with the shuffle we intend to construct. As a result
11462 // ShuffleVec will be some sliding window into the original Vec.
11463 SDValue ShuffleVec;
11464
11465 // Code should guarantee that element i in Vec starts at element "WindowBase
11466 // + i * WindowScale in ShuffleVec".
11467 int WindowBase;
11468 int WindowScale;
11469
11470 ShuffleSourceInfo(SDValue Vec)
11471 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
11472 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
11473
11474 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
11475 };
11476
11477 // First gather all vectors used as an immediate source for this BUILD_VECTOR
11478 // node.
11479 SmallVector<ShuffleSourceInfo, 2> Sources;
11480 for (unsigned i = 0; i < NumElts; ++i) {
11481 SDValue V = Op.getOperand(i);
11482 if (V.isUndef())
11483 continue;
11484 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11485 !isa<ConstantSDNode>(V.getOperand(1)) ||
11486 V.getOperand(0).getValueType().isScalableVector()) {
11487 LLVM_DEBUG(
11488 dbgs() << "Reshuffle failed: "
11489 "a shuffle can only come from building a vector from "
11490 "various elements of other fixed-width vectors, provided "
11491 "their indices are constant\n");
11492 return SDValue();
11493 }
11494
11495 // Add this element source to the list if it's not already there.
11496 SDValue SourceVec = V.getOperand(0);
11497 auto Source = find(Sources, SourceVec);
11498 if (Source == Sources.end())
11499 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
11500
11501 // Update the minimum and maximum lane number seen.
11502 unsigned EltNo = V.getConstantOperandVal(1);
11503 Source->MinElt = std::min(Source->MinElt, EltNo);
11504 Source->MaxElt = std::max(Source->MaxElt, EltNo);
11505 }
11506
11507 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
11508 // better than moving to/from gpr registers for larger vectors.
11509 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
11510 // Construct a mask for the tbl. We may need to adjust the index for types
11511 // larger than i8.
11512 SmallVector<unsigned, 16> Mask;
11513 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
11514 for (unsigned I = 0; I < NumElts; ++I) {
11515 SDValue V = Op.getOperand(I);
11516 if (V.isUndef()) {
11517 for (unsigned OF = 0; OF < OutputFactor; OF++)
11518 Mask.push_back(-1);
11519 continue;
11520 }
11521 // Set the Mask lanes adjusted for the size of the input and output
11522 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
11523 // output element, adjusted in their positions per input and output types.
11524 unsigned Lane = V.getConstantOperandVal(1);
11525 for (unsigned S = 0; S < Sources.size(); S++) {
11526 if (V.getOperand(0) == Sources[S].Vec) {
11527 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
11528 unsigned InputBase = 16 * S + Lane * InputSize / 8;
11529 for (unsigned OF = 0; OF < OutputFactor; OF++)
11530 Mask.push_back(InputBase + OF);
11531 break;
11532 }
11533 }
11534 }
11535
11536 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
11537 // v16i8, and the TBLMask
11538 SmallVector<SDValue, 16> TBLOperands;
11539 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
11540 ? Intrinsic::aarch64_neon_tbl3
11541 : Intrinsic::aarch64_neon_tbl4,
11542 dl, MVT::i32));
11543 for (unsigned i = 0; i < Sources.size(); i++) {
11544 SDValue Src = Sources[i].Vec;
11545 EVT SrcVT = Src.getValueType();
11546 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
11547 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
11548 "Expected a legally typed vector");
11549 if (SrcVT.is64BitVector())
11550 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
11551 DAG.getUNDEF(MVT::v8i8));
11552 TBLOperands.push_back(Src);
11553 }
11554
11555 SmallVector<SDValue, 16> TBLMask;
11556 for (unsigned i = 0; i < Mask.size(); i++)
11557 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
11558 assert((Mask.size() == 8 || Mask.size() == 16) &&
11559 "Expected a v8i8 or v16i8 Mask");
11560 TBLOperands.push_back(
11561 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
11562
11563 SDValue Shuffle =
11564 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
11565 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
11566 return DAG.getBitcast(VT, Shuffle);
11567 }
11568
11569 if (Sources.size() > 2) {
11570 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
11571 << "sensible when at most two source vectors are "
11572 << "involved\n");
11573 return SDValue();
11574 }
11575
11576 // Find out the smallest element size among result and two sources, and use
11577 // it as element size to build the shuffle_vector.
11578 EVT SmallestEltTy = VT.getVectorElementType();
11579 for (auto &Source : Sources) {
11580 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
11581 if (SrcEltTy.bitsLT(SmallestEltTy)) {
11582 SmallestEltTy = SrcEltTy;
11583 }
11584 }
11585 unsigned ResMultiplier =
11586 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11587 uint64_t VTSize = VT.getFixedSizeInBits();
11588 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
11589 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
11590
11591 // If the source vector is too wide or too narrow, we may nevertheless be able
11592 // to construct a compatible shuffle either by concatenating it with UNDEF or
11593 // extracting a suitable range of elements.
11594 for (auto &Src : Sources) {
11595 EVT SrcVT = Src.ShuffleVec.getValueType();
11596
11597 TypeSize SrcVTSize = SrcVT.getSizeInBits();
11598 if (SrcVTSize == TypeSize::getFixed(VTSize))
11599 continue;
11600
11601 // This stage of the search produces a source with the same element type as
11602 // the original, but with a total width matching the BUILD_VECTOR output.
11603 EVT EltVT = SrcVT.getVectorElementType();
11604 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
11605 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
11606
11607 if (SrcVTSize.getFixedValue() < VTSize) {
11608 assert(2 * SrcVTSize == VTSize);
11609 // We can pad out the smaller vector for free, so if it's part of a
11610 // shuffle...
11611 Src.ShuffleVec =
11612 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
11613 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
11614 continue;
11615 }
11616
11617 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
11618 LLVM_DEBUG(
11619 dbgs() << "Reshuffle failed: result vector too small to extract\n");
11620 return SDValue();
11621 }
11622
11623 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
11624 LLVM_DEBUG(
11625 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
11626 return SDValue();
11627 }
11628
11629 if (Src.MinElt >= NumSrcElts) {
11630 // The extraction can just take the second half
11631 Src.ShuffleVec =
11632 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11633 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11634 Src.WindowBase = -NumSrcElts;
11635 } else if (Src.MaxElt < NumSrcElts) {
11636 // The extraction can just take the first half
11637 Src.ShuffleVec =
11638 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11639 DAG.getConstant(0, dl, MVT::i64));
11640 } else {
11641 // An actual VEXT is needed
11642 SDValue VEXTSrc1 =
11643 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11644 DAG.getConstant(0, dl, MVT::i64));
11645 SDValue VEXTSrc2 =
11646 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11647 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11648 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
11649
11650 if (!SrcVT.is64BitVector()) {
11651 LLVM_DEBUG(
11652 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
11653 "for SVE vectors.");
11654 return SDValue();
11655 }
11656
11657 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
11658 VEXTSrc2,
11659 DAG.getConstant(Imm, dl, MVT::i32));
11660 Src.WindowBase = -Src.MinElt;
11661 }
11662 }
11663
11664 // Another possible incompatibility occurs from the vector element types. We
11665 // can fix this by bitcasting the source vectors to the same type we intend
11666 // for the shuffle.
11667 for (auto &Src : Sources) {
11668 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
11669 if (SrcEltTy == SmallestEltTy)
11670 continue;
11671 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
11672 if (DAG.getDataLayout().isBigEndian()) {
11673 Src.ShuffleVec =
11674 DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
11675 } else {
11676 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
11677 }
11678 Src.WindowScale =
11679 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11680 Src.WindowBase *= Src.WindowScale;
11681 }
11682
11683 // Final check before we try to actually produce a shuffle.
11684 LLVM_DEBUG(for (auto Src
11685 : Sources)
11686 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
11687
11688 // The stars all align, our next step is to produce the mask for the shuffle.
11689 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
11690 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
11691 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
11692 SDValue Entry = Op.getOperand(i);
11693 if (Entry.isUndef())
11694 continue;
11695
11696 auto Src = find(Sources, Entry.getOperand(0));
11697 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
11698
11699 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
11700 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
11701 // segment.
11702 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
11703 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
11704 VT.getScalarSizeInBits());
11705 int LanesDefined = BitsDefined / BitsPerShuffleLane;
11706
11707 // This source is expected to fill ResMultiplier lanes of the final shuffle,
11708 // starting at the appropriate offset.
11709 int *LaneMask = &Mask[i * ResMultiplier];
11710
11711 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
11712 ExtractBase += NumElts * (Src - Sources.begin());
11713 for (int j = 0; j < LanesDefined; ++j)
11714 LaneMask[j] = ExtractBase + j;
11715 }
11716
11717 // Final check before we try to produce nonsense...
11718 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
11719 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
11720 return SDValue();
11721 }
11722
11723 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
11724 for (unsigned i = 0; i < Sources.size(); ++i)
11725 ShuffleOps[i] = Sources[i].ShuffleVec;
11726
11727 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
11728 ShuffleOps[1], Mask);
11729 SDValue V;
11730 if (DAG.getDataLayout().isBigEndian()) {
11731 V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
11732 } else {
11733 V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
11734 }
11735
11736 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
11737 dbgs() << "Reshuffle, creating node: "; V.dump(););
11738
11739 return V;
11740}
11741
11742// check if an EXT instruction can handle the shuffle mask when the
11743// vector sources of the shuffle are the same.
11744static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
11745 unsigned NumElts = VT.getVectorNumElements();
11746
11747 // Assume that the first shuffle index is not UNDEF. Fail if it is.
11748 if (M[0] < 0)
11749 return false;
11750
11751 Imm = M[0];
11752
11753 // If this is a VEXT shuffle, the immediate value is the index of the first
11754 // element. The other shuffle indices must be the successive elements after
11755 // the first one.
11756 unsigned ExpectedElt = Imm;
11757 for (unsigned i = 1; i < NumElts; ++i) {
11758 // Increment the expected index. If it wraps around, just follow it
11759 // back to index zero and keep going.
11760 ++ExpectedElt;
11761 if (ExpectedElt == NumElts)
11762 ExpectedElt = 0;
11763
11764 if (M[i] < 0)
11765 continue; // ignore UNDEF indices
11766 if (ExpectedElt != static_cast<unsigned>(M[i]))
11767 return false;
11768 }
11769
11770 return true;
11771}
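// Example (illustrative): a v8i8 single-source shuffle with mask
// <1,2,3,4,5,6,7,0> has consecutive indices modulo 8 starting at 1, so the
// routine sets Imm = 1 and the shuffle can be emitted as
//   ext v0.8b, v1.8b, v1.8b, #1
// (the byte immediate is Imm scaled by getExtFactor, which is 1 for i8 lanes).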
11772
11773// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
11774// v4i32s. This is really a truncate, which we can construct out of (legal)
11775// concats and truncate nodes.
11776 static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
11777 if (V.getValueType() != MVT::v16i8)
11778 return SDValue();
11779 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
11780
11781 for (unsigned X = 0; X < 4; X++) {
11782 // Check the first item in each group is an extract from lane 0 of a v4i32
11783 // or v4i16.
11784 SDValue BaseExt = V.getOperand(X * 4);
11785 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11786 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
11787 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
11788 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
11789 BaseExt.getConstantOperandVal(1) != 0)
11790 return SDValue();
11791 SDValue Base = BaseExt.getOperand(0);
11792 // And check the other items are extracts from the same vector.
11793 for (unsigned Y = 1; Y < 4; Y++) {
11794 SDValue Ext = V.getOperand(X * 4 + Y);
11795 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11796 Ext.getOperand(0) != Base ||
11797 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
11798 Ext.getConstantOperandVal(1) != Y)
11799 return SDValue();
11800 }
11801 }
11802
11803 // Turn the buildvector into a series of truncates and concats, which will
11804 // become uzp1 instructions. Any v4i32s we found get truncated to v4i16,
11805 // which are concatenated together to produce two v8i16 values. These are
11806 // both truncated and concatenated together.
11807 SDLoc DL(V);
11808 SDValue Trunc[4] = {
11809 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
11810 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
11811 for (SDValue &V : Trunc)
11812 if (V.getValueType() == MVT::v4i32)
11813 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
11814 SDValue Concat0 =
11815 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
11816 SDValue Concat1 =
11817 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
11818 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
11819 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
11820 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
11821}
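// For example (a sketch): a v16i8 build_vector taking lanes 0..3 of four
// v4i32 sources a, b, c, d in the order <a0..a3, b0..b3, c0..c3, d0..d3>
// becomes
//   t0 = concat(trunc(a), trunc(b))   ; two v4i16 -> v8i16
//   t1 = concat(trunc(c), trunc(d))   ; two v4i16 -> v8i16
//   r  = concat(trunc(t0), trunc(t1)) ; two v8i8  -> v16i8
// and the truncate-of-concat steps typically select to UZP1/XTN instructions.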
11822
11823 /// Check if a vector shuffle corresponds to a DUP instruction with a larger
11824 /// element width than the vector lane type. If that is the case the function
11825 /// returns true and writes the value of the DUP instruction lane operand into
11826 /// DupLaneOp.
11827static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
11828 unsigned &DupLaneOp) {
11829 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
11830 "Only possible block sizes for wide DUP are: 16, 32, 64");
11831
11832 if (BlockSize <= VT.getScalarSizeInBits())
11833 return false;
11834 if (BlockSize % VT.getScalarSizeInBits() != 0)
11835 return false;
11836 if (VT.getSizeInBits() % BlockSize != 0)
11837 return false;
11838
11839 size_t SingleVecNumElements = VT.getVectorNumElements();
11840 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
11841 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
11842
11843 // We are looking for masks like
11844 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
11845 // might be replaced by 'undefined'. BlockIndices will eventually contain
11846 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
11847 // for the above examples)
11848 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
11849 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
11850 for (size_t I = 0; I < NumEltsPerBlock; I++) {
11851 int Elt = M[BlockIndex * NumEltsPerBlock + I];
11852 if (Elt < 0)
11853 continue;
11854 // For now we don't support shuffles that use the second operand
11855 if ((unsigned)Elt >= SingleVecNumElements)
11856 return false;
11857 if (BlockElts[I] < 0)
11858 BlockElts[I] = Elt;
11859 else if (BlockElts[I] != Elt)
11860 return false;
11861 }
11862
11863 // We found a candidate block (possibly with some undefs). It must be a
11864 // sequence of consecutive integers starting with a value divisible by
11865 // NumEltsPerBlock with some values possibly replaced by undef-s.
11866
11867 // Find first non-undef element
11868 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
11869 assert(FirstRealEltIter != BlockElts.end() &&
11870 "Shuffle with all-undefs must have been caught by previous cases, "
11871 "e.g. isSplat()");
11872 if (FirstRealEltIter == BlockElts.end()) {
11873 DupLaneOp = 0;
11874 return true;
11875 }
11876
11877 // Index of FirstRealElt in BlockElts
11878 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
11879
11880 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
11881 return false;
11882 // BlockElts[0] must have the following value if it isn't undef:
11883 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
11884
11885 // Check the first element
11886 if (Elt0 % NumEltsPerBlock != 0)
11887 return false;
11888 // Check that the sequence indeed consists of consecutive integers (modulo
11889 // undefs)
11890 for (size_t I = 0; I < NumEltsPerBlock; I++)
11891 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
11892 return false;
11893
11894 DupLaneOp = Elt0 / NumEltsPerBlock;
11895 return true;
11896}
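// Illustrative example: on v8i16 with BlockSize = 64, the mask
// <4,5,6,7,4,5,6,7> (or with some lanes undef, e.g. <4,-1,6,7,-1,5,6,7>)
// duplicates the second 64-bit block, so DupLaneOp is set to 1 and the
// shuffle can be emitted as a DUP of lane 1 after reinterpreting the input
// as v2i64.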
11897
11898// check if an EXT instruction can handle the shuffle mask when the
11899// vector sources of the shuffle are different.
11900static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
11901 unsigned &Imm) {
11902 // Look for the first non-undef element.
11903 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
11904
11905 // Benefit from APInt to handle overflow when calculating expected element.
11906 unsigned NumElts = VT.getVectorNumElements();
11907 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
11908 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
11909 // The following shuffle indices must be the successive elements after the
11910 // first real element.
11911 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
11912 return Elt != ExpectedElt++ && Elt != -1;
11913 });
11914 if (FoundWrongElt)
11915 return false;
11916
11917 // The index of an EXT is the first element if it is not UNDEF.
11918 // Watch out for the beginning UNDEFs. The EXT index should be the expected
11919 // value of the first element. E.g.
11920 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
11921 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
11922 // ExpectedElt is the last mask index plus 1.
11923 Imm = ExpectedElt.getZExtValue();
11924
11925 // There are two different cases that require reversing the input vectors.
11926 // For example, for vector <4 x i32> we have the following cases,
11927 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
11928 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
11929 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
11930 // to reverse two input vectors.
11931 if (Imm < NumElts)
11932 ReverseEXT = true;
11933 else
11934 Imm -= NumElts;
11935
11936 return true;
11937}
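// Example (illustrative): for two v8i8 inputs and mask <1,2,3,4,5,6,7,8>, the
// indices are consecutive, ExpectedElt ends at 9, and Imm becomes
// 9 - NumElts = 1 with ReverseEXT left false, i.e.
//   ext v0.8b, v1.8b, v2.8b, #1
// For masks with leading undefs such as <-1,-1,7,0> on v4i32, Imm ends up
// below NumElts, so ReverseEXT is set and the caller swaps the two inputs
// before emitting the EXT.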
11938
11939/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
11940/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11941/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
11942static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11943 unsigned NumElts = VT.getVectorNumElements();
11944 if (NumElts % 2 != 0)
11945 return false;
11946 WhichResult = (M[0] == 0 ? 0 : 1);
11947 unsigned Idx = WhichResult * NumElts / 2;
11948 for (unsigned i = 0; i != NumElts; i += 2) {
11949 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
11950 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
11951 return false;
11952 Idx += 1;
11953 }
11954
11955 return true;
11956}
11957
11958/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
11959/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11960 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
11961static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11962 unsigned Half = VT.getVectorNumElements() / 2;
11963 WhichResult = (M[0] == 0 ? 0 : 1);
11964 for (unsigned j = 0; j != 2; ++j) {
11965 unsigned Idx = WhichResult;
11966 for (unsigned i = 0; i != Half; ++i) {
11967 int MIdx = M[i + j * Half];
11968 if (MIdx >= 0 && (unsigned)MIdx != Idx)
11969 return false;
11970 Idx += 2;
11971 }
11972 }
11973
11974 return true;
11975}
11976
11977/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
11978/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11979/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
11980static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11981 unsigned NumElts = VT.getVectorNumElements();
11982 if (NumElts % 2 != 0)
11983 return false;
11984 WhichResult = (M[0] == 0 ? 0 : 1);
11985 for (unsigned i = 0; i < NumElts; i += 2) {
11986 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11987 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
11988 return false;
11989 }
11990 return true;
11991}
11992
11993static bool isINSMask(ArrayRef<int> M, int NumInputElements,
11994 bool &DstIsLeft, int &Anomaly) {
11995 if (M.size() != static_cast<size_t>(NumInputElements))
11996 return false;
11997
11998 int NumLHSMatch = 0, NumRHSMatch = 0;
11999 int LastLHSMismatch = -1, LastRHSMismatch = -1;
12000
12001 for (int i = 0; i < NumInputElements; ++i) {
12002 if (M[i] == -1) {
12003 ++NumLHSMatch;
12004 ++NumRHSMatch;
12005 continue;
12006 }
12007
12008 if (M[i] == i)
12009 ++NumLHSMatch;
12010 else
12011 LastLHSMismatch = i;
12012
12013 if (M[i] == i + NumInputElements)
12014 ++NumRHSMatch;
12015 else
12016 LastRHSMismatch = i;
12017 }
12018
12019 if (NumLHSMatch == NumInputElements - 1) {
12020 DstIsLeft = true;
12021 Anomaly = LastLHSMismatch;
12022 return true;
12023 } else if (NumRHSMatch == NumInputElements - 1) {
12024 DstIsLeft = false;
12025 Anomaly = LastRHSMismatch;
12026 return true;
12027 }
12028
12029 return false;
12030}
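// Example (illustrative): with NumInputElements = 4 and mask <0,1,6,3>, every
// lane except lane 2 comes from the LHS, so the routine returns DstIsLeft =
// true and Anomaly = 2; the caller can then emit a single INS that moves
// element 6 - 4 = 2 of the RHS into lane 2 of the LHS.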
12031
12032static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
12033 if (VT.getSizeInBits() != 128)
12034 return false;
12035
12036 unsigned NumElts = VT.getVectorNumElements();
12037
12038 for (int I = 0, E = NumElts / 2; I != E; I++) {
12039 if (Mask[I] != I)
12040 return false;
12041 }
12042
12043 int Offset = NumElts / 2;
12044 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
12045 if (Mask[I] != I + SplitLHS * Offset)
12046 return false;
12047 }
12048
12049 return true;
12050}
12051
12052 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
12053 SDLoc DL(Op);
12054 EVT VT = Op.getValueType();
12055 SDValue V0 = Op.getOperand(0);
12056 SDValue V1 = Op.getOperand(1);
12057 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12058
12058
12059 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
12060 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
12061 return SDValue();
12062
12063 bool SplitV0 = V0.getValueSizeInBits() == 128;
12064
12065 if (!isConcatMask(Mask, VT, SplitV0))
12066 return SDValue();
12067
12068 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
12069 if (SplitV0) {
12070 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
12071 DAG.getConstant(0, DL, MVT::i64));
12072 }
12073 if (V1.getValueSizeInBits() == 128) {
12074 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
12075 DAG.getConstant(0, DL, MVT::i64));
12076 }
12077 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
12078}
12079
12080/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
12081/// the specified operations to build the shuffle. ID is the perfect-shuffle
12082/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
12083/// table entry and LHS/RHS are the immediate inputs for this stage of the
12084/// shuffle.
12085static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
12086 SDValue V2, unsigned PFEntry, SDValue LHS,
12087 SDValue RHS, SelectionDAG &DAG,
12088 const SDLoc &dl) {
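// A perfect-shuffle table entry packs the operation to perform in bits
// [29:26] and two 13-bit operand IDs in bits [25:13] and [12:0]. The IDs are
// base-9 encodings of four result lanes (digit 8 meaning undef), except that
// for OP_MOVLANE the RHS field instead holds the destination lane, as handled
// below.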
12089 unsigned OpNum = (PFEntry >> 26) & 0x0F;
12090 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
12091 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
12092
12093 enum {
12094 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
12095 OP_VREV,
12096 OP_VDUP0,
12097 OP_VDUP1,
12098 OP_VDUP2,
12099 OP_VDUP3,
12100 OP_VEXT1,
12101 OP_VEXT2,
12102 OP_VEXT3,
12103 OP_VUZPL, // VUZP, left result
12104 OP_VUZPR, // VUZP, right result
12105 OP_VZIPL, // VZIP, left result
12106 OP_VZIPR, // VZIP, right result
12107 OP_VTRNL, // VTRN, left result
12108 OP_VTRNR, // VTRN, right result
12109 OP_MOVLANE // Move lane. RHSID is the lane to move into
12110 };
12111
12112 if (OpNum == OP_COPY) {
12113 if (LHSID == (1 * 9 + 2) * 9 + 3)
12114 return LHS;
12115 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
12116 return RHS;
12117 }
12118
12119 if (OpNum == OP_MOVLANE) {
12120 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
12121 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
12122 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
12123 Elt = 3 - Elt;
12124 while (Elt > 0) {
12125 ID /= 9;
12126 Elt--;
12127 }
12128 return (ID % 9 == 8) ? -1 : ID % 9;
12129 };
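// For example, for ID == ((1*9 + 0)*9 + 4)*9 + 5 (the lane list <1,0,4,5>),
// getPFIDLane(ID, 0) returns 1 and getPFIDLane(ID, 2) returns 4.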
12130
12131 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
12132 // get the lane to move from the PFID, which is always from the
12133 // original vectors (V1 or V2).
12134 SDValue OpLHS = GeneratePerfectShuffle(
12135 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
12136 EVT VT = OpLHS.getValueType();
12137 assert(RHSID < 8 && "Expected a lane index for RHSID!");
12138 unsigned ExtLane = 0;
12139 SDValue Input;
12140
12141 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
12142 // convert into a higher type.
12143 if (RHSID & 0x4) {
12144 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
12145 if (MaskElt == -1)
12146 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
12147 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12148 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
12149 Input = MaskElt < 2 ? V1 : V2;
12150 if (VT.getScalarSizeInBits() == 16) {
12151 Input = DAG.getBitcast(MVT::v2f32, Input);
12152 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
12153 } else {
12154 assert(VT.getScalarSizeInBits() == 32 &&
12155 "Expected 16 or 32 bit shuffle elements");
12156 Input = DAG.getBitcast(MVT::v2f64, Input);
12157 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
12158 }
12159 } else {
12160 int MaskElt = getPFIDLane(ID, RHSID);
12161 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12162 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
12163 Input = MaskElt < 4 ? V1 : V2;
12164 // Be careful about creating illegal types. Use f16 instead of i16.
12165 if (VT == MVT::v4i16) {
12166 Input = DAG.getBitcast(MVT::v4f16, Input);
12167 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
12168 }
12169 }
12170 SDValue Ext = DAG.getNode(
12171 ISD::EXTRACT_VECTOR_ELT, dl, Input.getValueType().getVectorElementType(),
12172 Input, DAG.getVectorIdxConstant(ExtLane, dl));
12173 SDValue Ins =
12174 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
12175 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
12176 return DAG.getBitcast(VT, Ins);
12177 }
12178
12179 SDValue OpLHS, OpRHS;
12180 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
12181 RHS, DAG, dl);
12182 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
12183 RHS, DAG, dl);
12184 EVT VT = OpLHS.getValueType();
12185
12186 switch (OpNum) {
12187 default:
12188 llvm_unreachable("Unknown shuffle opcode!");
12189 case OP_VREV:
12190 // VREV divides the vector in half and swaps within the half.
12191 if (VT.getVectorElementType() == MVT::i32 ||
12192 VT.getVectorElementType() == MVT::f32)
12193 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
12194 // vrev <4 x i16> -> REV32
12195 if (VT.getVectorElementType() == MVT::i16 ||
12196 VT.getVectorElementType() == MVT::f16 ||
12197 VT.getVectorElementType() == MVT::bf16)
12198 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
12199 // vrev <4 x i8> -> REV16
12200 assert(VT.getVectorElementType() == MVT::i8);
12201 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
12202 case OP_VDUP0:
12203 case OP_VDUP1:
12204 case OP_VDUP2:
12205 case OP_VDUP3: {
12206 EVT EltTy = VT.getVectorElementType();
12207 unsigned Opcode;
12208 if (EltTy == MVT::i8)
12209 Opcode = AArch64ISD::DUPLANE8;
12210 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
12211 Opcode = AArch64ISD::DUPLANE16;
12212 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
12213 Opcode = AArch64ISD::DUPLANE32;
12214 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
12215 Opcode = AArch64ISD::DUPLANE64;
12216 else
12217 llvm_unreachable("Invalid vector element type?");
12218
12219 if (VT.getSizeInBits() == 64)
12220 OpLHS = WidenVector(OpLHS, DAG);
12221 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
12222 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
12223 }
12224 case OP_VEXT1:
12225 case OP_VEXT2:
12226 case OP_VEXT3: {
12227 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
12228 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
12229 DAG.getConstant(Imm, dl, MVT::i32));
12230 }
12231 case OP_VUZPL:
12232 return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
12233 case OP_VUZPR:
12234 return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
12235 case OP_VZIPL:
12236 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
12237 case OP_VZIPR:
12238 return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
12239 case OP_VTRNL:
12240 return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
12241 case OP_VTRNR:
12242 return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
12243 }
12244}
12245
12246static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
12247 SelectionDAG &DAG) {
12248 // Check to see if we can use the TBL instruction.
12249 SDValue V1 = Op.getOperand(0);
12250 SDValue V2 = Op.getOperand(1);
12251 SDLoc DL(Op);
12252
12253 EVT EltVT = Op.getValueType().getVectorElementType();
12254 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
12255
12256 bool Swap = false;
12257 if (V1.isUndef() || isZerosVector(V1.getNode())) {
12258 std::swap(V1, V2);
12259 Swap = true;
12260 }
12261
12262 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
12263 // out of range values with 0s. We do need to make sure that any out-of-range
12264 // values are really out-of-range for a v16i8 vector.
12265 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
12266 MVT IndexVT = MVT::v8i8;
12267 unsigned IndexLen = 8;
12268 if (Op.getValueSizeInBits() == 128) {
12269 IndexVT = MVT::v16i8;
12270 IndexLen = 16;
12271 }
12272
12273 SmallVector<SDValue, 8> TBLMask;
12274 for (int Val : ShuffleMask) {
12275 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
12276 unsigned Offset = Byte + Val * BytesPerElt;
12277 if (Swap)
12278 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
12279 if (IsUndefOrZero && Offset >= IndexLen)
12280 Offset = 255;
12281 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
12282 }
12283 }
12284
12285 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
12286 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
12287
12288 SDValue Shuffle;
12289 if (IsUndefOrZero) {
12290 if (IndexLen == 8)
12291 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
12292 Shuffle = DAG.getNode(
12293 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12294 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12295 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12296 } else {
12297 if (IndexLen == 8) {
12298 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
12299 Shuffle = DAG.getNode(
12300 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12301 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12302 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12303 } else {
12304 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
12305 // cannot currently represent the register constraints on the input
12306 // table registers.
12307 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
12308 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
12309 // IndexLen));
12310 Shuffle = DAG.getNode(
12311 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12312 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
12313 V2Cst,
12314 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12315 }
12316 }
12317 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
12318}
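// For example, for a 64-bit v4i16 shuffle each mask value expands to two byte
// indices, so mask value 5 (element 1 of the second source) contributes bytes
// 10 and 11 of the concatenated table; when the second source is undef/zero,
// those out-of-range indices (>= 8) are rewritten to 255 so that TBL1
// zero-fills them.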
12319
12320static unsigned getDUPLANEOp(EVT EltType) {
12321 if (EltType == MVT::i8)
12322 return AArch64ISD::DUPLANE8;
12323 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
12324 return AArch64ISD::DUPLANE16;
12325 if (EltType == MVT::i32 || EltType == MVT::f32)
12326 return AArch64ISD::DUPLANE32;
12327 if (EltType == MVT::i64 || EltType == MVT::f64)
12328 return AArch64ISD::DUPLANE64;
12329
12330 llvm_unreachable("Invalid vector element type?");
12331}
12332
12333static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
12334 unsigned Opcode, SelectionDAG &DAG) {
12335 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
12336 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
12337 // Match: dup (bitcast (extract_subv X, C)), LaneC
12338 if (BitCast.getOpcode() != ISD::BITCAST ||
12339 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
12340 return false;
12341
12342 // The extract index must align in the destination type. That may not
12343 // happen if the bitcast is from narrow to wide type.
12344 SDValue Extract = BitCast.getOperand(0);
12345 unsigned ExtIdx = Extract.getConstantOperandVal(1);
12346 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
12347 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
12348 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
12349 if (ExtIdxInBits % CastedEltBitWidth != 0)
12350 return false;
12351
12352 // Can't handle cases where vector size is not 128-bit
12353 if (!Extract.getOperand(0).getValueType().is128BitVector())
12354 return false;
12355
12356 // Update the lane value by offsetting with the scaled extract index.
12357 LaneC += ExtIdxInBits / CastedEltBitWidth;
12358
12359 // Determine the casted vector type of the wide vector input.
12360 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
12361 // Examples:
12362 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
12363 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
12364 unsigned SrcVecNumElts =
12365 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
12366 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
12367 SrcVecNumElts);
12368 return true;
12369 };
12370 MVT CastVT;
12371 if (getScaledOffsetDup(V, Lane, CastVT)) {
12372 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
12373 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12374 V.getOperand(0).getValueType().is128BitVector()) {
12375 // The lane is incremented by the index of the extract.
12376 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
12377 Lane += V.getConstantOperandVal(1);
12378 V = V.getOperand(0);
12379 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
12380 // The lane is decremented if we are splatting from the 2nd operand.
12381 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
12382 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
12383 Lane -= Idx * VT.getVectorNumElements() / 2;
12384 V = WidenVector(V.getOperand(Idx), DAG);
12385 } else if (VT.getSizeInBits() == 64) {
12386 // Widen the operand to 128-bit register with undef.
12387 V = WidenVector(V, DAG);
12388 }
12389 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
12390}
12391
12392// Return true if we can derive a new (half-length) shuffle mask from the
12393// parameter mask array, by checking that every pair of adjacent mask values
12394// is consecutive and starts at an even index.
12395static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
12396 SmallVectorImpl<int> &NewMask) {
12397 unsigned NumElts = VT.getVectorNumElements();
12398 if (NumElts % 2 != 0)
12399 return false;
12400
12401 NewMask.clear();
12402 for (unsigned i = 0; i < NumElts; i += 2) {
12403 int M0 = M[i];
12404 int M1 = M[i + 1];
12405
12406 // If both elements are undef, new mask is undef too.
12407 if (M0 == -1 && M1 == -1) {
12408 NewMask.push_back(-1);
12409 continue;
12410 }
12411
12412 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
12413 NewMask.push_back(M1 / 2);
12414 continue;
12415 }
12416
12417 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
12418 NewMask.push_back(M0 / 2);
12419 continue;
12420 }
12421
12422 NewMask.clear();
12423 return false;
12424 }
12425
12426 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
12427 return true;
12428}
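// For example, for a v4i32 shuffle the mask <2, 3, 0, 1> yields the
// half-length mask <1, 0>, and <0, 1, -1, -1> yields <0, -1>.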
12429
12430// Try to widen element type to get a new mask value for a better permutation
12431// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
12432// UZP1/2, TRN1/2, REV, INS, etc.
12433// For example:
12434// shufflevector <4 x i32> %a, <4 x i32> %b,
12435// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
12436// is equivalent to:
12437// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
12438// Finally, we can get:
12439// mov v0.d[0], v1.d[1]
12440static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
12441 SDLoc DL(Op);
12442 EVT VT = Op.getValueType();
12443 EVT ScalarVT = VT.getVectorElementType();
12444 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
12445 SDValue V0 = Op.getOperand(0);
12446 SDValue V1 = Op.getOperand(1);
12447 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12448
12449 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
12450 // We need to make sure the wider element type is legal. Thus, ElementSize
12451 // should be not larger than 32 bits, and i1 type should also be excluded.
12452 if (ElementSize > 32 || ElementSize == 1)
12453 return SDValue();
12454
12455 SmallVector<int, 8> NewMask;
12456 if (isWideTypeMask(Mask, VT, NewMask)) {
12457 MVT NewEltVT = VT.isFloatingPoint()
12458 ? MVT::getFloatingPointVT(ElementSize * 2)
12459 : MVT::getIntegerVT(ElementSize * 2);
12460 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
12461 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
12462 V0 = DAG.getBitcast(NewVT, V0);
12463 V1 = DAG.getBitcast(NewVT, V1);
12464 return DAG.getBitcast(VT,
12465 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
12466 }
12467 }
12468
12469 return SDValue();
12470}
12471
12472// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
12473static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
12474 ArrayRef<int> ShuffleMask,
12475 SelectionDAG &DAG) {
12476 SDValue Tbl1 = Op->getOperand(0);
12477 SDValue Tbl2 = Op->getOperand(1);
12478 SDLoc dl(Op);
12479 SDValue Tbl2ID =
12480 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
12481
12482 EVT VT = Op.getValueType();
12483 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12484 Tbl1->getOperand(0) != Tbl2ID ||
12485 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12486 Tbl2->getOperand(0) != Tbl2ID)
12487 return SDValue();
12488
12489 if (Tbl1->getValueType(0) != MVT::v16i8 ||
12490 Tbl2->getValueType(0) != MVT::v16i8)
12491 return SDValue();
12492
12493 SDValue Mask1 = Tbl1->getOperand(3);
12494 SDValue Mask2 = Tbl2->getOperand(3);
12495 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
12496 for (unsigned I = 0; I < 16; I++) {
12497 if (ShuffleMask[I] < 16)
12498 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
12499 else {
12500 auto *C =
12501 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
12502 if (!C)
12503 return SDValue();
12504 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
12505 }
12506 }
12507
12508 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
12509 SDValue ID =
12510 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
12511
12512 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
12513 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
12514 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
12515}
12516
12517// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
12518// but we don't have an appropriate instruction,
12519// so custom-lower it as ZIP1-with-zeros.
12520SDValue
12521AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
12522 SelectionDAG &DAG) const {
12523 SDLoc dl(Op);
12524 EVT VT = Op.getValueType();
12525 SDValue SrcOp = Op.getOperand(0);
12526 EVT SrcVT = SrcOp.getValueType();
12527 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
12528 "Unexpected extension factor.");
12529 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
12530 // FIXME: support multi-step zipping?
12531 if (Scale != 2)
12532 return SDValue();
12533 SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
12534 return DAG.getBitcast(VT,
12535 DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
12536}
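// For example, zero-extending the low two i32 lanes of a v4i32 source into a
// v2i64 result becomes ZIP1 (src, zeros) in v4i32; interleaving each source
// element with a zero lane means the little-endian v2i64 view of the result
// is exactly the zero-extended values.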
12537
12538SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
12539 SelectionDAG &DAG) const {
12540 SDLoc dl(Op);
12541 EVT VT = Op.getValueType();
12542
12543 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
12544
12545 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12546 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
12547
12548 // Convert shuffles that are directly supported on NEON to target-specific
12549 // DAG nodes, instead of keeping them as shuffles and matching them again
12550 // during code selection. This is more efficient and avoids the possibility
12551 // of inconsistencies between legalization and selection.
12552 ArrayRef<int> ShuffleMask = SVN->getMask();
12553
12554 SDValue V1 = Op.getOperand(0);
12555 SDValue V2 = Op.getOperand(1);
12556
12557 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
12558 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
12559 "Unexpected VECTOR_SHUFFLE mask size!");
12560
12561 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
12562 return Res;
12563
12564 if (SVN->isSplat()) {
12565 int Lane = SVN->getSplatIndex();
12566 // If this is undef splat, generate it via "just" vdup, if possible.
12567 if (Lane == -1)
12568 Lane = 0;
12569
12570 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
12571 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
12572 V1.getOperand(0));
12573 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
12574 // constant. If so, we can just reference the lane's definition directly.
12575 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
12576 !isa<ConstantSDNode>(V1.getOperand(Lane)))
12577 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
12578
12579 // Otherwise, duplicate from the lane of the input vector.
12580 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
12581 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
12582 }
12583
12584 // Check if the mask matches a DUP for a wider element
12585 for (unsigned LaneSize : {64U, 32U, 16U}) {
12586 unsigned Lane = 0;
12587 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
12588 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
12589 : LaneSize == 32 ? AArch64ISD::DUPLANE32
12590 : AArch64ISD::DUPLANE16;
12591 // Cast V1 to an integer vector with required lane size
12592 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
12593 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
12594 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
12595 V1 = DAG.getBitcast(NewVecTy, V1);
12596 // Construct the DUP instruction
12597 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
12598 // Cast back to the original type
12599 return DAG.getBitcast(VT, V1);
12600 }
12601 }
12602
12603 unsigned NumElts = VT.getVectorNumElements();
12604 unsigned EltSize = VT.getScalarSizeInBits();
12605 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
12606 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
12607 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
12608 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
12609 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
12610 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
12611
12612 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
12613 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
12614 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
12615 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
12616 DAG.getConstant(8, dl, MVT::i32));
12617 }
12618
12619 bool ReverseEXT = false;
12620 unsigned Imm;
12621 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
12622 if (ReverseEXT)
12623 std::swap(V1, V2);
12624 Imm *= getExtFactor(V1);
12625 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
12626 DAG.getConstant(Imm, dl, MVT::i32));
12627 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
12628 Imm *= getExtFactor(V1);
12629 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
12630 DAG.getConstant(Imm, dl, MVT::i32));
12631 }
12632
12633 unsigned WhichResult;
12634 if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
12635 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12636 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12637 }
12638 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
12639 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12640 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12641 }
12642 if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
12643 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12644 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12645 }
12646
12647 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12648 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12649 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12650 }
12651 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12652 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12653 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12654 }
12655 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12656 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12657 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12658 }
12659
12660 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
12661 return Concat;
12662
12663 bool DstIsLeft;
12664 int Anomaly;
12665 int NumInputElements = V1.getValueType().getVectorNumElements();
12666 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
12667 SDValue DstVec = DstIsLeft ? V1 : V2;
12668 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
12669
12670 SDValue SrcVec = V1;
12671 int SrcLane = ShuffleMask[Anomaly];
12672 if (SrcLane >= NumInputElements) {
12673 SrcVec = V2;
12674 SrcLane -= NumElts;
12675 }
12676 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
12677
12678 EVT ScalarVT = VT.getVectorElementType();
12679
12680 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
12681 ScalarVT = MVT::i32;
12682
12683 return DAG.getNode(
12684 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
12685 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
12686 DstLaneV);
12687 }
12688
12689 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
12690 return NewSD;
12691
12692 // If the shuffle is not directly supported and it has 4 elements, use
12693 // the PerfectShuffle-generated table to synthesize it from other shuffles.
12694 if (NumElts == 4) {
12695 unsigned PFIndexes[4];
12696 for (unsigned i = 0; i != 4; ++i) {
12697 if (ShuffleMask[i] < 0)
12698 PFIndexes[i] = 8;
12699 else
12700 PFIndexes[i] = ShuffleMask[i];
12701 }
12702
12703 // Compute the index in the perfect shuffle table.
12704 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
12705 PFIndexes[2] * 9 + PFIndexes[3];
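// For example, the mask <0, 2, undef, 3> maps to index
// 0*729 + 2*81 + 8*9 + 3 == 237.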
12706 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
12707 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
12708 dl);
12709 }
12710
12711 return GenerateTBL(Op, ShuffleMask, DAG);
12712}
12713
12714SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
12715 SelectionDAG &DAG) const {
12716 EVT VT = Op.getValueType();
12717
12718 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12719 return LowerToScalableOp(Op, DAG);
12720
12721 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
12722 "Unexpected vector type!");
12723
12724 // We can handle the constant cases during isel.
12725 if (isa<ConstantSDNode>(Op.getOperand(0)))
12726 return Op;
12727
12728 // There isn't a natural way to handle the general i1 case, so we use some
12729 // trickery with whilelo.
12730 SDLoc DL(Op);
12731 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
12732 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
12733 DAG.getValueType(MVT::i1));
12734 SDValue ID =
12735 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
12736 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12737 if (VT == MVT::nxv1i1)
12738 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
12739 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
12740 Zero, SplatVal),
12741 Zero);
12742 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
12743}
12744
12745SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
12746 SelectionDAG &DAG) const {
12747 SDLoc DL(Op);
12748
12749 EVT VT = Op.getValueType();
12750 if (!isTypeLegal(VT) || !VT.isScalableVector())
12751 return SDValue();
12752
12753 // Current lowering only supports the SVE-ACLE types.
12754 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
12755 return SDValue();
12756
12757 // The DUPQ operation is independent of element type, so normalise to i64s.
12758 SDValue Idx128 = Op.getOperand(2);
12759
12760 // DUPQ can be used when idx is in range.
12761 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
12762 if (CIdx && (CIdx->getZExtValue() <= 3)) {
12763 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
12764 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
12765 }
12766
12767 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
12768
12769 // The ACLE says this must produce the same result as:
12770 // svtbl(data, svadd_x(svptrue_b64(),
12771 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
12772 // index * 2))
12773 SDValue One = DAG.getConstant(1, DL, MVT::i64);
12774 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
12775
12776 // create the vector 0,1,0,1,...
12777 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
12778 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
12779
12780 // create the vector idx64,idx64+1,idx64,idx64+1,...
12781 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
12782 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
12783 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
12784
12785 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
12786 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
12787 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
12788}
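// For example, if the index operand evaluates to 1 at run time, the computed
// index vector is <2, 3, 2, 3, ...>, so the TBL repeatedly selects 64-bit
// elements 2 and 3, i.e. the second 128-bit quadword of the input.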
12789
12790
12791static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
12792 APInt &UndefBits) {
12793 EVT VT = BVN->getValueType(0);
12794 APInt SplatBits, SplatUndef;
12795 unsigned SplatBitSize;
12796 bool HasAnyUndefs;
12797 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
12798 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
12799
12800 for (unsigned i = 0; i < NumSplats; ++i) {
12801 CnstBits <<= SplatBitSize;
12802 UndefBits <<= SplatBitSize;
12803 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
12804 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
12805 }
12806
12807 return true;
12808 }
12809
12810 return false;
12811}
12812
12813// Try 64-bit splatted SIMD immediate.
12814static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12815 const APInt &Bits) {
12816 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12817 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12818 EVT VT = Op.getValueType();
12819 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
12820
12821 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
12822 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
12823
12824 SDLoc dl(Op);
12825 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12826 DAG.getConstant(Value, dl, MVT::i32));
12827 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12828 }
12829 }
12830
12831 return SDValue();
12832}
12833
12834// Try 32-bit splatted SIMD immediate.
12835static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12836 const APInt &Bits,
12837 const SDValue *LHS = nullptr) {
12838 EVT VT = Op.getValueType();
12839 if (VT.isFixedLengthVector() &&
12840 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12841 return SDValue();
12842
12843 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12844 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12845 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12846 bool isAdvSIMDModImm = false;
12847 uint64_t Shift;
12848
12849 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
12850 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
12851 Shift = 0;
12852 }
12853 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
12854 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
12855 Shift = 8;
12856 }
12857 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
12858 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
12859 Shift = 16;
12860 }
12861 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
12862 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
12863 Shift = 24;
12864 }
12865
12866 if (isAdvSIMDModImm) {
12867 SDLoc dl(Op);
12868 SDValue Mov;
12869
12870 if (LHS)
12871 Mov = DAG.getNode(NewOp, dl, MovTy,
12872 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12873 DAG.getConstant(Value, dl, MVT::i32),
12874 DAG.getConstant(Shift, dl, MVT::i32));
12875 else
12876 Mov = DAG.getNode(NewOp, dl, MovTy,
12877 DAG.getConstant(Value, dl, MVT::i32),
12878 DAG.getConstant(Shift, dl, MVT::i32));
12879
12880 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12881 }
12882 }
12883
12884 return SDValue();
12885}
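// The four type checks above correspond to a 32-bit splat whose single
// non-zero byte is byte 0, 1, 2 or 3 of each 32-bit element, hence the LSL
// amounts of 0, 8, 16 and 24 passed along with the encoded immediate.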
12886
12887// Try 16-bit splatted SIMD immediate.
12888static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12889 const APInt &Bits,
12890 const SDValue *LHS = nullptr) {
12891 EVT VT = Op.getValueType();
12892 if (VT.isFixedLengthVector() &&
12893 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12894 return SDValue();
12895
12896 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12897 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12898 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
12899 bool isAdvSIMDModImm = false;
12900 uint64_t Shift;
12901
12902 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
12903 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
12904 Shift = 0;
12905 }
12906 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
12907 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
12908 Shift = 8;
12909 }
12910
12911 if (isAdvSIMDModImm) {
12912 SDLoc dl(Op);
12913 SDValue Mov;
12914
12915 if (LHS)
12916 Mov = DAG.getNode(NewOp, dl, MovTy,
12917 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12918 DAG.getConstant(Value, dl, MVT::i32),
12919 DAG.getConstant(Shift, dl, MVT::i32));
12920 else
12921 Mov = DAG.getNode(NewOp, dl, MovTy,
12922 DAG.getConstant(Value, dl, MVT::i32),
12923 DAG.getConstant(Shift, dl, MVT::i32));
12924
12925 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12926 }
12927 }
12928
12929 return SDValue();
12930}
12931
12932// Try 32-bit splatted SIMD immediate with shifted ones.
12933static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
12934 SelectionDAG &DAG, const APInt &Bits) {
12935 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12936 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12937 EVT VT = Op.getValueType();
12938 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12939 bool isAdvSIMDModImm = false;
12940 uint64_t Shift;
12941
12942 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
12943 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
12944 Shift = 264;
12945 }
12946 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
12947 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
12948 Shift = 272;
12949 }
12950
12951 if (isAdvSIMDModImm) {
12952 SDLoc dl(Op);
12953 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12954 DAG.getConstant(Value, dl, MVT::i32),
12955 DAG.getConstant(Shift, dl, MVT::i32));
12956 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12957 }
12958 }
12959
12960 return SDValue();
12961}
12962
12963// Try 8-bit splatted SIMD immediate.
12964static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12965 const APInt &Bits) {
12966 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12967 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12968 EVT VT = Op.getValueType();
12969 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
12970
12971 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
12972 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
12973
12974 SDLoc dl(Op);
12975 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12976 DAG.getConstant(Value, dl, MVT::i32));
12977 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12978 }
12979 }
12980
12981 return SDValue();
12982}
12983
12984// Try FP splatted SIMD immediate.
12985static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12986 const APInt &Bits) {
12987 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12988 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12989 EVT VT = Op.getValueType();
12990 bool isWide = (VT.getSizeInBits() == 128);
12991 MVT MovTy;
12992 bool isAdvSIMDModImm = false;
12993
12994 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
12995 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
12996 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
12997 }
12998 else if (isWide &&
12999 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
13000 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
13001 MovTy = MVT::v2f64;
13002 }
13003
13004 if (isAdvSIMDModImm) {
13005 SDLoc dl(Op);
13006 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
13007 DAG.getConstant(Value, dl, MVT::i32));
13008 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13009 }
13010 }
13011
13012 return SDValue();
13013}
13014
13015// Specialized code to quickly find if PotentialBVec is a BuildVector that
13016// consists of only the same constant int value, returned in reference arg
13017// ConstVal
13018static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
13019 uint64_t &ConstVal) {
13020 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
13021 if (!Bvec)
13022 return false;
13023 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
13024 if (!FirstElt)
13025 return false;
13026 EVT VT = Bvec->getValueType(0);
13027 unsigned NumElts = VT.getVectorNumElements();
13028 for (unsigned i = 1; i < NumElts; ++i)
13029 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
13030 return false;
13031 ConstVal = FirstElt->getZExtValue();
13032 return true;
13033}
13034
13035static bool isAllInactivePredicate(SDValue N) {
13036 // Look through cast.
13037 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
13038 N = N.getOperand(0);
13039
13040 return ISD::isConstantSplatVectorAllZeros(N.getNode());
13041}
13042
13043static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
13044 unsigned NumElts = N.getValueType().getVectorMinNumElements();
13045
13046 // Look through cast.
13047 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
13048 N = N.getOperand(0);
13049 // When reinterpreting from a type with fewer elements the "new" elements
13050 // are not active, so bail if they're likely to be used.
13051 if (N.getValueType().getVectorMinNumElements() < NumElts)
13052 return false;
13053 }
13054
13055 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
13056 return true;
13057
13058 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
13059 // or smaller than the implicit element type represented by N.
13060 // NOTE: A larger element count implies a smaller element type.
13061 if (N.getOpcode() == AArch64ISD::PTRUE &&
13062 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
13063 return N.getValueType().getVectorMinNumElements() >= NumElts;
13064
13065 // If we're compiling for a specific vector-length, we can check if the
13066 // pattern's VL equals that of the scalable vector at runtime.
13067 if (N.getOpcode() == AArch64ISD::PTRUE) {
13068 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
13069 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
13070 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
13071 if (MaxSVESize && MinSVESize == MaxSVESize) {
13072 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
13073 unsigned PatNumElts =
13074 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
13075 return PatNumElts == (NumElts * VScale);
13076 }
13077 }
13078
13079 return false;
13080}
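// For example, when compiling with a fixed 256-bit SVE vector length
// (vscale == 2), "ptrue p0.s, vl8" is all active for an nxv4i1 use, since the
// pattern produces 8 == 4 * 2 active elements.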
13081
13082// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
13083// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
13084// BUILD_VECTORs with constant element C1, C2 is a constant, and:
13085// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
13086// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
13087// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
13088static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
13089 EVT VT = N->getValueType(0);
13090
13091 if (!VT.isVector())
13092 return SDValue();
13093
13094 SDLoc DL(N);
13095
13096 SDValue And;
13097 SDValue Shift;
13098
13099 SDValue FirstOp = N->getOperand(0);
13100 unsigned FirstOpc = FirstOp.getOpcode();
13101 SDValue SecondOp = N->getOperand(1);
13102 unsigned SecondOpc = SecondOp.getOpcode();
13103
13104 // Is one of the operands an AND or a BICi? The AND may have been optimised to
13105 // a BICi in order to use an immediate instead of a register.
13106 // Is the other operand a shl or lshr? This will have been turned into:
13107 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
13108 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
13109 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
13110 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
13111 SecondOpc == AArch64ISD::SHL_PRED ||
13112 SecondOpc == AArch64ISD::SRL_PRED)) {
13113 And = FirstOp;
13114 Shift = SecondOp;
13115
13116 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
13117 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
13118 FirstOpc == AArch64ISD::SHL_PRED ||
13119 FirstOpc == AArch64ISD::SRL_PRED)) {
13120 And = SecondOp;
13121 Shift = FirstOp;
13122 } else
13123 return SDValue();
13124
13125 bool IsAnd = And.getOpcode() == ISD::AND;
13126 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
13127 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13128 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
13129 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13130
13131 // Is the shift amount constant and are all lanes active?
13132 uint64_t C2;
13133 if (ShiftHasPredOp) {
13134 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
13135 return SDValue();
13136 APInt C;
13137 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
13138 return SDValue();
13139 C2 = C.getZExtValue();
13140 } else if (ConstantSDNode *C2node =
13141 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
13142 C2 = C2node->getZExtValue();
13143 else
13144 return SDValue();
13145
13146 APInt C1AsAPInt;
13147 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
13148 if (IsAnd) {
13149 // Is the and mask vector all constant?
13150 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
13151 return SDValue();
13152 } else {
13153 // Reconstruct the corresponding AND immediate from the two BICi immediates.
13154 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
13155 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
13156 assert(C1nodeImm && C1nodeShift);
13157 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
13158 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
13159 }
13160
13161 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
13162 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
13163 // how much one can shift elements of a particular size?
13164 if (C2 > ElemSizeInBits)
13165 return SDValue();
13166
13167 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
13168 : APInt::getLowBitsSet(ElemSizeInBits, C2);
13169 if (C1AsAPInt != RequiredC1)
13170 return SDValue();
13171
13172 SDValue X = And.getOperand(0);
13173 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
13174 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
13175 : Shift.getOperand(1);
13176
13177 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
13178 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
13179
13180 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
13181 LLVM_DEBUG(N->dump(&DAG));
13182 LLVM_DEBUG(dbgs() << "into: \n");
13183 LLVM_DEBUG(ResultSLI->dump(&DAG));
13184
13185 ++NumShiftInserts;
13186 return ResultSLI;
13187}
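// For example, with 32-bit elements, (or (and X, splat(0x00ffffff)),
// (AArch64ISD::VSHL Y, 24)) is transformed into (AArch64ISD::VSLI X, Y, 24):
// the low 24 bits of each lane of X are kept and the top 8 bits are taken
// from Y shifted left by 24.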
13188
13189SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
13190 SelectionDAG &DAG) const {
13191 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13192 !Subtarget->isNeonAvailable()))
13193 return LowerToScalableOp(Op, DAG);
13194
13195 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
13196 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
13197 return Res;
13198
13199 EVT VT = Op.getValueType();
13200 if (VT.isScalableVector())
13201 return Op;
13202
13203 SDValue LHS = Op.getOperand(0);
13204 BuildVectorSDNode *BVN =
13205 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
13206 if (!BVN) {
13207 // OR commutes, so try swapping the operands.
13208 LHS = Op.getOperand(1);
13209 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
13210 }
13211 if (!BVN)
13212 return Op;
13213
13214 APInt DefBits(VT.getSizeInBits(), 0);
13215 APInt UndefBits(VT.getSizeInBits(), 0);
13216 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13217 SDValue NewOp;
13218
13219 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13220 DefBits, &LHS)) ||
13221 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13222 DefBits, &LHS)))
13223 return NewOp;
13224
13225 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13226 UndefBits, &LHS)) ||
13227 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13228 UndefBits, &LHS)))
13229 return NewOp;
13230 }
13231
13232 // We can always fall back to a non-immediate OR.
13233 return Op;
13234}
13235
13236// Normalize the operands of BUILD_VECTOR. The value of constant operands will
13237// be truncated to fit element width.
13238static SDValue NormalizeBuildVector(SDValue Op,
13239 SelectionDAG &DAG) {
13240 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13241 SDLoc dl(Op);
13242 EVT VT = Op.getValueType();
13243 EVT EltTy = VT.getVectorElementType();
13244
13245 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
13246 return Op;
13247
13249 for (SDValue Lane : Op->ops()) {
13250 // For integer vectors, type legalization would have promoted the
13251 // operands already. Otherwise, if Op is a floating-point splat
13252 // (with operands cast to integers), then the only possibilities
13253 // are constants and UNDEFs.
13254 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
13255 APInt LowBits(EltTy.getSizeInBits(),
13256 CstLane->getZExtValue());
13257 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
13258 } else if (Lane.getNode()->isUndef()) {
13259 Lane = DAG.getUNDEF(MVT::i32);
13260 } else {
13261 assert(Lane.getValueType() == MVT::i32 &&
13262 "Unexpected BUILD_VECTOR operand type");
13263 }
13264 Ops.push_back(Lane);
13265 }
13266 return DAG.getBuildVector(VT, dl, Ops);
13267}
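// For example, an i32 constant operand of 0x12345 in a v4i16 BUILD_VECTOR is
// replaced by the i32 constant 0x2345, i.e. the value truncated to the 16-bit
// element width.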
13268
13269static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
13270 const AArch64Subtarget *ST) {
13271 EVT VT = Op.getValueType();
13272 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
13273 "Expected a legal NEON vector");
13274
13275 APInt DefBits(VT.getSizeInBits(), 0);
13276 APInt UndefBits(VT.getSizeInBits(), 0);
13277 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13278 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13279 auto TryMOVIWithBits = [&](APInt DefBits) {
13280 SDValue NewOp;
13281 if ((NewOp =
13282 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
13283 (NewOp =
13284 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13285 (NewOp =
13286 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
13287 (NewOp =
13288 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13289 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
13290 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
13291 return NewOp;
13292
13293 APInt NotDefBits = ~DefBits;
13294 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
13295 NotDefBits)) ||
13296 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
13297 NotDefBits)) ||
13298 (NewOp =
13299 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
13300 return NewOp;
13301 return SDValue();
13302 };
13303 if (SDValue R = TryMOVIWithBits(DefBits))
13304 return R;
13305 if (SDValue R = TryMOVIWithBits(UndefBits))
13306 return R;
13307
13308 // See if a fneg of the constant can be materialized with a MOVI, etc
13309 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
13310 // FNegate each sub-element of the constant
13311 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
13312 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
13313 .zext(VT.getSizeInBits());
13314 APInt NegBits(VT.getSizeInBits(), 0);
13315 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
13316 for (unsigned i = 0; i < NumElts; i++)
13317 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
13318 NegBits = DefBits ^ NegBits;
13319
13320 // Try to create the new constants with MOVI, and if so generate a fneg
13321 // for it.
13322 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
13323 SDLoc DL(Op);
13324 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
13325 return DAG.getNode(
13326 AArch64ISD::NVCAST, DL, VT,
13327 DAG.getNode(ISD::FNEG, DL, VFVT,
13328 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
13329 }
13330 return SDValue();
13331 };
13332 SDValue R;
13333 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
13334 (R = TryWithFNeg(DefBits, MVT::f64)) ||
13335 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
13336 return R;
13337 }
13338
13339 return SDValue();
13340}
13341
13342SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
13343 SelectionDAG &DAG) const {
13344 EVT VT = Op.getValueType();
13345
13346 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
13347 if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
13348 SDLoc DL(Op);
13349 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
13350 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
13351 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
13352 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
13353 return convertFromScalableVector(DAG, Op.getValueType(), Seq);
13354 }
13355
13356 // Revert to common legalisation for all other variants.
13357 return SDValue();
13358 }
13359
13360 // Try to build a simple constant vector.
13361 Op = NormalizeBuildVector(Op, DAG);
13362 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
13363 // abort.
13364 if (Op.getOpcode() != ISD::BUILD_VECTOR)
13365 return SDValue();
13366
13367 // Certain vector constants, used to express things like logical NOT and
13368 // arithmetic NEG, are passed through unmodified. This allows special
13369 // patterns for these operations to match, which will lower these constants
13370 // to whatever is proven necessary.
13371 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13372 if (BVN->isConstant()) {
13373 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
13374 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
13375 APInt Val(BitSize,
13376 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
13377 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
13378 return Op;
13379 }
13380 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
13381 if (Const->isZero() && !Const->isNegative())
13382 return Op;
13383 }
13384
13385 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
13386 return V;
13387
13388 // Scan through the operands to find some interesting properties we can
13389 // exploit:
13390 // 1) If only one value is used, we can use a DUP, or
13391 // 2) if only the low element is not undef, we can just insert that, or
13392 // 3) if only one constant value is used (w/ some non-constant lanes),
13393 // we can splat the constant value into the whole vector then fill
13394 // in the non-constant lanes.
13395 // 4) FIXME: If different constant values are used, but we can intelligently
13396 // select the values we'll be overwriting for the non-constant
13397 // lanes such that we can directly materialize the vector
13398 // some other way (MOVI, e.g.), we can be sneaky.
13399 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
13400 SDLoc dl(Op);
13401 unsigned NumElts = VT.getVectorNumElements();
13402 bool isOnlyLowElement = true;
13403 bool usesOnlyOneValue = true;
13404 bool usesOnlyOneConstantValue = true;
13405 bool isConstant = true;
13406 bool AllLanesExtractElt = true;
13407 unsigned NumConstantLanes = 0;
13408 unsigned NumDifferentLanes = 0;
13409 unsigned NumUndefLanes = 0;
13410 SDValue Value;
13411 SDValue ConstantValue;
13412 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
13413 unsigned ConsecutiveValCount = 0;
13414 SDValue PrevVal;
13415 for (unsigned i = 0; i < NumElts; ++i) {
13416 SDValue V = Op.getOperand(i);
13417 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13418 AllLanesExtractElt = false;
13419 if (V.isUndef()) {
13420 ++NumUndefLanes;
13421 continue;
13422 }
13423 if (i > 0)
13424 isOnlyLowElement = false;
13425 if (!isIntOrFPConstant(V))
13426 isConstant = false;
13427
13428 if (isIntOrFPConstant(V)) {
13429 ++NumConstantLanes;
13430 if (!ConstantValue.getNode())
13431 ConstantValue = V;
13432 else if (ConstantValue != V)
13433 usesOnlyOneConstantValue = false;
13434 }
13435
13436 if (!Value.getNode())
13437 Value = V;
13438 else if (V != Value) {
13439 usesOnlyOneValue = false;
13440 ++NumDifferentLanes;
13441 }
13442
13443 if (PrevVal != V) {
13444 ConsecutiveValCount = 0;
13445 PrevVal = V;
13446 }
13447
13448 // Keep the different values and their last consecutive counts. For example,
13449 //
13450 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13451 // t24, t24, t24, t24, t24, t24, t24, t24
13452 // t23 = consecutive count 8
13453 // t24 = consecutive count 8
13454 // ------------------------------------------------------------------
13455 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
13456 // t24, t24, t24, t24, t24, t24, t24, t24
13457 // t23 = consecutive count 5
13458 // t24 = consecutive count 9
13459 DifferentValueMap[V] = ++ConsecutiveValCount;
13460 }
13461
13462 if (!Value.getNode()) {
13463 LLVM_DEBUG(
13464 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
13465 return DAG.getUNDEF(VT);
13466 }
13467
13468 // Convert BUILD_VECTOR where all elements but the lowest are undef into
13469 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
13470 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
13471 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
13472 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
13473 "SCALAR_TO_VECTOR node\n");
13474 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
13475 }
13476
13477 if (AllLanesExtractElt) {
13478 SDNode *Vector = nullptr;
13479 bool Even = false;
13480 bool Odd = false;
13481 // Check whether the extract elements match the Even pattern <0,2,4,...> or
13482 // the Odd pattern <1,3,5,...>.
13483 for (unsigned i = 0; i < NumElts; ++i) {
13484 SDValue V = Op.getOperand(i);
13485 const SDNode *N = V.getNode();
13486 if (!isa<ConstantSDNode>(N->getOperand(1))) {
13487 Even = false;
13488 Odd = false;
13489 break;
13490 }
13491 SDValue N0 = N->getOperand(0);
13492
13493 // All elements are extracted from the same vector.
13494 if (!Vector) {
13495 Vector = N0.getNode();
13496 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
13497 // BUILD_VECTOR.
13498 if (VT.getVectorElementType() !=
13499 N0.getValueType().getVectorElementType())
13500 break;
13501 } else if (Vector != N0.getNode()) {
13502 Odd = false;
13503 Even = false;
13504 break;
13505 }
13506
13507 // Extracted values are either at Even indices <0,2,4,...> or at Odd
13508 // indices <1,3,5,...>.
13509 uint64_t Val = N->getConstantOperandVal(1);
13510 if (Val == 2 * i) {
13511 Even = true;
13512 continue;
13513 }
13514 if (Val - 1 == 2 * i) {
13515 Odd = true;
13516 continue;
13517 }
13518
13519 // Something does not match: abort.
13520 Odd = false;
13521 Even = false;
13522 break;
13523 }
13524 if (Even || Odd) {
13525 SDValue LHS =
13526 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13527 DAG.getConstant(0, dl, MVT::i64));
13528 SDValue RHS =
13529 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13530 DAG.getConstant(NumElts, dl, MVT::i64));
13531
13532 if (Even && !Odd)
13533 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
13534 RHS);
13535 if (Odd && !Even)
13536 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
13537 RHS);
13538 }
13539 }
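// For example, a v4i16 BUILD_VECTOR of extract_vector_elt(X, 0/2/4/6) from a
// single v8i16 source X matches the Even pattern and is lowered to
// UZP1(lo(X), hi(X)); indices 1/3/5/7 match the Odd pattern and use UZP2.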
13540
13541 // Use DUP for non-constant splats. For f32 constant splats, reduce to
13542 // i32 and try again.
13543 if (usesOnlyOneValue) {
13544 if (!isConstant) {
13545 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13546 Value.getValueType() != VT) {
13547 LLVM_DEBUG(
13548 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
13549 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
13550 }
13551
13552 // This is actually a DUPLANExx operation, which keeps everything vectory.
13553
13554 SDValue Lane = Value.getOperand(1);
13555 Value = Value.getOperand(0);
13556 if (Value.getValueSizeInBits() == 64) {
13557 LLVM_DEBUG(
13558 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
13559 "widening it\n");
13560 Value = WidenVector(Value, DAG);
13561 }
13562
13563 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
13564 return DAG.getNode(Opcode, dl, VT, Value, Lane);
13565 }
13566
13567 if (VT.getVectorElementType().isFloatingPoint()) {
13568 SmallVector<SDValue, 8> Ops;
13569 EVT EltTy = VT.getVectorElementType();
13570 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
13571 EltTy == MVT::f64) && "Unsupported floating-point vector type");
13572 LLVM_DEBUG(
13573 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
13574 "BITCASTS, and try again\n");
13575 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
13576 for (unsigned i = 0; i < NumElts; ++i)
13577 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
13578 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
13579 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
13580 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
13581 Val.dump(););
13582 Val = LowerBUILD_VECTOR(Val, DAG);
13583 if (Val.getNode())
13584 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
13585 }
13586 }
13587
13588 // If we need to insert a small number of different non-constant elements and
13589 // the vector width is sufficiently large, prefer using DUP with the common
13590 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
13591 // skip the constant lane handling below.
13592 bool PreferDUPAndInsert =
13593 !isConstant && NumDifferentLanes >= 1 &&
13594 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
13595 NumDifferentLanes >= NumConstantLanes;
13596
13597 // If only one constant value was used, and it was used for more than one lane,
13598 // start by splatting that value, then replace the non-constant lanes. This
13599 // is better than the default, which will perform a separate initialization
13600 // for each lane.
13601 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
13602 // Firstly, try to materialize the splat constant.
13603 SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue);
13604 unsigned BitSize = VT.getScalarSizeInBits();
13605 APInt ConstantValueAPInt(1, 0);
13606 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
13607 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
13608 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
13609 !ConstantValueAPInt.isAllOnes()) {
13610 Val = ConstantBuildVector(Val, DAG, Subtarget);
13611 if (!Val)
13612 // Otherwise, materialize the constant and splat it.
13613 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
13614 }
13615
13616 // Now insert the non-constant lanes.
13617 for (unsigned i = 0; i < NumElts; ++i) {
13618 SDValue V = Op.getOperand(i);
13619 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13620 if (!isIntOrFPConstant(V))
13621 // Note that type legalization likely mucked about with the VT of the
13622 // source operand, so we may have to convert it here before inserting.
13623 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
13624 }
13625 return Val;
13626 }
13627
13628 // This will generate a load from the constant pool.
13629 if (isConstant) {
13630 LLVM_DEBUG(
13631 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
13632 "expansion\n");
13633 return SDValue();
13634 }
13635
13636 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13637 // v4i32s. This is really a truncate, which we can construct out of (legal)
13638 // concats and truncate nodes.
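 // In other words, the v16i8 result is trunc(concat(a, b, c, d)) where a..d
 // are the original v4i32 vectors.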
13639 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
13640 return M;
13641
13642 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
13643 if (NumElts >= 4) {
13644 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
13645 return Shuffle;
13646
13647 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
13648 return Shuffle;
13649 }
13650
13651 if (PreferDUPAndInsert) {
13652 // First, build a constant vector with the common element.
13653 SmallVector<SDValue, 8> Ops(NumElts, Value);
13654 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
13655 // Next, insert the elements that do not match the common value.
13656 for (unsigned I = 0; I < NumElts; ++I)
13657 if (Op.getOperand(I) != Value)
13658 NewVector =
13659 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
13660 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
13661
13662 return NewVector;
13663 }
13664
13665 // If vector consists of two different values, try to generate two DUPs and
13666 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
13667 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
13668 SmallVector<SDValue, 2> Vals;
13669 // Check that the consecutive count of each value is half the number of vector
13670 // elements. In this case, we can use CONCAT_VECTORS. For example,
13671 //
13672 // canUseVECTOR_CONCAT = true;
13673 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13674 // t24, t24, t24, t24, t24, t24, t24, t24
13675 //
13676 // canUseVECTOR_CONCAT = false;
13677 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
13678 // t24, t24, t24, t24, t24, t24, t24, t24
13679 bool canUseVECTOR_CONCAT = true;
13680 for (auto Pair : DifferentValueMap) {
13681 // Check that each different value occupies exactly NumElts / 2 lanes.
13682 if (Pair.second != NumElts / 2)
13683 canUseVECTOR_CONCAT = false;
13684 Vals.push_back(Pair.first);
13685 }
13686
13687 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
13688 // CONCAT_VECTORs. For example,
13689 //
13690 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
13691 // t24, t24, t24, t24, t24, t24, t24, t24
13692 // ==>
13693 // t26: v8i8 = AArch64ISD::DUP t23
13694 // t28: v8i8 = AArch64ISD::DUP t24
13695 // t29: v16i8 = concat_vectors t26, t28
13696 if (canUseVECTOR_CONCAT) {
13697 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13698 if (isTypeLegal(SubVT) && SubVT.isVector() &&
13699 SubVT.getVectorNumElements() >= 2) {
13700 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
13701 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
13702 SDValue DUP1 =
13703 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
13704 SDValue DUP2 =
13705 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
13706 SDValue CONCAT_VECTORS =
13707 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
13708 return CONCAT_VECTORS;
13709 }
13710 }
13711
13712 // Let's try to generate VECTOR_SHUFFLE. For example,
13713 //
13714 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
13715 // ==>
13716 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
13717 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
13718 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
13719 if (NumElts >= 8) {
13720 SmallVector<int, 16> MaskVec;
13721 // Build the mask for the VECTOR_SHUFFLE.
13722 SDValue FirstLaneVal = Op.getOperand(0);
13723 for (unsigned i = 0; i < NumElts; ++i) {
13724 SDValue Val = Op.getOperand(i);
13725 if (FirstLaneVal == Val)
13726 MaskVec.push_back(i);
13727 else
13728 MaskVec.push_back(i + NumElts);
13729 }
13730
13731 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
13732 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
13733 SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1);
13734 SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2);
13735 SDValue VECTOR_SHUFFLE =
13736 DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec);
13737 return VECTOR_SHUFFLE;
13738 }
13739 }
13740
13741 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
13742 // know the default expansion would otherwise fall back on something even
13743 // worse. For a vector with one or two non-undef values, that's
13744 // scalar_to_vector for the elements followed by a shuffle (provided the
13745 // shuffle is valid for the target) and materialization element by element
13746 // on the stack followed by a load for everything else.
13747 if (!isConstant && !usesOnlyOneValue) {
13748 LLVM_DEBUG(
13749 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
13750 "of INSERT_VECTOR_ELT\n");
13751
13752 SDValue Vec = DAG.getUNDEF(VT);
13753 SDValue Op0 = Op.getOperand(0);
13754 unsigned i = 0;
13755
13756 // Use SCALAR_TO_VECTOR for lane zero to
13757 // a) Avoid a RMW dependency on the full vector register, and
13758 // b) Allow the register coalescer to fold away the copy if the
13759 // value is already in an S or D register, and we're forced to emit an
13760 // INSERT_SUBREG that we can't fold anywhere.
13761 //
13762 // We also allow types like i8 and i16 which are illegal scalar but legal
13763 // vector element types. After type-legalization the inserted value is
13764 // extended (i32) and it is safe to cast them to the vector type by ignoring
13765 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
13766 if (!Op0.isUndef()) {
13767 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
13768 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
13769 ++i;
13770 }
13771 LLVM_DEBUG(if (i < NumElts) dbgs()
13772 << "Creating nodes for the other vector elements:\n";);
13773 for (; i < NumElts; ++i) {
13774 SDValue V = Op.getOperand(i);
13775 if (V.isUndef())
13776 continue;
13777 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13778 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
13779 }
13780 return Vec;
13781 }
13782
13783 LLVM_DEBUG(
13784 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
13785 "better alternative\n");
13786 return SDValue();
13787}
13788
13789SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
13790 SelectionDAG &DAG) const {
13791 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13792 !Subtarget->isNeonAvailable()))
13793 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
13794
13795 assert(Op.getValueType().isScalableVector() &&
13796 isTypeLegal(Op.getValueType()) &&
13797 "Expected legal scalable vector type!");
13798
13799 if (isTypeLegal(Op.getOperand(0).getValueType())) {
13800 unsigned NumOperands = Op->getNumOperands();
13801 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
13802 "Unexpected number of operands in CONCAT_VECTORS");
13803
13804 if (NumOperands == 2)
13805 return Op;
13806
13807 // Concat each pair of subvectors and pack into the lower half of the array.
13808 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
13809 while (ConcatOps.size() > 1) {
13810 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
13811 SDValue V1 = ConcatOps[I];
13812 SDValue V2 = ConcatOps[I + 1];
13813 EVT SubVT = V1.getValueType();
13814 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
13815 ConcatOps[I / 2] =
13816 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
13817 }
13818 ConcatOps.resize(ConcatOps.size() / 2);
13819 }
13820 return ConcatOps[0];
13821 }
13822
13823 return SDValue();
13824}
13825
13826SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13827 SelectionDAG &DAG) const {
13828 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
13829
13830 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13831 !Subtarget->isNeonAvailable()))
13832 return LowerFixedLengthInsertVectorElt(Op, DAG);
13833
13834 EVT VT = Op.getOperand(0).getValueType();
13835
13836 if (VT.getScalarType() == MVT::i1) {
13837 EVT VectorVT = getPromotedVTForPredicate(VT);
13838 SDLoc DL(Op);
13839 SDValue ExtendedVector =
13840 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
13841 SDValue ExtendedValue =
13842 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
13843 VectorVT.getScalarType().getSizeInBits() < 32
13844 ? MVT::i32
13845 : VectorVT.getScalarType());
13846 ExtendedVector =
13847 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
13848 ExtendedValue, Op.getOperand(2));
13849 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
13850 }
13851
13852 // Check for non-constant or out of range lane.
13853 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
13854 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13855 return SDValue();
13856
13857 return Op;
13858}
13859
13860SDValue
13861AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13862 SelectionDAG &DAG) const {
13863 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
13864 EVT VT = Op.getOperand(0).getValueType();
13865
13866 if (VT.getScalarType() == MVT::i1) {
13867 // We can't directly extract from an SVE predicate; extend it first.
13868 // (This isn't the only possible lowering, but it's straightforward.)
13869 EVT VectorVT = getPromotedVTForPredicate(VT);
13870 SDLoc DL(Op);
13871 SDValue Extend =
13872 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
13873 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
13874 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
13875 Extend, Op.getOperand(1));
13876 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
13877 }
13878
13879 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13880 return LowerFixedLengthExtractVectorElt(Op, DAG);
13881
13882 // Check for non-constant or out of range lane.
13883 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
13884 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13885 return SDValue();
13886
13887 // Insertion/extraction are legal for V128 types.
13888 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
13889 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
13890 VT == MVT::v8f16 || VT == MVT::v8bf16)
13891 return Op;
13892
13893 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
13894 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
13895 VT != MVT::v4bf16)
13896 return SDValue();
13897
13898 // For V64 types, we perform extraction by expanding the value
13899 // to a V128 type and perform the extraction on that.
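 // For example, extracting lane 3 of a v4i16 becomes an extract of lane 3
 // from the widened v8i16, whose upper half is undef.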
13900 SDLoc DL(Op);
13901 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
13902 EVT WideTy = WideVec.getValueType();
13903
13904 EVT ExtrTy = WideTy.getVectorElementType();
13905 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
13906 ExtrTy = MVT::i32;
13907
13908 // For extractions, we just return the result directly.
13909 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
13910 Op.getOperand(1));
13911}
13912
13913SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
13914 SelectionDAG &DAG) const {
13915 EVT VT = Op.getValueType();
13916 assert(VT.isFixedLengthVector() &&
13917 "Only cases that extract a fixed length vector are supported!");
13918 EVT InVT = Op.getOperand(0).getValueType();
13919
13920 // If we don't have legal types yet, do nothing
13921 if (!isTypeLegal(InVT))
13922 return SDValue();
13923
13924 if (InVT.is128BitVector()) {
13925 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
13926 unsigned Idx = Op.getConstantOperandVal(1);
13927
13928 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
13929 if (Idx == 0)
13930 return Op;
13931
13932 // If this is extracting the upper 64-bits of a 128-bit vector, we match
13933 // that directly.
13934 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
13935 return Op;
13936 }
13937
13938 if (InVT.isScalableVector() ||
13939 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
13940 SDLoc DL(Op);
13941 SDValue Vec = Op.getOperand(0);
13942 SDValue Idx = Op.getOperand(1);
13943
13944 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
13945 if (PackedVT != InVT) {
13946 // Pack input into the bottom part of an SVE register and try again.
13947 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
13948 DAG.getUNDEF(PackedVT), Vec,
13949 DAG.getVectorIdxConstant(0, DL));
13950 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
13951 }
13952
13953 // This will get matched by custom code during ISelDAGToDAG.
13954 if (isNullConstant(Idx))
13955 return Op;
13956
13957 assert(InVT.isScalableVector() && "Unexpected vector type!");
13958 // Move requested subvector to the start of the vector and try again.
13959 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
13960 return convertFromScalableVector(DAG, VT, Splice);
13961 }
13962
13963 return SDValue();
13964}
13965
13966SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
13967 SelectionDAG &DAG) const {
13968 assert(Op.getValueType().isScalableVector() &&
13969 "Only expect to lower inserts into scalable vectors!");
13970
13971 EVT InVT = Op.getOperand(1).getValueType();
13972 unsigned Idx = Op.getConstantOperandVal(2);
13973
13974 SDValue Vec0 = Op.getOperand(0);
13975 SDValue Vec1 = Op.getOperand(1);
13976 SDLoc DL(Op);
13977 EVT VT = Op.getValueType();
13978
13979 if (InVT.isScalableVector()) {
13980 if (!isTypeLegal(VT))
13981 return SDValue();
13982
13983 // Break down insert_subvector into simpler parts.
13984 if (VT.getVectorElementType() == MVT::i1) {
13985 unsigned NumElts = VT.getVectorMinNumElements();
13986 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13987
13988 SDValue Lo, Hi;
13989 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13990 DAG.getVectorIdxConstant(0, DL));
13991 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13992 DAG.getVectorIdxConstant(NumElts / 2, DL));
13993 if (Idx < (NumElts / 2))
13994 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
13995 DAG.getVectorIdxConstant(Idx, DL));
13996 else
13997 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
13998 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
13999
14000 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
14001 }
14002
14003 // Ensure the subvector is half the size of the main vector.
14004 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
14005 return SDValue();
14006
14007 // Here narrow and wide refer to the vector element types. After "casting"
14008 // both vectors must have the same bit length and so because the subvector
14009 // has fewer elements, those elements need to be bigger.
14010 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
14011 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
14012
14013 // NOP cast operands to the largest legal vector of the same element count.
14014 if (VT.isFloatingPoint()) {
14015 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
14016 Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
14017 } else {
14018 // Legal integer vectors are already their largest so Vec0 is fine as is.
14019 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
14020 }
14021
14022 // To replace the top/bottom half of vector V with vector SubV we widen the
14023 // preserved half of V, concatenate this to SubV (the order depending on the
14024 // half being replaced) and then narrow the result.
14025 SDValue Narrow;
14026 if (Idx == 0) {
14027 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
14028 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
14029 } else {
14030 assert(Idx == InVT.getVectorMinNumElements() &&
14031 "Invalid subvector index!");
14032 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
14033 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
14034 }
14035
14036 return getSVESafeBitCast(VT, Narrow, DAG);
14037 }
14038
14039 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
14040 // This will be matched by custom code during ISelDAGToDAG.
14041 if (Vec0.isUndef())
14042 return Op;
14043
14044 std::optional<unsigned> PredPattern =
14045 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
14046 auto PredTy = VT.changeVectorElementType(MVT::i1);
14047 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
14048 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
14049 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
14050 }
14051
14052 return SDValue();
14053}
14054
14055static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
14056 if (Op.getOpcode() != AArch64ISD::DUP &&
14057 Op.getOpcode() != ISD::SPLAT_VECTOR &&
14058 Op.getOpcode() != ISD::BUILD_VECTOR)
14059 return false;
14060
14061 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
14062 !isAllConstantBuildVector(Op, SplatVal))
14063 return false;
14064
14065 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
14066 !isa<ConstantSDNode>(Op->getOperand(0)))
14067 return false;
14068
14069 SplatVal = Op->getConstantOperandVal(0);
14070 if (Op.getValueType().getVectorElementType() != MVT::i64)
14071 SplatVal = (int32_t)SplatVal;
14072
14073 Negated = false;
14074 if (isPowerOf2_64(SplatVal))
14075 return true;
14076
14077 Negated = true;
14078 if (isPowerOf2_64(-SplatVal)) {
14079 SplatVal = -SplatVal;
14080 return true;
14081 }
14082
14083 return false;
14084}
14085
14086SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
14087 EVT VT = Op.getValueType();
14088 SDLoc dl(Op);
14089
14090 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
14091 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
14092
14093 assert(VT.isScalableVector() && "Expected a scalable vector.");
14094
14095 bool Signed = Op.getOpcode() == ISD::SDIV;
14096 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
14097
14098 bool Negated;
14099 uint64_t SplatVal;
14100 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
14101 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
14102 SDValue Res =
14103 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
14104 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
14105 if (Negated)
14106 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
14107
14108 return Res;
14109 }
14110
14111 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
14112 return LowerToPredicatedOp(Op, DAG, PredOpcode);
14113
14114 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
14115 // operations, and truncate the result.
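 // For example, an nxv16i8 division is unpacked into two nxv8i16 divisions,
 // each of which is unpacked again into nxv4i32 divisions; the results are
 // then re-packed with UZP1.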
14116 EVT WidenedVT;
14117 if (VT == MVT::nxv16i8)
14118 WidenedVT = MVT::nxv8i16;
14119 else if (VT == MVT::nxv8i16)
14120 WidenedVT = MVT::nxv4i32;
14121 else
14122 llvm_unreachable("Unexpected Custom DIV operation");
14123
14124 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
14125 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
14126 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
14127 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
14128 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
14129 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
14130 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
14131 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
14132 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
14133}
14134
14135bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
14136 // Currently no fixed length shuffles that require SVE are legal.
14137 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14138 return false;
14139
14140 if (VT.getVectorNumElements() == 4 &&
14141 (VT.is128BitVector() || VT.is64BitVector())) {
14142 unsigned Cost = getPerfectShuffleCost(M);
14143 if (Cost <= 1)
14144 return true;
14145 }
14146
14147 bool DummyBool;
14148 int DummyInt;
14149 unsigned DummyUnsigned;
14150
14151 unsigned EltSize = VT.getScalarSizeInBits();
14152 unsigned NumElts = VT.getVectorNumElements();
14153 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
14154 isREVMask(M, EltSize, NumElts, 64) ||
14155 isREVMask(M, EltSize, NumElts, 32) ||
14156 isREVMask(M, EltSize, NumElts, 16) ||
14157 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
14158 isTRNMask(M, NumElts, DummyUnsigned) ||
14159 isUZPMask(M, NumElts, DummyUnsigned) ||
14160 isZIPMask(M, NumElts, DummyUnsigned) ||
14161 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
14162 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
14163 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
14164 isINSMask(M, NumElts, DummyBool, DummyInt) ||
14165 isConcatMask(M, VT, VT.getSizeInBits() == 128));
14166}
14167
14168bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
14169 EVT VT) const {
14170 // Just delegate to the generic legality, clear masks aren't special.
14171 return isShuffleMaskLegal(M, VT);
14172}
14173
14174/// getVShiftImm - Check if this is a valid build_vector for the immediate
14175/// operand of a vector shift operation, where all the elements of the
14176/// build_vector must have the same constant integer value.
14177static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
14178 // Ignore bit_converts.
14179 while (Op.getOpcode() == ISD::BITCAST)
14180 Op = Op.getOperand(0);
14181 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
14182 APInt SplatBits, SplatUndef;
14183 unsigned SplatBitSize;
14184 bool HasAnyUndefs;
14185 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
14186 HasAnyUndefs, ElementBits) ||
14187 SplatBitSize > ElementBits)
14188 return false;
14189 Cnt = SplatBits.getSExtValue();
14190 return true;
14191}
14192
14193/// isVShiftLImm - Check if this is a valid build_vector for the immediate
14194/// operand of a vector shift left operation. That value must be in the range:
14195/// 0 <= Value < ElementBits for a left shift; or
14196/// 0 <= Value <= ElementBits for a long left shift.
14197static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
14198 assert(VT.isVector() && "vector shift count is not a vector type");
14199 int64_t ElementBits = VT.getScalarSizeInBits();
14200 if (!getVShiftImm(Op, ElementBits, Cnt))
14201 return false;
14202 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
14203}
14204
14205/// isVShiftRImm - Check if this is a valid build_vector for the immediate
14206/// operand of a vector shift right operation. The value must be in the range:
14207/// 1 <= Value <= ElementBits for a right shift; or
14208/// 1 <= Value <= ElementBits/2 for a narrowing right shift.
14208static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
14209 assert(VT.isVector() && "vector shift count is not a vector type");
14210 int64_t ElementBits = VT.getScalarSizeInBits();
14211 if (!getVShiftImm(Op, ElementBits, Cnt))
14212 return false;
14213 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
14214}
14215
14216SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
14217 SelectionDAG &DAG) const {
14218 EVT VT = Op.getValueType();
14219
14220 if (VT.getScalarType() == MVT::i1) {
14221 // Lower i1 truncate to `(x & 1) != 0`.
14222 SDLoc dl(Op);
14223 EVT OpVT = Op.getOperand(0).getValueType();
14224 SDValue Zero = DAG.getConstant(0, dl, OpVT);
14225 SDValue One = DAG.getConstant(1, dl, OpVT);
14226 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
14227 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
14228 }
14229
14230 if (!VT.isVector() || VT.isScalableVector())
14231 return SDValue();
14232
14233 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14234 !Subtarget->isNeonAvailable()))
14235 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
14236
14237 return SDValue();
14238}
14239
14240// Check if we can lower this SRL to a rounding shift instruction. ResVT is
14241// possibly a truncated type; it tells how many bits of the value are to be
14242// used.
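// For example, with ResVT == VT, (srl (add nuw X, 8), 4) computes
// (X + 8) >> 4, i.e. an unsigned rounding right shift of X by 4, which a
// single URSHR can perform.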
14243static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
14244 SelectionDAG &DAG,
14245 unsigned &ShiftValue,
14246 SDValue &RShOperand) {
14247 if (Shift->getOpcode() != ISD::SRL)
14248 return false;
14249
14250 EVT VT = Shift.getValueType();
14251 assert(VT.isScalableVT());
14252
14253 auto ShiftOp1 =
14254 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
14255 if (!ShiftOp1)
14256 return false;
14257
14258 ShiftValue = ShiftOp1->getZExtValue();
14259 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
14260 return false;
14261
14262 SDValue Add = Shift->getOperand(0);
14263 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
14264 return false;
14265
14266 assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
14267 "ResVT must be truncated or same type as the shift.");
14268 // Check if an overflow can lead to incorrect results.
14269 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
14270 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
14271 return false;
14272
14273 auto AddOp1 =
14274 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
14275 if (!AddOp1)
14276 return false;
14277 uint64_t AddValue = AddOp1->getZExtValue();
14278 if (AddValue != 1ULL << (ShiftValue - 1))
14279 return false;
14280
14281 RShOperand = Add->getOperand(0);
14282 return true;
14283}
14284
14285SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
14286 SelectionDAG &DAG) const {
14287 EVT VT = Op.getValueType();
14288 SDLoc DL(Op);
14289 int64_t Cnt;
14290
14291 if (!Op.getOperand(1).getValueType().isVector())
14292 return Op;
14293 unsigned EltSize = VT.getScalarSizeInBits();
14294
14295 switch (Op.getOpcode()) {
14296 case ISD::SHL:
14297 if (VT.isScalableVector() ||
14298 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14299 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
14300
14301 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
14302 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
14303 DAG.getConstant(Cnt, DL, MVT::i32));
14304 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14305 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
14306 MVT::i32),
14307 Op.getOperand(0), Op.getOperand(1));
14308 case ISD::SRA:
14309 case ISD::SRL:
14310 if (VT.isScalableVector() && Subtarget->hasSVE2orSME()) {
14311 SDValue RShOperand;
14312 unsigned ShiftValue;
14313 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
14314 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
14315 getPredicateForVector(DAG, DL, VT), RShOperand,
14316 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
14317 }
14318
14319 if (VT.isScalableVector() ||
14320 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
14321 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
14322 : AArch64ISD::SRL_PRED;
14323 return LowerToPredicatedOp(Op, DAG, Opc);
14324 }
14325
14326 // Right shift immediate
14327 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
14328 unsigned Opc =
14329 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
14330 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
14331 DAG.getConstant(Cnt, DL, MVT::i32));
14332 }
14333
14334 // Right shift register. Note, there is not a shift right register
14335 // instruction, but the shift left register instruction takes a signed
14336 // value, where negative numbers specify a right shift.
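 // For example, (srl x, y) is emitted as ushl(x, sub(0, y)) and (sra x, y)
 // as sshl(x, sub(0, y)).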
14337 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
14338 : Intrinsic::aarch64_neon_ushl;
14339 // negate the shift amount
14340 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
14341 Op.getOperand(1));
14342 SDValue NegShiftLeft =
14343 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14344 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
14345 NegShift);
14346 return NegShiftLeft;
14347 }
14348
14349 llvm_unreachable("unexpected shift opcode");
14350}
14351
14352static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
14353 AArch64CC::CondCode CC, bool NoNans, EVT VT,
14354 const SDLoc &dl, SelectionDAG &DAG) {
14355 EVT SrcVT = LHS.getValueType();
14356 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
14357 "function only supposed to emit natural comparisons");
14358
14359 APInt SplatValue;
14360 APInt SplatUndef;
14361 unsigned SplatBitSize = 0;
14362 bool HasAnyUndefs;
14363
14364 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
14365 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
14366 SplatBitSize, HasAnyUndefs);
14367
14368 bool IsZero = IsCnst && SplatValue == 0;
14369 bool IsOne =
14370 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
14371 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
14372
14373 if (SrcVT.getVectorElementType().isFloatingPoint()) {
14374 switch (CC) {
14375 default:
14376 return SDValue();
14377 case AArch64CC::NE: {
14378 SDValue Fcmeq;
14379 if (IsZero)
14380 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14381 else
14382 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14383 return DAG.getNOT(dl, Fcmeq, VT);
14384 }
14385 case AArch64CC::EQ:
14386 if (IsZero)
14387 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14388 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14389 case AArch64CC::GE:
14390 if (IsZero)
14391 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
14392 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
14393 case AArch64CC::GT:
14394 if (IsZero)
14395 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
14396 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
14397 case AArch64CC::LE:
14398 if (!NoNans)
14399 return SDValue();
14400 // If we ignore NaNs then we can use the LS implementation.
14401 [[fallthrough]];
14402 case AArch64CC::LS:
14403 if (IsZero)
14404 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
14405 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
14406 case AArch64CC::LT:
14407 if (!NoNans)
14408 return SDValue();
14409 // If we ignore NaNs then we can use the MI implementation.
14410 [[fallthrough]];
14411 case AArch64CC::MI:
14412 if (IsZero)
14413 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
14414 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
14415 }
14416 }
14417
14418 switch (CC) {
14419 default:
14420 return SDValue();
14421 case AArch64CC::NE: {
14422 SDValue Cmeq;
14423 if (IsZero)
14424 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14425 else
14426 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14427 return DAG.getNOT(dl, Cmeq, VT);
14428 }
14429 case AArch64CC::EQ:
14430 if (IsZero)
14431 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14432 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14433 case AArch64CC::GE:
14434 if (IsZero)
14435 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
14436 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
14437 case AArch64CC::GT:
14438 if (IsZero)
14439 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
14440 if (IsMinusOne)
14441 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS, RHS);
14442 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
14443 case AArch64CC::LE:
14444 if (IsZero)
14445 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14446 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
14447 case AArch64CC::LS:
14448 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
14449 case AArch64CC::LO:
14450 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
14451 case AArch64CC::LT:
14452 if (IsZero)
14453 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
14454 if (IsOne)
14455 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14456 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
14457 case AArch64CC::HI:
14458 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
14459 case AArch64CC::HS:
14460 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
14461 }
14462}
14463
14464SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
14465 SelectionDAG &DAG) const {
14466 if (Op.getValueType().isScalableVector())
14467 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
14468
14469 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14470 !Subtarget->isNeonAvailable()))
14471 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
14472
14473 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
14474 SDValue LHS = Op.getOperand(0);
14475 SDValue RHS = Op.getOperand(1);
14476 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
14477 SDLoc dl(Op);
14478
14479 if (LHS.getValueType().getVectorElementType().isInteger()) {
14480 assert(LHS.getValueType() == RHS.getValueType());
14481 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
14482 SDValue Cmp =
14483 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
14484 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14485 }
14486
14487 // Lower isnan(x) | isnan(never-nan) to x != x.
14488 // Lower !isnan(x) & !isnan(never-nan) to x == x.
14489 if (CC == ISD::SETUO || CC == ISD::SETO) {
14490 bool OneNaN = false;
14491 if (LHS == RHS) {
14492 OneNaN = true;
14493 } else if (DAG.isKnownNeverNaN(RHS)) {
14494 OneNaN = true;
14495 RHS = LHS;
14496 } else if (DAG.isKnownNeverNaN(LHS)) {
14497 OneNaN = true;
14498 LHS = RHS;
14499 }
14500 if (OneNaN) {
14501 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
14502 }
14503 }
14504
14505 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
14506
14507 // Make v4f16 (only) fcmp operations utilise vector instructions
14508 // v8f16 support will be a little more complicated.
14509 if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
14510 LHS.getValueType().getVectorElementType() == MVT::bf16) {
14511 if (LHS.getValueType().getVectorNumElements() == 4) {
14512 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
14513 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
14514 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
14515 DAG.ReplaceAllUsesWith(Op, NewSetcc);
14516 CmpVT = MVT::v4i32;
14517 } else
14518 return SDValue();
14519 }
14520
14521 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
14522 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
14523 LHS.getValueType().getVectorElementType() != MVT::f128);
14524
14525 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
14526 // clean. Some of them require two branches to implement.
14527 AArch64CC::CondCode CC1, CC2;
14528 bool ShouldInvert;
14529 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
14530
14531 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
14532 SDValue Cmp =
14533 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
14534 if (!Cmp.getNode())
14535 return SDValue();
14536
14537 if (CC2 != AArch64CC::AL) {
14538 SDValue Cmp2 =
14539 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
14540 if (!Cmp2.getNode())
14541 return SDValue();
14542
14543 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
14544 }
14545
14546 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14547
14548 if (ShouldInvert)
14549 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
14550
14551 return Cmp;
14552}
14553
14554static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
14555 SelectionDAG &DAG) {
14556 SDValue VecOp = ScalarOp.getOperand(0);
14557 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
14558 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
14559 DAG.getConstant(0, DL, MVT::i64));
14560}
14561
14562static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
14563 SDLoc DL, SelectionDAG &DAG) {
14564 unsigned ScalarOpcode;
14565 switch (Opcode) {
14566 case ISD::VECREDUCE_AND:
14567 ScalarOpcode = ISD::AND;
14568 break;
14569 case ISD::VECREDUCE_OR:
14570 ScalarOpcode = ISD::OR;
14571 break;
14572 case ISD::VECREDUCE_XOR:
14573 ScalarOpcode = ISD::XOR;
14574 break;
14575 default:
14576 llvm_unreachable("Expected bitwise vector reduction");
14577 return SDValue();
14578 }
14579
14580 EVT VecVT = Vec.getValueType();
14581 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
14582 "Expected power-of-2 length vector");
14583
14584 EVT ElemVT = VecVT.getVectorElementType();
14585
14586 SDValue Result;
14587 unsigned NumElems = VecVT.getVectorNumElements();
14588
14589 // Special case for boolean reductions
14590 if (ElemVT == MVT::i1) {
14591 // Split large vectors into smaller ones
14592 if (NumElems > 16) {
14593 SDValue Lo, Hi;
14594 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14595 EVT HalfVT = Lo.getValueType();
14596 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
14597 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
14598 }
14599
14600 // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
14601 // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
14602 // this element size leads to the best codegen, since e.g. setcc results
14603 // might need to be truncated otherwise.
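 // For example, a v8i1 AND reduction is sign extended to v8i8 (each lane
 // becomes 0x00 or 0xff), reduced with an unsigned-min reduction, and the
 // low bit of that result is the AND of all the original i1 lanes.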
14604 EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
14605
14606 // any_ext doesn't work with umin/umax, so only use it for uadd.
14607 unsigned ExtendOp =
14608 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
14609 SDValue Extended = DAG.getNode(
14610 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
14611 switch (ScalarOpcode) {
14612 case ISD::AND:
14613 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
14614 break;
14615 case ISD::OR:
14616 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
14617 break;
14618 case ISD::XOR:
14619 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
14620 break;
14621 default:
14622 llvm_unreachable("Unexpected Opcode");
14623 }
14624
14625 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
14626 } else {
14627 // Iteratively split the vector in half and combine using the bitwise
14628 // operation until it fits in a 64 bit register.
14629 while (VecVT.getSizeInBits() > 64) {
14630 SDValue Lo, Hi;
14631 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14632 VecVT = Lo.getValueType();
14633 NumElems = VecVT.getVectorNumElements();
14634 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
14635 }
14636
14637 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
14638
14639 // Do the remaining work on a scalar since it allows the code generator to
14640 // combine the shift and bitwise operation into one instruction and since
14641 // integer instructions can have higher throughput than vector instructions.
14642 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
14643
14644 // Iteratively combine the lower and upper halves of the scalar using the
14645 // bitwise operation, halving the relevant region of the scalar in each
14646 // iteration, until the relevant region is just one element of the original
14647 // vector.
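 // For example, a v8i8 AND reduction bitcast to an i64 scalar s becomes
 // s &= s >> 32; s &= s >> 16; s &= s >> 8; and the low byte of s is the
 // result.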
14648 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
14649 SDValue ShiftAmount =
14650 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
14651 SDValue Shifted =
14652 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
14653 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
14654 }
14655
14656 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
14657 }
14658
14659 return DAG.getAnyExtOrTrunc(Result, DL, VT);
14660}
14661
14662SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
14663 SelectionDAG &DAG) const {
14664 SDValue Src = Op.getOperand(0);
14665
14666 // Try to lower fixed length reductions to SVE.
14667 EVT SrcVT = Src.getValueType();
14668 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14669 Op.getOpcode() == ISD::VECREDUCE_AND ||
14670 Op.getOpcode() == ISD::VECREDUCE_OR ||
14671 Op.getOpcode() == ISD::VECREDUCE_XOR ||
14672 Op.getOpcode() == ISD::VECREDUCE_FADD ||
14673 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
14674 SrcVT.getVectorElementType() == MVT::i64);
14675 if (SrcVT.isScalableVector() ||
14676 useSVEForFixedLengthVectorVT(
14677 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
14678
14679 if (SrcVT.getVectorElementType() == MVT::i1)
14680 return LowerPredReductionToSVE(Op, DAG);
14681
14682 switch (Op.getOpcode()) {
14683 case ISD::VECREDUCE_ADD:
14684 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
14685 case ISD::VECREDUCE_AND:
14686 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
14687 case ISD::VECREDUCE_OR:
14688 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
14689 case ISD::VECREDUCE_SMAX:
14690 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
14691 case ISD::VECREDUCE_SMIN:
14692 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
14693 case ISD::VECREDUCE_UMAX:
14694 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
14695 case ISD::VECREDUCE_UMIN:
14696 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
14697 case ISD::VECREDUCE_XOR:
14698 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
14699 case ISD::VECREDUCE_FADD:
14700 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
14701 case ISD::VECREDUCE_FMAX:
14702 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
14703 case ISD::VECREDUCE_FMIN:
14704 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
14705 case ISD::VECREDUCE_FMAXIMUM:
14706 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
14707 case ISD::VECREDUCE_FMINIMUM:
14708 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
14709 default:
14710 llvm_unreachable("Unhandled fixed length reduction");
14711 }
14712 }
14713
14714 // Lower NEON reductions.
14715 SDLoc dl(Op);
14716 switch (Op.getOpcode()) {
14717 case ISD::VECREDUCE_AND:
14718 case ISD::VECREDUCE_OR:
14719 case ISD::VECREDUCE_XOR:
14720 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
14721 Op.getValueType(), dl, DAG);
14722 case ISD::VECREDUCE_ADD:
14723 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
14724 case ISD::VECREDUCE_SMAX:
14725 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
14726 case ISD::VECREDUCE_SMIN:
14727 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
14728 case ISD::VECREDUCE_UMAX:
14729 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
14730 case ISD::VECREDUCE_UMIN:
14731 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
14732 default:
14733 llvm_unreachable("Unhandled reduction");
14734 }
14735}
14736
14737SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
14738 SelectionDAG &DAG) const {
14739 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14740 // No point replacing if we don't have the relevant instruction/libcall anyway
14741 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
14742 return SDValue();
14743
14744 // LSE has an atomic load-clear instruction, but not a load-and.
14745 SDLoc dl(Op);
14746 MVT VT = Op.getSimpleValueType();
14747 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
14748 SDValue RHS = Op.getOperand(2);
14749 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
14750 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
14751 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
14752 Op.getOperand(0), Op.getOperand(1), RHS,
14753 AN->getMemOperand());
14754}
14755
14756SDValue
14757AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
14758 SelectionDAG &DAG) const {
14759
14760 SDLoc dl(Op);
14761 // Get the inputs.
14762 SDNode *Node = Op.getNode();
14763 SDValue Chain = Op.getOperand(0);
14764 SDValue Size = Op.getOperand(1);
14765 MaybeAlign Align =
14766 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14767 EVT VT = Node->getValueType(0);
14768
14769 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
14770 "no-stack-arg-probe")) {
14771 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14772 Chain = SP.getValue(1);
14773 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14774 if (Align)
14775 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14776 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14777 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14778 SDValue Ops[2] = {SP, Chain};
14779 return DAG.getMergeValues(Ops, dl);
14780 }
14781
14782 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
14783
14784 EVT PtrVT = getPointerTy(DAG.getDataLayout());
14785 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
14786 PtrVT, 0);
14787
14788 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
14789 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
14790 if (Subtarget->hasCustomCallingConv())
14791 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
14792
14793 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
14794 DAG.getConstant(4, dl, MVT::i64));
14795 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
14796 Chain =
14797 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
14798 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
14799 DAG.getRegisterMask(Mask), Chain.getValue(1));
14800 // To match the actual intent better, we should read the output from X15 here
14801 // again (instead of potentially spilling it to the stack), but rereading Size
14802 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
14803 // here.
14804
14805 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
14806 DAG.getConstant(4, dl, MVT::i64));
14807
14808 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14809 Chain = SP.getValue(1);
14810 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14811 if (Align)
14812 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14813 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14814 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14815
14816 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
14817
14818 SDValue Ops[2] = {SP, Chain};
14819 return DAG.getMergeValues(Ops, dl);
14820}
14821
14822SDValue
14823AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
14824 SelectionDAG &DAG) const {
14825 // Get the inputs.
14826 SDNode *Node = Op.getNode();
14827 SDValue Chain = Op.getOperand(0);
14828 SDValue Size = Op.getOperand(1);
14829
14830 MaybeAlign Align =
14831 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14832 SDLoc dl(Op);
14833 EVT VT = Node->getValueType(0);
14834
14835 // Construct the new SP value in a GPR.
14836 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14837 Chain = SP.getValue(1);
14838 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14839 if (Align)
14840 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14841 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14842
14843 // Set the real SP to the new value with a probing loop.
14844 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
14845 SDValue Ops[2] = {SP, Chain};
14846 return DAG.getMergeValues(Ops, dl);
14847}
14848
14849SDValue
14850AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
14851 SelectionDAG &DAG) const {
14852 MachineFunction &MF = DAG.getMachineFunction();
14853
14854 if (Subtarget->isTargetWindows())
14855 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
14856 else if (hasInlineStackProbe(MF))
14857 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
14858 else
14859 return SDValue();
14860}
14861
14862// When x and y are extended, lower:
14863// avgfloor(x, y) -> (x + y) >> 1
14864// avgceil(x, y) -> (x + y + 1) >> 1
14865
14866// Otherwise, lower to:
14867// avgfloor(x, y) -> (x >> 1) + (y >> 1) + (x & y & 1)
14868// avgceil(x, y)  -> (x >> 1) + (y >> 1) + ((x | y) & 1)
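// As a concrete check, for 8-bit unsigned x = 250 and y = 253:
//   avgfloor = 125 + 126 + (250 & 253 & 1) = 251 = (250 + 253) >> 1
//   avgceil  = 125 + 126 + ((250 | 253) & 1) = 252 = (250 + 253 + 1) >> 1
// and neither form overflows the 8-bit element, unlike the direct x + y (503).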
14869SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
14870 unsigned NewOp) const {
14871 if (Subtarget->hasSVE2())
14872 return LowerToPredicatedOp(Op, DAG, NewOp);
14873
14874 SDLoc dl(Op);
14875 SDValue OpA = Op->getOperand(0);
14876 SDValue OpB = Op->getOperand(1);
14877 EVT VT = Op.getValueType();
14878 bool IsCeil =
14879 (Op->getOpcode() == ISD::AVGCEILS || Op->getOpcode() == ISD::AVGCEILU);
14880 bool IsSigned =
14881 (Op->getOpcode() == ISD::AVGFLOORS || Op->getOpcode() == ISD::AVGCEILS);
14882 unsigned ShiftOpc = IsSigned ? ISD::SRA : ISD::SRL;
14883
14884 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
14885
14886 auto IsZeroExtended = [&DAG](SDValue &Node) {
14887 KnownBits Known = DAG.computeKnownBits(Node, 0);
14888 return Known.Zero.isSignBitSet();
14889 };
14890
14891 auto IsSignExtended = [&DAG](SDValue &Node) {
14892 return (DAG.ComputeNumSignBits(Node, 0) > 1);
14893 };
14894
14895 SDValue ConstantOne = DAG.getConstant(1, dl, VT);
14896 if ((!IsSigned && IsZeroExtended(OpA) && IsZeroExtended(OpB)) ||
14897 (IsSigned && IsSignExtended(OpA) && IsSignExtended(OpB))) {
14898 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, OpA, OpB);
14899 if (IsCeil)
14900 Add = DAG.getNode(ISD::ADD, dl, VT, Add, ConstantOne);
14901 return DAG.getNode(ShiftOpc, dl, VT, Add, ConstantOne);
14902 }
14903
14904 SDValue ShiftOpA = DAG.getNode(ShiftOpc, dl, VT, OpA, ConstantOne);
14905 SDValue ShiftOpB = DAG.getNode(ShiftOpc, dl, VT, OpB, ConstantOne);
14906
14907 SDValue tmp = DAG.getNode(IsCeil ? ISD::OR : ISD::AND, dl, VT, OpA, OpB);
14908 tmp = DAG.getNode(ISD::AND, dl, VT, tmp, ConstantOne);
14909 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, ShiftOpA, ShiftOpB);
14910 return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
14911}
14912
14913SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
14914 SelectionDAG &DAG) const {
14915 EVT VT = Op.getValueType();
14916 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
14917
14918 SDLoc DL(Op);
14919 APInt MulImm = Op.getConstantOperandAPInt(0);
14920 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
14921 VT);
14922}
14923
14924/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
14925template <unsigned NumVecs>
14926static bool
14927setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
14928 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
14929 Info.opc = ISD::INTRINSIC_VOID;
14930 // Retrieve EC from first vector argument.
14931 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
14932 ElementCount EC = VT.getVectorElementCount();
14933#ifndef NDEBUG
14934 // Check the assumption that all input vectors are the same type.
14935 for (unsigned I = 0; I < NumVecs; ++I)
14936 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
14937 "Invalid type.");
14938#endif
14939 // memVT is `NumVecs * VT`.
14940 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
14941 EC * NumVecs);
14942 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
14943 Info.offset = 0;
14944 Info.align.reset();
14946 return true;
14947}
14948
14949/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
14950/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
14951/// specified in the intrinsic calls.
14952bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
14953 const CallInst &I,
14954 MachineFunction &MF,
14955 unsigned Intrinsic) const {
14956 auto &DL = I.getModule()->getDataLayout();
14957 switch (Intrinsic) {
14958 case Intrinsic::aarch64_sve_st2:
14959 return setInfoSVEStN<2>(*this, DL, Info, I);
14960 case Intrinsic::aarch64_sve_st3:
14961 return setInfoSVEStN<3>(*this, DL, Info, I);
14962 case Intrinsic::aarch64_sve_st4:
14963 return setInfoSVEStN<4>(*this, DL, Info, I);
14964 case Intrinsic::aarch64_neon_ld2:
14965 case Intrinsic::aarch64_neon_ld3:
14966 case Intrinsic::aarch64_neon_ld4:
14967 case Intrinsic::aarch64_neon_ld1x2:
14968 case Intrinsic::aarch64_neon_ld1x3:
14969 case Intrinsic::aarch64_neon_ld1x4: {
14971 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
14972 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14973 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14974 Info.offset = 0;
14975 Info.align.reset();
14976 // volatile loads with NEON intrinsics not supported
14978 return true;
14979 }
14980 case Intrinsic::aarch64_neon_ld2lane:
14981 case Intrinsic::aarch64_neon_ld3lane:
14982 case Intrinsic::aarch64_neon_ld4lane:
14983 case Intrinsic::aarch64_neon_ld2r:
14984 case Intrinsic::aarch64_neon_ld3r:
14985 case Intrinsic::aarch64_neon_ld4r: {
14987 // These ldN intrinsics return a struct whose members all have the same vector type.
14988 Type *RetTy = I.getType();
14989 auto *StructTy = cast<StructType>(RetTy);
14990 unsigned NumElts = StructTy->getNumElements();
14991 Type *VecTy = StructTy->getElementType(0);
14992 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
14993 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
14994 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14995 Info.offset = 0;
14996 Info.align.reset();
14997 // volatile loads with NEON intrinsics not supported
14999 return true;
15000 }
15001 case Intrinsic::aarch64_neon_st2:
15002 case Intrinsic::aarch64_neon_st3:
15003 case Intrinsic::aarch64_neon_st4:
15004 case Intrinsic::aarch64_neon_st1x2:
15005 case Intrinsic::aarch64_neon_st1x3:
15006 case Intrinsic::aarch64_neon_st1x4: {
15008 unsigned NumElts = 0;
15009 for (const Value *Arg : I.args()) {
15010 Type *ArgTy = Arg->getType();
15011 if (!ArgTy->isVectorTy())
15012 break;
15013 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
15014 }
15015 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
15016 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
15017 Info.offset = 0;
15018 Info.align.reset();
15019 // volatile stores with NEON intrinsics not supported
15021 return true;
15022 }
15023 case Intrinsic::aarch64_neon_st2lane:
15024 case Intrinsic::aarch64_neon_st3lane:
15025 case Intrinsic::aarch64_neon_st4lane: {
15027 unsigned NumElts = 0;
15028 // All of the vector arguments have the same type.
15029 Type *VecTy = I.getArgOperand(0)->getType();
15030 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
15031
15032 for (const Value *Arg : I.args()) {
15033 Type *ArgTy = Arg->getType();
15034 if (!ArgTy->isVectorTy())
15035 break;
15036 NumElts += 1;
15037 }
15038
15039 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
15040 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
15041 Info.offset = 0;
15042 Info.align.reset();
15043 // volatile stores with NEON intrinsics not supported
15045 return true;
15046 }
15047 case Intrinsic::aarch64_ldaxr:
15048 case Intrinsic::aarch64_ldxr: {
15049 Type *ValTy = I.getParamElementType(0);
15051 Info.memVT = MVT::getVT(ValTy);
15052 Info.ptrVal = I.getArgOperand(0);
15053 Info.offset = 0;
15054 Info.align = DL.getABITypeAlign(ValTy);
15056 return true;
15057 }
15058 case Intrinsic::aarch64_stlxr:
15059 case Intrinsic::aarch64_stxr: {
15060 Type *ValTy = I.getParamElementType(1);
15062 Info.memVT = MVT::getVT(ValTy);
15063 Info.ptrVal = I.getArgOperand(1);
15064 Info.offset = 0;
15065 Info.align = DL.getABITypeAlign(ValTy);
15067 return true;
15068 }
15069 case Intrinsic::aarch64_ldaxp:
15070 case Intrinsic::aarch64_ldxp:
15072 Info.memVT = MVT::i128;
15073 Info.ptrVal = I.getArgOperand(0);
15074 Info.offset = 0;
15075 Info.align = Align(16);
15077 return true;
15078 case Intrinsic::aarch64_stlxp:
15079 case Intrinsic::aarch64_stxp:
15081 Info.memVT = MVT::i128;
15082 Info.ptrVal = I.getArgOperand(2);
15083 Info.offset = 0;
15084 Info.align = Align(16);
15086 return true;
15087 case Intrinsic::aarch64_sve_ldnt1: {
15088 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
15090 Info.memVT = MVT::getVT(I.getType());
15091 Info.ptrVal = I.getArgOperand(1);
15092 Info.offset = 0;
15093 Info.align = DL.getABITypeAlign(ElTy);
15095 return true;
15096 }
15097 case Intrinsic::aarch64_sve_stnt1: {
15098 Type *ElTy =
15099 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
15101 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
15102 Info.ptrVal = I.getArgOperand(2);
15103 Info.offset = 0;
15104 Info.align = DL.getABITypeAlign(ElTy);
15106 return true;
15107 }
15108 case Intrinsic::aarch64_mops_memset_tag: {
15109 Value *Dst = I.getArgOperand(0);
15110 Value *Val = I.getArgOperand(1);
15112 Info.memVT = MVT::getVT(Val->getType());
15113 Info.ptrVal = Dst;
15114 Info.offset = 0;
15115 Info.align = I.getParamAlign(0).valueOrOne();
15117 // The size of the memory being operated on is unknown at this point
15119 return true;
15120 }
15121 default:
15122 break;
15123 }
15124
15125 return false;
15126}
15127
15128bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
15129 ISD::LoadExtType ExtTy,
15130 EVT NewVT) const {
15131 // TODO: This may be worth removing. Check regression tests for diffs.
15132 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
15133 return false;
15134
15135 // If we're reducing the load width in order to avoid having to use an extra
15136 // instruction to do extension then it's probably a good idea.
15137 if (ExtTy != ISD::NON_EXTLOAD)
15138 return true;
15139 // Don't reduce load width if it would prevent us from combining a shift into
15140 // the offset.
15141 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
15142 assert(Mem);
15143 const SDValue &Base = Mem->getBasePtr();
15144 if (Base.getOpcode() == ISD::ADD &&
15145 Base.getOperand(1).getOpcode() == ISD::SHL &&
15146 Base.getOperand(1).hasOneUse() &&
15147 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
15148 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
15149 if (Mem->getMemoryVT().isScalableVector())
15150 return false;
15151 // The shift can be combined if it matches the size of the value being
15152 // loaded (and so reducing the width would make it not match).
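 // For example, an i64 load of (add x, (shl y, 3)) can use the
 // [x, y, lsl #3] addressing mode, but a narrowed i32 load would need
 // lsl #2, so the shift could no longer be folded.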
15153 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
15154 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
15155 if (ShiftAmount == Log2_32(LoadBytes))
15156 return false;
15157 }
15158 // We have no reason to disallow reducing the load width, so allow it.
15159 return true;
15160}
15161
15162// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
15163 bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
15164 EVT VT = Extend.getValueType();
15165 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
15166 SDValue Extract = Extend.getOperand(0);
15167 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
15168 Extract = Extract.getOperand(0);
15169 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
15170 EVT VecVT = Extract.getOperand(0).getValueType();
15171 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
15172 return false;
15173 }
15174 }
15175 return true;
15176}
15177
15178 // Truncations from 64-bit GPR to 32-bit GPR are free.
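// Illustrative note (editor's example): 'trunc i64 %x to i32' needs no
// instruction, since reading the W-form of an X register already yields the
// low 32 bits.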
15179 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
15180 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15181 return false;
15182 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
15183 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
15184 return NumBits1 > NumBits2;
15185}
15186 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
15187 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15188 return false;
15189 uint64_t NumBits1 = VT1.getFixedSizeInBits();
15190 uint64_t NumBits2 = VT2.getFixedSizeInBits();
15191 return NumBits1 > NumBits2;
15192}
15193
15194/// Check if it is profitable to hoist instruction in then/else to if.
15195 /// Not profitable if I and its user can form an FMA instruction
15196/// because we prefer FMSUB/FMADD.
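/// Illustrative example: for 'if (c) r = a * b - d;', hoisting only the fmul
/// above the branch separates it from its fsub user, so the backend can no
/// longer fuse the pair into a single fused multiply-subtract.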
15197 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
15198 if (I->getOpcode() != Instruction::FMul)
15199 return true;
15200
15201 if (!I->hasOneUse())
15202 return true;
15203
15204 Instruction *User = I->user_back();
15205
15206 if (!(User->getOpcode() == Instruction::FSub ||
15207 User->getOpcode() == Instruction::FAdd))
15208 return true;
15209
15210 const TargetOptions &Options = getTargetMachine().Options;
15211 const Function *F = I->getFunction();
15212 const DataLayout &DL = F->getParent()->getDataLayout();
15213 Type *Ty = User->getOperand(0)->getType();
15214
15215 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
15216 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
15217 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
15218 Options.UnsafeFPMath));
15219}
15220
15221// All 32-bit GPR operations implicitly zero the high-half of the corresponding
15222// 64-bit GPR.
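// Illustrative example: 'add w0, w1, w2' also clears bits [63:32] of x0, so a
// following 'zext i32 %r to i64' of that result costs nothing.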
15223 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
15224 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15225 return false;
15226 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
15227 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
15228 return NumBits1 == 32 && NumBits2 == 64;
15229}
15230 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
15231 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15232 return false;
15233 unsigned NumBits1 = VT1.getSizeInBits();
15234 unsigned NumBits2 = VT2.getSizeInBits();
15235 return NumBits1 == 32 && NumBits2 == 64;
15236}
15237
15238 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
15239 EVT VT1 = Val.getValueType();
15240 if (isZExtFree(VT1, VT2)) {
15241 return true;
15242 }
15243
15244 if (Val.getOpcode() != ISD::LOAD)
15245 return false;
15246
15247 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
15248 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
15249 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
15250 VT1.getSizeInBits() <= 32);
15251}
15252
15253bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
15254 if (isa<FPExtInst>(Ext))
15255 return false;
15256
15257 // Vector types are not free.
15258 if (Ext->getType()->isVectorTy())
15259 return false;
15260
15261 for (const Use &U : Ext->uses()) {
15262 // The extension is free if we can fold it with a left shift in an
15263 // addressing mode or an arithmetic operation: add, sub, and cmp.
15264
15265 // Is there a shift?
15266 const Instruction *Instr = cast<Instruction>(U.getUser());
15267
15268 // Is this a constant shift?
15269 switch (Instr->getOpcode()) {
15270 case Instruction::Shl:
15271 if (!isa<ConstantInt>(Instr->getOperand(1)))
15272 return false;
15273 break;
15274 case Instruction::GetElementPtr: {
15275 gep_type_iterator GTI = gep_type_begin(Instr);
15276 auto &DL = Ext->getModule()->getDataLayout();
15277 std::advance(GTI, U.getOperandNo()-1);
15278 Type *IdxTy = GTI.getIndexedType();
15279 // This extension will end up with a shift because of the scaling factor.
15280 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
15281 // Get the shift amount based on the scaling factor:
15282 // log2(sizeof(IdxTy)) - log2(8).
15283 if (IdxTy->isScalableTy())
15284 return false;
15285 uint64_t ShiftAmt =
15286 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
15287 3;
15288 // Is the constant foldable in the shift of the addressing mode?
15289 // I.e., shift amount is between 1 and 4 inclusive.
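// Illustrative example: indexing an i32 array gives ShiftAmt == 2, which can
// fold into a scaled addressing mode such as 'ldr w0, [x0, w1, sxtw #2]'.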
15290 if (ShiftAmt == 0 || ShiftAmt > 4)
15291 return false;
15292 break;
15293 }
15294 case Instruction::Trunc:
15295 // Check if this is a noop.
15296 // trunc(sext ty1 to ty2) to ty1.
15297 if (Instr->getType() == Ext->getOperand(0)->getType())
15298 continue;
15299 [[fallthrough]];
15300 default:
15301 return false;
15302 }
15303
15304 // At this point we can use the bfm family, so this extension is free
15305 // for that use.
15306 }
15307 return true;
15308}
15309
15310static bool isSplatShuffle(Value *V) {
15311 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
15312 return all_equal(Shuf->getShuffleMask());
15313 return false;
15314}
15315
15316/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
15317/// or upper half of the vector elements.
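/// Illustrative example:
///   %lo = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
///   %hi = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>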
15318static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
15319 bool AllowSplat = false) {
15320 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
15321 auto *FullTy = FullV->getType();
15322 auto *HalfTy = HalfV->getType();
15323 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
15324 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
15325 };
15326
15327 auto extractHalf = [](Value *FullV, Value *HalfV) {
15328 auto *FullVT = cast<FixedVectorType>(FullV->getType());
15329 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
15330 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
15331 };
15332
15333 ArrayRef<int> M1, M2;
15334 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
15335 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
15336 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
15337 return false;
15338
15339 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
15340 // it is not checked as an extract below.
15341 if (AllowSplat && isSplatShuffle(Op1))
15342 S1Op1 = nullptr;
15343 if (AllowSplat && isSplatShuffle(Op2))
15344 S2Op1 = nullptr;
15345
15346 // Check that the operands are half as wide as the result and we extract
15347 // half of the elements of the input vectors.
15348 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
15349 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
15350 return false;
15351
15352 // Check the mask extracts either the lower or upper half of vector
15353 // elements.
15354 int M1Start = 0;
15355 int M2Start = 0;
15356 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
15357 if ((S1Op1 &&
15358 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
15359 (S2Op1 &&
15360 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
15361 return false;
15362
15363 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
15364 (M2Start != 0 && M2Start != (NumElements / 2)))
15365 return false;
15366 if (S1Op1 && S2Op1 && M1Start != M2Start)
15367 return false;
15368
15369 return true;
15370}
15371
15372/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
15373/// of the vector elements.
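/// Illustrative example: 'sext <4 x i16> %a to <4 x i32>' doubles each element
/// from 16 to 32 bits and satisfies this check.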
15374static bool areExtractExts(Value *Ext1, Value *Ext2) {
15375 auto areExtDoubled = [](Instruction *Ext) {
15376 return Ext->getType()->getScalarSizeInBits() ==
15377 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
15378 };
15379
15380 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
15381 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
15382 !areExtDoubled(cast<Instruction>(Ext1)) ||
15383 !areExtDoubled(cast<Instruction>(Ext2)))
15384 return false;
15385
15386 return true;
15387}
15388
15389/// Check if Op could be used with vmull_high_p64 intrinsic.
15390 static bool isOperandOfVmullHighP64(Value *Op) {
15391 Value *VectorOperand = nullptr;
15392 ConstantInt *ElementIndex = nullptr;
15393 return match(Op, m_ExtractElt(m_Value(VectorOperand),
15394 m_ConstantInt(ElementIndex))) &&
15395 ElementIndex->getValue() == 1 &&
15396 isa<FixedVectorType>(VectorOperand->getType()) &&
15397 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
15398}
15399
15400/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
15401 static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
15402 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
15403 }
15404
15405 static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
15406 // Restrict ourselves to the form CodeGenPrepare typically constructs.
15407 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
15408 if (!GEP || GEP->getNumOperands() != 2)
15409 return false;
15410
15411 Value *Base = GEP->getOperand(0);
15412 Value *Offsets = GEP->getOperand(1);
15413
15414 // We only care about scalar_base+vector_offsets.
15415 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
15416 return false;
15417
15418 // Sink extends that would allow us to use 32-bit offset vectors.
15419 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
15420 auto *OffsetsInst = cast<Instruction>(Offsets);
15421 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
15422 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
15423 Ops.push_back(&GEP->getOperandUse(1));
15424 }
15425
15426 // Sink the GEP.
15427 return true;
15428}
15429
15430 /// We want to sink the following cases:
15431/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
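/// Illustrative example:
///   %vs = call i64 @llvm.vscale.i64()
///   %sh = shl i64 %vs, 4
///   %p  = getelementptr i8, ptr %A, i64 %sh
/// Sinking the vscale computation next to the gep lets ISel fold the scaled
/// offset into a single vscale-based address computation.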
15432 static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
15433 if (match(Op, m_VScale()))
15434 return true;
15435 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
15436 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
15437 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
15438 return true;
15439 }
15440 return false;
15441}
15442
15443/// Check if sinking \p I's operands to I's basic block is profitable, because
15444/// the operands can be folded into a target instruction, e.g.
15445/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
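/// Illustrative example: 'sub (zext <8 x i8> %a to <8 x i16>), (zext <8 x i8> %b to <8 x i16>)'
/// can be selected as a single USUBL once the zexts sit next to the sub.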
15446 bool AArch64TargetLowering::shouldSinkOperands(
15447 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
15448 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
15449 switch (II->getIntrinsicID()) {
15450 case Intrinsic::aarch64_neon_smull:
15451 case Intrinsic::aarch64_neon_umull:
15452 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
15453 /*AllowSplat=*/true)) {
15454 Ops.push_back(&II->getOperandUse(0));
15455 Ops.push_back(&II->getOperandUse(1));
15456 return true;
15457 }
15458 [[fallthrough]];
15459
15460 case Intrinsic::fma:
15461 if (isa<VectorType>(I->getType()) &&
15462 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
15463 !Subtarget->hasFullFP16())
15464 return false;
15465 [[fallthrough]];
15466 case Intrinsic::aarch64_neon_sqdmull:
15467 case Intrinsic::aarch64_neon_sqdmulh:
15468 case Intrinsic::aarch64_neon_sqrdmulh:
15469 // Sink splats for index lane variants
15470 if (isSplatShuffle(II->getOperand(0)))
15471 Ops.push_back(&II->getOperandUse(0));
15472 if (isSplatShuffle(II->getOperand(1)))
15473 Ops.push_back(&II->getOperandUse(1));
15474 return !Ops.empty();
15475 case Intrinsic::aarch64_neon_fmlal:
15476 case Intrinsic::aarch64_neon_fmlal2:
15477 case Intrinsic::aarch64_neon_fmlsl:
15478 case Intrinsic::aarch64_neon_fmlsl2:
15479 // Sink splats for index lane variants
15480 if (isSplatShuffle(II->getOperand(1)))
15481 Ops.push_back(&II->getOperandUse(1));
15482 if (isSplatShuffle(II->getOperand(2)))
15483 Ops.push_back(&II->getOperandUse(2));
15484 return !Ops.empty();
15485 case Intrinsic::aarch64_sve_ptest_first:
15486 case Intrinsic::aarch64_sve_ptest_last:
15487 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
15488 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
15489 Ops.push_back(&II->getOperandUse(0));
15490 return !Ops.empty();
15491 case Intrinsic::aarch64_sme_write_horiz:
15492 case Intrinsic::aarch64_sme_write_vert:
15493 case Intrinsic::aarch64_sme_writeq_horiz:
15494 case Intrinsic::aarch64_sme_writeq_vert: {
15495 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
15496 if (!Idx || Idx->getOpcode() != Instruction::Add)
15497 return false;
15498 Ops.push_back(&II->getOperandUse(1));
15499 return true;
15500 }
15501 case Intrinsic::aarch64_sme_read_horiz:
15502 case Intrinsic::aarch64_sme_read_vert:
15503 case Intrinsic::aarch64_sme_readq_horiz:
15504 case Intrinsic::aarch64_sme_readq_vert:
15505 case Intrinsic::aarch64_sme_ld1b_vert:
15506 case Intrinsic::aarch64_sme_ld1h_vert:
15507 case Intrinsic::aarch64_sme_ld1w_vert:
15508 case Intrinsic::aarch64_sme_ld1d_vert:
15509 case Intrinsic::aarch64_sme_ld1q_vert:
15510 case Intrinsic::aarch64_sme_st1b_vert:
15511 case Intrinsic::aarch64_sme_st1h_vert:
15512 case Intrinsic::aarch64_sme_st1w_vert:
15513 case Intrinsic::aarch64_sme_st1d_vert:
15514 case Intrinsic::aarch64_sme_st1q_vert:
15515 case Intrinsic::aarch64_sme_ld1b_horiz:
15516 case Intrinsic::aarch64_sme_ld1h_horiz:
15517 case Intrinsic::aarch64_sme_ld1w_horiz:
15518 case Intrinsic::aarch64_sme_ld1d_horiz:
15519 case Intrinsic::aarch64_sme_ld1q_horiz:
15520 case Intrinsic::aarch64_sme_st1b_horiz:
15521 case Intrinsic::aarch64_sme_st1h_horiz:
15522 case Intrinsic::aarch64_sme_st1w_horiz:
15523 case Intrinsic::aarch64_sme_st1d_horiz:
15524 case Intrinsic::aarch64_sme_st1q_horiz: {
15525 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
15526 if (!Idx || Idx->getOpcode() != Instruction::Add)
15527 return false;
15528 Ops.push_back(&II->getOperandUse(3));
15529 return true;
15530 }
15531 case Intrinsic::aarch64_neon_pmull:
15532 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
15533 return false;
15534 Ops.push_back(&II->getOperandUse(0));
15535 Ops.push_back(&II->getOperandUse(1));
15536 return true;
15537 case Intrinsic::aarch64_neon_pmull64:
15538 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
15539 II->getArgOperand(1)))
15540 return false;
15541 Ops.push_back(&II->getArgOperandUse(0));
15542 Ops.push_back(&II->getArgOperandUse(1));
15543 return true;
15544 case Intrinsic::masked_gather:
15545 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
15546 return false;
15547 Ops.push_back(&II->getArgOperandUse(0));
15548 return true;
15549 case Intrinsic::masked_scatter:
15550 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
15551 return false;
15552 Ops.push_back(&II->getArgOperandUse(1));
15553 return true;
15554 default:
15555 return false;
15556 }
15557 }
15558
15559 // Sink vscales closer to uses for better isel
15560 switch (I->getOpcode()) {
15561 case Instruction::GetElementPtr:
15562 case Instruction::Add:
15563 case Instruction::Sub:
15564 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
15565 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
15566 Ops.push_back(&I->getOperandUse(Op));
15567 return true;
15568 }
15569 }
15570 break;
15571 default:
15572 break;
15573 }
15574
15575 if (!I->getType()->isVectorTy())
15576 return false;
15577
15578 switch (I->getOpcode()) {
15579 case Instruction::Sub:
15580 case Instruction::Add: {
15581 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
15582 return false;
15583
15584 // If the exts' operands extract either the lower or upper elements, we
15585 // can sink them too.
15586 auto Ext1 = cast<Instruction>(I->getOperand(0));
15587 auto Ext2 = cast<Instruction>(I->getOperand(1));
15588 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
15589 Ops.push_back(&Ext1->getOperandUse(0));
15590 Ops.push_back(&Ext2->getOperandUse(0));
15591 }
15592
15593 Ops.push_back(&I->getOperandUse(0));
15594 Ops.push_back(&I->getOperandUse(1));
15595
15596 return true;
15597 }
15598 case Instruction::Or: {
15599 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
15600 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
15601 if (Subtarget->hasNEON()) {
15602 Instruction *OtherAnd, *IA, *IB;
15603 Value *MaskValue;
15604 // MainAnd refers to And instruction that has 'Not' as one of its operands
15605 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
15606 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
15607 m_Instruction(IA)))))) {
15608 if (match(OtherAnd,
15609 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
15610 Instruction *MainAnd = I->getOperand(0) == OtherAnd
15611 ? cast<Instruction>(I->getOperand(1))
15612 : cast<Instruction>(I->getOperand(0));
15613
15614 // Both Ands should be in same basic block as Or
15615 if (I->getParent() != MainAnd->getParent() ||
15616 I->getParent() != OtherAnd->getParent())
15617 return false;
15618
15619 // Non-mask operands of both Ands should also be in same basic block
15620 if (I->getParent() != IA->getParent() ||
15621 I->getParent() != IB->getParent())
15622 return false;
15623
15624 Ops.push_back(&MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
15625 Ops.push_back(&I->getOperandUse(0));
15626 Ops.push_back(&I->getOperandUse(1));
15627
15628 return true;
15629 }
15630 }
15631 }
15632
15633 return false;
15634 }
15635 case Instruction::Mul: {
15636 int NumZExts = 0, NumSExts = 0;
15637 for (auto &Op : I->operands()) {
15638 // Make sure we are not already sinking this operand
15639 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
15640 continue;
15641
15642 if (match(&Op, m_SExt(m_Value()))) {
15643 NumSExts++;
15644 continue;
15645 } else if (match(&Op, m_ZExt(m_Value()))) {
15646 NumZExts++;
15647 continue;
15648 }
15649
15650 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
15651
15652 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
15653 // operand and the s/zext can help create indexed s/umull. This is
15654 // especially useful to prevent i64 mul being scalarized.
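// Illustrative example: mul(zext(<2 x i32> %a), splat of a zext'ed lane) can
// then be selected as an indexed 'umull v0.2d, v1.2s, v2.s[0]' instead of
// scalarized 64-bit multiplies.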
15655 if (Shuffle && isSplatShuffle(Shuffle) &&
15656 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
15657 Ops.push_back(&Shuffle->getOperandUse(0));
15658 Ops.push_back(&Op);
15659 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
15660 NumSExts++;
15661 else
15662 NumZExts++;
15663 continue;
15664 }
15665
15666 if (!Shuffle)
15667 continue;
15668
15669 Value *ShuffleOperand = Shuffle->getOperand(0);
15670 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
15671 if (!Insert)
15672 continue;
15673
15674 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
15675 if (!OperandInstr)
15676 continue;
15677
15678 ConstantInt *ElementConstant =
15679 dyn_cast<ConstantInt>(Insert->getOperand(2));
15680 // Check that the insertelement is inserting into element 0
15681 if (!ElementConstant || !ElementConstant->isZero())
15682 continue;
15683
15684 unsigned Opcode = OperandInstr->getOpcode();
15685 if (Opcode == Instruction::SExt)
15686 NumSExts++;
15687 else if (Opcode == Instruction::ZExt)
15688 NumZExts++;
15689 else {
15690 // If we find that the top bits are known 0, then we can sink and allow
15691 // the backend to generate a umull.
15692 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
15693 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
15694 const DataLayout &DL = I->getFunction()->getParent()->getDataLayout();
15695 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
15696 continue;
15697 NumZExts++;
15698 }
15699
15700 Ops.push_back(&Shuffle->getOperandUse(0));
15701 Ops.push_back(&Op);
15702 }
15703
15704 // It is only profitable to sink if we found two extends of the same type.
15705 return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
15706 }
15707 default:
15708 return false;
15709 }
15710 return false;
15711}
15712
15713 static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy,
15714 bool IsLittleEndian) {
15715 Value *Op = ZExt->getOperand(0);
15716 auto *SrcTy = cast<FixedVectorType>(Op->getType());
15717 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15718 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15719 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
15720 return false;
15721
15722 assert(DstWidth % SrcWidth == 0 &&
15723 "TBL lowering is not supported for a ZExt instruction with this "
15724 "source & destination element type.");
15725 unsigned ZExtFactor = DstWidth / SrcWidth;
15726 unsigned NumElts = SrcTy->getNumElements();
15727 IRBuilder<> Builder(ZExt);
15728 SmallVector<int> Mask;
15729 // Create a mask that selects <0,...,Op[i]> for each lane of the destination
15730 // vector to replace the original ZExt. This can later be lowered to a set of
15731 // tbl instructions.
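// Illustrative example: zext <8 x i8> to <8 x i32> has ZExtFactor 4 and, on
// little-endian, produces the mask <0,8,8,8, 1,8,8,8, ...>, where lane 8
// selects the zero element inserted below.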
15732 for (unsigned i = 0; i < NumElts * ZExtFactor; i++) {
15733 if (IsLittleEndian) {
15734 if (i % ZExtFactor == 0)
15735 Mask.push_back(i / ZExtFactor);
15736 else
15737 Mask.push_back(NumElts);
15738 } else {
15739 if ((i + 1) % ZExtFactor == 0)
15740 Mask.push_back((i - ZExtFactor + 1) / ZExtFactor);
15741 else
15742 Mask.push_back(NumElts);
15743 }
15744 }
15745
15746 auto *FirstEltZero = Builder.CreateInsertElement(
15747 PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
15748 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
15749 Result = Builder.CreateBitCast(Result, DstTy);
15750 if (DstTy != ZExt->getType())
15751 Result = Builder.CreateZExt(Result, ZExt->getType());
15752 ZExt->replaceAllUsesWith(Result);
15753 ZExt->eraseFromParent();
15754 return true;
15755}
15756
15757static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
15758 IRBuilder<> Builder(TI);
15759 SmallVector<Value *> Parts;
15760 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
15761 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
15762 auto *DstTy = cast<FixedVectorType>(TI->getType());
15763 assert(SrcTy->getElementType()->isIntegerTy() &&
15764 "Non-integer type source vector element is not supported");
15765 assert(DstTy->getElementType()->isIntegerTy(8) &&
15766 "Unsupported destination vector element type");
15767 unsigned SrcElemTySz =
15768 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15769 unsigned DstElemTySz =
15770 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15771 assert((SrcElemTySz % DstElemTySz == 0) &&
15772 "Cannot lower truncate to tbl instructions for a source element size "
15773 "that is not divisible by the destination element size");
15774 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
15775 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
15776 "Unsupported source vector element type size");
15777 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
15778
15779 // Create a mask to choose every nth byte from the source vector table of
15780 // bytes to create the truncated destination vector, where 'n' is the truncate
15781 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
15782 // 0,8,16,..Y*8th bytes for the little-endian format
15783 SmallVector<Constant *, 16> MaskConst;
15784 for (int Itr = 0; Itr < 16; Itr++) {
15785 if (Itr < NumElements)
15786 MaskConst.push_back(Builder.getInt8(
15787 IsLittleEndian ? Itr * TruncFactor
15788 : Itr * TruncFactor + (TruncFactor - 1)));
15789 else
15790 MaskConst.push_back(Builder.getInt8(255));
15791 }
15792
15793 int MaxTblSz = 128 * 4;
15794 int MaxSrcSz = SrcElemTySz * NumElements;
15795 int ElemsPerTbl =
15796 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
15797 assert(ElemsPerTbl <= 16 &&
15798 "Maximum elements selected using TBL instruction cannot exceed 16!");
15799
15800 int ShuffleCount = 128 / SrcElemTySz;
15801 SmallVector<int> ShuffleLanes;
15802 for (int i = 0; i < ShuffleCount; ++i)
15803 ShuffleLanes.push_back(i);
15804
15805 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
15806 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
15807 // call TBL & save the result in a vector of TBL results for combining later.
15808 SmallVector<Value *> Results;
15809 while (ShuffleLanes.back() < NumElements) {
15810 Parts.push_back(Builder.CreateBitCast(
15811 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
15812
15813 if (Parts.size() == 4) {
15814 auto *F = Intrinsic::getDeclaration(TI->getModule(),
15815 Intrinsic::aarch64_neon_tbl4, VecTy);
15816 Parts.push_back(ConstantVector::get(MaskConst));
15817 Results.push_back(Builder.CreateCall(F, Parts));
15818 Parts.clear();
15819 }
15820
15821 for (int i = 0; i < ShuffleCount; ++i)
15822 ShuffleLanes[i] += ShuffleCount;
15823 }
15824
15825 assert((Parts.empty() || Results.empty()) &&
15826 "Lowering trunc for vectors requiring different TBL instructions is "
15827 "not supported!");
15828 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
15829 // registers
15830 if (!Parts.empty()) {
15831 Intrinsic::ID TblID;
15832 switch (Parts.size()) {
15833 case 1:
15834 TblID = Intrinsic::aarch64_neon_tbl1;
15835 break;
15836 case 2:
15837 TblID = Intrinsic::aarch64_neon_tbl2;
15838 break;
15839 case 3:
15840 TblID = Intrinsic::aarch64_neon_tbl3;
15841 break;
15842 }
15843
15844 auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy);
15845 Parts.push_back(ConstantVector::get(MaskConst));
15846 Results.push_back(Builder.CreateCall(F, Parts));
15847 }
15848
15849 // Extract the destination vector from TBL result(s) after combining them
15850 // where applicable. Currently, at most two TBLs are supported.
15851 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
15852 "more than 2 tbl instructions!");
15853 Value *FinalResult = Results[0];
15854 if (Results.size() == 1) {
15855 if (ElemsPerTbl < 16) {
15856 SmallVector<int> FinalMask(ElemsPerTbl);
15857 std::iota(FinalMask.begin(), FinalMask.end(), 0);
15858 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
15859 }
15860 } else {
15861 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
15862 if (ElemsPerTbl < 16) {
15863 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
15864 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
15865 } else {
15866 std::iota(FinalMask.begin(), FinalMask.end(), 0);
15867 }
15868 FinalResult =
15869 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
15870 }
15871
15872 TI->replaceAllUsesWith(FinalResult);
15873 TI->eraseFromParent();
15874}
15875
15876 bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
15877 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
15878 // shuffle_vector instructions are serialized when targeting SVE,
15879 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
15880 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
15881 return false;
15882
15883 // Try to optimize conversions using tbl. This requires materializing constant
15884 // index vectors, which can increase code size and add loads. Skip the
15885 // transform unless the conversion is in a loop block guaranteed to execute
15886 // and we are not optimizing for size.
15887 Function *F = I->getParent()->getParent();
15888 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
15889 F->hasOptSize())
15890 return false;
15891
15892 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
15893 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
15894 if (!SrcTy || !DstTy)
15895 return false;
15896
15897 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
15898 // lowered to tbl instructions to insert the original i8 elements
15899 // into i8x lanes. This is enabled for cases where it is beneficial.
15900 auto *ZExt = dyn_cast<ZExtInst>(I);
15901 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
15902 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
15903 if (DstWidth % 8 != 0)
15904 return false;
15905
15906 auto *TruncDstType =
15907 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
15908 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
15909 // the remaining ZExt folded into the user, don't use tbl lowering.
15910 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
15911 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
15912 TargetTransformInfo::CastContextHint::None,
15913 TargetTransformInfo::TCK_SizeAndLatency) == TargetTransformInfo::TCC_Basic) {
15914 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
15915 return false;
15916
15917 DstTy = TruncDstType;
15918 }
15919
15920 return createTblShuffleForZExt(ZExt, DstTy, Subtarget->isLittleEndian());
15921 }
15922
15923 auto *UIToFP = dyn_cast<UIToFPInst>(I);
15924 if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
15925 DstTy->getElementType()->isFloatTy()) {
15926 IRBuilder<> Builder(I);
15927 auto *ZExt = cast<ZExtInst>(
15928 Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));
15929 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
15930 I->replaceAllUsesWith(UI);
15931 I->eraseFromParent();
15932 return createTblShuffleForZExt(ZExt, cast<FixedVectorType>(ZExt->getType()),
15933 Subtarget->isLittleEndian());
15934 }
15935
15936 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
15937 // followed by a truncate lowered to using tbl.4.
15938 auto *FPToUI = dyn_cast<FPToUIInst>(I);
15939 if (FPToUI &&
15940 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
15941 SrcTy->getElementType()->isFloatTy() &&
15942 DstTy->getElementType()->isIntegerTy(8)) {
15943 IRBuilder<> Builder(I);
15944 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
15945 VectorType::getInteger(SrcTy));
15946 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
15947 I->replaceAllUsesWith(TruncI);
15948 I->eraseFromParent();
15949 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
15950 return true;
15951 }
15952
15953 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
15954 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
15955 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
15956 // registers
15957 auto *TI = dyn_cast<TruncInst>(I);
15958 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
15959 ((SrcTy->getElementType()->isIntegerTy(32) ||
15960 SrcTy->getElementType()->isIntegerTy(64)) &&
15961 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
15962 createTblForTrunc(TI, Subtarget->isLittleEndian());
15963 return true;
15964 }
15965
15966 return false;
15967}
15968
15969 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
15970 Align &RequiredAligment) const {
15971 if (!LoadedType.isSimple() ||
15972 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
15973 return false;
15974 // Cyclone supports unaligned accesses.
15975 RequiredAligment = Align(1);
15976 unsigned NumBits = LoadedType.getSizeInBits();
15977 return NumBits == 32 || NumBits == 64;
15978}
15979
15980/// A helper function for determining the number of interleaved accesses we
15981/// will generate when lowering accesses of the given type.
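/// Illustrative example: a <16 x i32> access with 128-bit vectors needs
/// max(1, (16 * 32 + 127) / 128) = 4 interleaved accesses.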
15982 unsigned AArch64TargetLowering::getNumInterleavedAccesses(
15983 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
15984 unsigned VecSize = 128;
15985 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15986 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
15987 if (UseScalable && isa<FixedVectorType>(VecTy))
15988 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
15989 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
15990}
15991
15992 MachineMemOperand::Flags
15993 AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
15994 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
15995 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
15996 return MOStridedAccess;
15997 return MachineMemOperand::MONone;
15998 }
15999
16000 bool AArch64TargetLowering::isLegalInterleavedAccessType(
16001 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
16002 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
16003 auto EC = VecTy->getElementCount();
16004 unsigned MinElts = EC.getKnownMinValue();
16005
16006 UseScalable = false;
16007
16008 if (!VecTy->isScalableTy() && !Subtarget->isNeonAvailable() &&
16009 !Subtarget->useSVEForFixedLengthVectors())
16010 return false;
16011
16012 if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
16013 return false;
16014
16015 // Ensure that the predicate for this number of elements is available.
16016 if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
16017 return false;
16018
16019 // Ensure the number of vector elements is greater than 1.
16020 if (MinElts < 2)
16021 return false;
16022
16023 // Ensure the element type is legal.
16024 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
16025 return false;
16026
16027 if (EC.isScalable()) {
16028 UseScalable = true;
16029 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
16030 }
16031
16032 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
16033 if (Subtarget->useSVEForFixedLengthVectors()) {
16034 unsigned MinSVEVectorSize =
16035 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
16036 if (VecSize % MinSVEVectorSize == 0 ||
16037 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
16038 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
16039 UseScalable = true;
16040 return true;
16041 }
16042 }
16043
16044 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
16045 // 128 will be split into multiple interleaved accesses.
16046 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
16047}
16048
16049 static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
16050 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
16051 return ScalableVectorType::get(VTy->getElementType(), 2);
16052
16053 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
16054 return ScalableVectorType::get(VTy->getElementType(), 4);
16055
16056 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
16057 return ScalableVectorType::get(VTy->getElementType(), 8);
16058
16059 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
16060 return ScalableVectorType::get(VTy->getElementType(), 8);
16061
16062 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
16063 return ScalableVectorType::get(VTy->getElementType(), 2);
16064
16065 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
16066 return ScalableVectorType::get(VTy->getElementType(), 4);
16067
16068 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
16069 return ScalableVectorType::get(VTy->getElementType(), 8);
16070
16071 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
16072 return ScalableVectorType::get(VTy->getElementType(), 16);
16073
16074 llvm_unreachable("Cannot handle input vector type");
16075}
16076
16077static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
16078 bool Scalable, Type *LDVTy,
16079 Type *PtrTy) {
16080 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16081 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
16082 Intrinsic::aarch64_sve_ld3_sret,
16083 Intrinsic::aarch64_sve_ld4_sret};
16084 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
16085 Intrinsic::aarch64_neon_ld3,
16086 Intrinsic::aarch64_neon_ld4};
16087 if (Scalable)
16088 return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy});
16089
16090 return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy});
16091}
16092
16093static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
16094 bool Scalable, Type *STVTy,
16095 Type *PtrTy) {
16096 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16097 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
16098 Intrinsic::aarch64_sve_st3,
16099 Intrinsic::aarch64_sve_st4};
16100 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
16101 Intrinsic::aarch64_neon_st3,
16102 Intrinsic::aarch64_neon_st4};
16103 if (Scalable)
16104 return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy});
16105
16106 return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy});
16107}
16108
16109/// Lower an interleaved load into a ldN intrinsic.
16110///
16111/// E.g. Lower an interleaved load (Factor = 2):
16112/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
16113/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
16114/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
16115///
16116/// Into:
16117/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
16118/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
16119/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
16120 bool AArch64TargetLowering::lowerInterleavedLoad(
16121 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
16122 ArrayRef<unsigned> Indices, unsigned Factor) const {
16123 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16124 "Invalid interleave factor");
16125 assert(!Shuffles.empty() && "Empty shufflevector input");
16126 assert(Shuffles.size() == Indices.size() &&
16127 "Unmatched number of shufflevectors and indices");
16128
16129 const DataLayout &DL = LI->getModule()->getDataLayout();
16130
16131 VectorType *VTy = Shuffles[0]->getType();
16132
16133 // Skip if we do not have NEON and skip illegal vector types. We can
16134 // "legalize" wide vector types into multiple interleaved accesses as long as
16135 // the vector types are divisible by 128.
16136 bool UseScalable;
16137 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16138 return false;
16139
16140 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16141
16142 auto *FVTy = cast<FixedVectorType>(VTy);
16143
16144 // A pointer vector can not be the return type of the ldN intrinsics. Need to
16145 // load integer vectors first and then convert to pointer vectors.
16146 Type *EltTy = FVTy->getElementType();
16147 if (EltTy->isPointerTy())
16148 FVTy =
16149 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
16150
16151 // If we're going to generate more than one load, reset the sub-vector type
16152 // to something legal.
16153 FVTy = FixedVectorType::get(FVTy->getElementType(),
16154 FVTy->getNumElements() / NumLoads);
16155
16156 auto *LDVTy =
16157 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
16158
16159 IRBuilder<> Builder(LI);
16160
16161 // The base address of the load.
16162 Value *BaseAddr = LI->getPointerOperand();
16163
16164 Type *PtrTy = LI->getPointerOperandType();
16165 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
16166 LDVTy->getElementCount());
16167
16168 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
16169 UseScalable, LDVTy, PtrTy);
16170
16171 // Holds sub-vectors extracted from the load intrinsic return values. The
16172 // sub-vectors are associated with the shufflevector instructions they will
16173 // replace.
16174 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
16175 
16176 Value *PTrue = nullptr;
16177 if (UseScalable) {
16178 std::optional<unsigned> PgPattern =
16179 getSVEPredPatternFromNumElements(FVTy->getNumElements());
16180 if (Subtarget->getMinSVEVectorSizeInBits() ==
16181 Subtarget->getMaxSVEVectorSizeInBits() &&
16182 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
16183 PgPattern = AArch64SVEPredPattern::all;
16184
16185 auto *PTruePat =
16186 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
16187 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16188 {PTruePat});
16189 }
16190
16191 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
16192
16193 // If we're generating more than one load, compute the base address of
16194 // subsequent loads as an offset from the previous.
16195 if (LoadCount > 0)
16196 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
16197 FVTy->getNumElements() * Factor);
16198
16199 CallInst *LdN;
16200 if (UseScalable)
16201 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
16202 else
16203 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16204
16205 // Extract and store the sub-vectors returned by the load intrinsic.
16206 for (unsigned i = 0; i < Shuffles.size(); i++) {
16207 ShuffleVectorInst *SVI = Shuffles[i];
16208 unsigned Index = Indices[i];
16209
16210 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
16211
16212 if (UseScalable)
16213 SubVec = Builder.CreateExtractVector(
16214 FVTy, SubVec,
16215 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
16216
16217 // Convert the integer vector to pointer vector if the element is pointer.
16218 if (EltTy->isPointerTy())
16219 SubVec = Builder.CreateIntToPtr(
16220 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
16221 FVTy->getNumElements()));
16222
16223 SubVecs[SVI].push_back(SubVec);
16224 }
16225 }
16226
16227 // Replace uses of the shufflevector instructions with the sub-vectors
16228 // returned by the load intrinsic. If a shufflevector instruction is
16229 // associated with more than one sub-vector, those sub-vectors will be
16230 // concatenated into a single wide vector.
16231 for (ShuffleVectorInst *SVI : Shuffles) {
16232 auto &SubVec = SubVecs[SVI];
16233 auto *WideVec =
16234 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
16235 SVI->replaceAllUsesWith(WideVec);
16236 }
16237
16238 return true;
16239}
16240
16241template <typename Iter>
16242bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
16243 int MaxLookupDist = 20;
16244 unsigned IdxWidth = DL.getIndexSizeInBits(0);
16245 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
16246 const Value *PtrA1 =
16247 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
16248
16249 while (++It != End) {
16250 if (It->isDebugOrPseudoInst())
16251 continue;
16252 if (MaxLookupDist-- == 0)
16253 break;
16254 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
16255 const Value *PtrB1 =
16256 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
16257 DL, OffsetB);
16258 if (PtrA1 == PtrB1 &&
16259 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
16260 .abs() == 16)
16261 return true;
16262 }
16263 }
16264
16265 return false;
16266}
16267
16268/// Lower an interleaved store into a stN intrinsic.
16269///
16270/// E.g. Lower an interleaved store (Factor = 3):
16271/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
16272/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
16273/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16274///
16275/// Into:
16276/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
16277/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
16278/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
16279/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16280///
16281/// Note that the new shufflevectors will be removed and we'll only generate one
16282/// st3 instruction in CodeGen.
16283///
16284/// Example for a more general valid mask (Factor 3). Lower:
16285/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
16286/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
16287/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16288///
16289/// Into:
16290/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
16291/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
16292/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
16293/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16294 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
16295 ShuffleVectorInst *SVI,
16296 unsigned Factor) const {
16297
16298 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16299 "Invalid interleave factor");
16300
16301 auto *VecTy = cast<FixedVectorType>(SVI->getType());
16302 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
16303
16304 unsigned LaneLen = VecTy->getNumElements() / Factor;
16305 Type *EltTy = VecTy->getElementType();
16306 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
16307
16308 const DataLayout &DL = SI->getModule()->getDataLayout();
16309 bool UseScalable;
16310
16311 // Skip if we do not have NEON and skip illegal vector types. We can
16312 // "legalize" wide vector types into multiple interleaved accesses as long as
16313 // the vector types are divisible by 128.
16314 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
16315 return false;
16316
16317 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
16318
16319 Value *Op0 = SVI->getOperand(0);
16320 Value *Op1 = SVI->getOperand(1);
16321 IRBuilder<> Builder(SI);
16322
16323 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
16324 // vectors to integer vectors.
16325 if (EltTy->isPointerTy()) {
16326 Type *IntTy = DL.getIntPtrType(EltTy);
16327 unsigned NumOpElts =
16328 cast<FixedVectorType>(Op0->getType())->getNumElements();
16329
16330 // Convert to the corresponding integer vector.
16331 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
16332 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
16333 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
16334
16335 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
16336 }
16337
16338 // If we're going to generate more than one store, reset the lane length
16339 // and sub-vector type to something legal.
16340 LaneLen /= NumStores;
16341 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
16342
16343 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
16344 : SubVecTy;
16345
16346 // The base address of the store.
16347 Value *BaseAddr = SI->getPointerOperand();
16348
16349 auto Mask = SVI->getShuffleMask();
16350
16351 // Sanity check that not all indices are out of range. If the mask is
16352 // poison, `Mask` may be a vector of -1s; if every element is poison, bail
16353 // out, since an out-of-bounds read would happen later.
16354 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16355 return false;
16356 }
16357 // A 64-bit st2 which does not start at element 0 will involve adding extra
16358 // ext elements making the st2 unprofitable, and if there is a nearby store
16359 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
16360 // zip;ldp pair which has higher throughput.
16361 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
16362 (Mask[0] != 0 ||
16363 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
16364 DL) ||
16365 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
16366 BaseAddr, DL)))
16367 return false;
16368
16369 Type *PtrTy = SI->getPointerOperandType();
16370 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
16371 STVTy->getElementCount());
16372
16373 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16374 UseScalable, STVTy, PtrTy);
16375
16376 Value *PTrue = nullptr;
16377 if (UseScalable) {
16378 std::optional<unsigned> PgPattern =
16379 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
16380 if (Subtarget->getMinSVEVectorSizeInBits() ==
16381 Subtarget->getMaxSVEVectorSizeInBits() &&
16382 Subtarget->getMinSVEVectorSizeInBits() ==
16383 DL.getTypeSizeInBits(SubVecTy))
16384 PgPattern = AArch64SVEPredPattern::all;
16385
16386 auto *PTruePat =
16387 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
16388 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16389 {PTruePat});
16390 }
16391
16392 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
16393
16394 SmallVector<Value *, 5> Ops;
16395 
16396 // Split the shufflevector operands into sub vectors for the new stN call.
16397 for (unsigned i = 0; i < Factor; i++) {
16398 Value *Shuffle;
16399 unsigned IdxI = StoreCount * LaneLen * Factor + i;
16400 if (Mask[IdxI] >= 0) {
16401 Shuffle = Builder.CreateShuffleVector(
16402 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
16403 } else {
16404 unsigned StartMask = 0;
16405 for (unsigned j = 1; j < LaneLen; j++) {
16406 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
16407 if (Mask[IdxJ] >= 0) {
16408 StartMask = Mask[IdxJ] - j;
16409 break;
16410 }
16411 }
16412 // Note: Filling undef gaps with random elements is ok, since
16413 // those elements were being written anyway (with undefs).
16414 // In the case of all undefs we default to using elements from 0.
16415 // Note: StartMask cannot be negative; it is checked in
16416 // isReInterleaveMask.
16417 Shuffle = Builder.CreateShuffleVector(
16418 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
16419 }
16420
16421 if (UseScalable)
16422 Shuffle = Builder.CreateInsertVector(
16423 STVTy, UndefValue::get(STVTy), Shuffle,
16424 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
16425
16426 Ops.push_back(Shuffle);
16427 }
16428
16429 if (UseScalable)
16430 Ops.push_back(PTrue);
16431
16432 // If we're generating more than one store, compute the base address of
16433 // subsequent stores as an offset from the previous.
16434 if (StoreCount > 0)
16435 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
16436 BaseAddr, LaneLen * Factor);
16437
16438 Ops.push_back(BaseAddr);
16439 Builder.CreateCall(StNFunc, Ops);
16440 }
16441 return true;
16442}
16443
16444 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
16445 IntrinsicInst *DI, LoadInst *LI) const {
16446 // Only deinterleave2 supported at present.
16447 if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
16448 return false;
16449
16450 // Only a factor of 2 supported at present.
16451 const unsigned Factor = 2;
16452
16453 VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
16454 const DataLayout &DL = DI->getModule()->getDataLayout();
16455 bool UseScalable;
16456 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16457 return false;
16458
16459 // TODO: Add support for using SVE instructions with fixed types later, using
16460 // the code from lowerInterleavedLoad to obtain the correct container type.
16461 if (UseScalable && !VTy->isScalableTy())
16462 return false;
16463
16464 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16465
16466 VectorType *LdTy =
16467 VectorType::get(VTy->getElementType(),
16468 VTy->getElementCount().divideCoefficientBy(NumLoads));
16469
16470 Type *PtrTy = LI->getPointerOperandType();
16471 Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor,
16472 UseScalable, LdTy, PtrTy);
16473
16474 IRBuilder<> Builder(LI);
16475
16476 Value *Pred = nullptr;
16477 if (UseScalable)
16478 Pred =
16479 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
16480
16481 Value *BaseAddr = LI->getPointerOperand();
16482 Value *Result;
16483 if (NumLoads > 1) {
16484 Value *Left = PoisonValue::get(VTy);
16485 Value *Right = PoisonValue::get(VTy);
16486 
16487 for (unsigned I = 0; I < NumLoads; ++I) {
16488 Value *Offset = Builder.getInt64(I * Factor);
16489
16490 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
16491 Value *LdN = nullptr;
16492 if (UseScalable)
16493 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
16494 else
16495 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
16496
16497 Value *Idx =
16498 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
16499 Left = Builder.CreateInsertVector(
16500 VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
16501 Right = Builder.CreateInsertVector(
16502 VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
16503 }
16504
16505 Result = PoisonValue::get(DI->getType());
16506 Result = Builder.CreateInsertValue(Result, Left, 0);
16507 Result = Builder.CreateInsertValue(Result, Right, 1);
16508 } else {
16509 if (UseScalable)
16510 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
16511 else
16512 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16513 }
16514
16515 DI->replaceAllUsesWith(Result);
16516 return true;
16517}
16518
16519 bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
16520 IntrinsicInst *II, StoreInst *SI) const {
16521 // Only interleave2 supported at present.
16522 if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
16523 return false;
16524
16525 // Only a factor of 2 supported at present.
16526 const unsigned Factor = 2;
16527
16528 VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
16529 const DataLayout &DL = II->getModule()->getDataLayout();
16530 bool UseScalable;
16531 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16532 return false;
16533
16534 // TODO: Add support for using SVE instructions with fixed types later, using
16535 // the code from lowerInterleavedStore to obtain the correct container type.
16536 if (UseScalable && !VTy->isScalableTy())
16537 return false;
16538
16539 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
16540
16541 VectorType *StTy =
16542 VectorType::get(VTy->getElementType(),
16543 VTy->getElementCount().divideCoefficientBy(NumStores));
16544
16545 Type *PtrTy = SI->getPointerOperandType();
16546 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16547 UseScalable, StTy, PtrTy);
16548
16549 IRBuilder<> Builder(SI);
16550
16551 Value *BaseAddr = SI->getPointerOperand();
16552 Value *Pred = nullptr;
16553
16554 if (UseScalable)
16555 Pred =
16556 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
16557
16558 Value *L = II->getOperand(0);
16559 Value *R = II->getOperand(1);
16560
16561 for (unsigned I = 0; I < NumStores; ++I) {
16562 Value *Address = BaseAddr;
16563 if (NumStores > 1) {
16564 Value *Offset = Builder.getInt64(I * Factor);
16565 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
16566
16567 Value *Idx =
16568 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
16569 L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
16570 R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
16571 }
16572
16573 if (UseScalable)
16574 Builder.CreateCall(StNFunc, {L, R, Pred, Address});
16575 else
16576 Builder.CreateCall(StNFunc, {L, R, Address});
16577 }
16578
16579 return true;
16580}
16581
16582 EVT AArch64TargetLowering::getOptimalMemOpType(
16583 const MemOp &Op, const AttributeList &FuncAttributes) const {
16584 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16585 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16586 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16587 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
16588 // taken one instruction to materialize the v2i64 zero and one store (with
16589 // restrictive addressing mode). Just do i64 stores.
16590 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16591 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16592 if (Op.isAligned(AlignCheck))
16593 return true;
16594 unsigned Fast;
16595 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16596 MachineMemOperand::MONone, &Fast) &&
16597 Fast;
16598 };
16599
16600 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16601 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
16602 return MVT::v16i8;
16603 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16604 return MVT::f128;
16605 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16606 return MVT::i64;
16607 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16608 return MVT::i32;
16609 return MVT::Other;
16610}
16611
16612 LLT AArch64TargetLowering::getOptimalMemOpLLT(
16613 const MemOp &Op, const AttributeList &FuncAttributes) const {
16614 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16615 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16616 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16617 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
16618 // taken one instruction to materialize the v2i64 zero and one store (with
16619 // restrictive addressing mode). Just do i64 stores.
16620 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16621 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16622 if (Op.isAligned(AlignCheck))
16623 return true;
16624 unsigned Fast;
16625 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16626 MachineMemOperand::MONone, &Fast) &&
16627 Fast;
16628 };
16629
16630 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16631 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
16632 return LLT::fixed_vector(2, 64);
16633 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16634 return LLT::scalar(128);
16635 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16636 return LLT::scalar(64);
16637 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16638 return LLT::scalar(32);
16639 return LLT();
16640}
16641
16642// 12-bit optionally shifted immediates are legal for adds.
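// Illustrative examples: 0xabc and 0xabc000 are legal add immediates, while
// 0xabcd is not, since it has set bits both in and above the low 12 bits.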
16643 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
16644 if (Immed == std::numeric_limits<int64_t>::min()) {
16645 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
16646 << ": avoid UB for INT64_MIN\n");
16647 return false;
16648 }
16649 // Same encoding for add/sub, just flip the sign.
16650 Immed = std::abs(Immed);
16651 bool IsLegal = ((Immed >> 12) == 0 ||
16652 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
16653 LLVM_DEBUG(dbgs() << "Is " << Immed
16654 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
16655 return IsLegal;
16656}
16657
16658 bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
16659 // We will only emit addvl/inc* instructions for SVE2
16660 if (!Subtarget->hasSVE2())
16661 return false;
16662
16663 // addvl's immediates are in terms of the number of bytes in a register.
16664 // Since there are 16 bytes in the base supported size (128 bits), we need to
16665 // divide the immediate by that much to give us a useful immediate to
16666 // multiply by vscale. We can't have a remainder as a result of this.
16667 if (Imm % 16 == 0)
16668 return isInt<6>(Imm / 16);
16669
16670 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
16671 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
16672 // of addvl as a result, so only take h|w|d into account.
16673 // Dec[h|w|d] will cover subtractions.
16674 // Immediates are in the range [1,16], so we can't do a 2's complement check.
16675 // FIXME: Can we make use of other patterns to cover other immediates?
16676
16677 // inch|dech
16678 if (Imm % 8 == 0)
16679 return std::abs(Imm / 8) <= 16;
16680 // incw|decw
16681 if (Imm % 4 == 0)
16682 return std::abs(Imm / 4) <= 16;
16683 // incd|decd
16684 if (Imm % 2 == 0)
16685 return std::abs(Imm / 2) <= 16;
16686
16687 return false;
16688}
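// Illustration (informal, not part of the original implementation): the
// immediate here is the multiple of vscale being added, so with a 128-bit
// SVE granule:
//   Imm = 32  -> 32 % 16 == 0, 32 / 16 = 2        -> addvl #2
//   Imm = 40  -> 40 % 8 == 0,  |40 / 8| = 5 <= 16 -> inch, mul #5
//   Imm = -12 -> 12 % 4 == 0,  |-3| <= 16         -> decw, mul #3
//   Imm = 7   -> no divisor matches               -> not legal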
16689
16690// Return false to prevent folding
16691// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
16692// if the folding leads to worse code.
16693bool AArch64TargetLowering::isMulAddWithConstProfitable(
16694 SDValue AddNode, SDValue ConstNode) const {
16695 // Let the DAGCombiner decide for vector types and large types.
16696 const EVT VT = AddNode.getValueType();
16697 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
16698 return true;
16699
16700 // It is worse if c1 is a legal add immediate while c1*c2 is not and
16701 // has to be composed with at least two instructions.
16702 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
16703 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
16704 const int64_t C1 = C1Node->getSExtValue();
16705 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
16706 if (isLegalAddImmediate(C1) && !isLegalAddImmediate(C1C2.getSExtValue()))
16707 return true;
16708 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
16709 // Adapt to the width of a register.
16710 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
16711 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
16712 if (Insn.size() > 1)
16713 return false;
16714
16715 // Default to true and let the DAGCombiner decide.
16716 return true;
16717}
16718
16719// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
16720// immediates is the same as for an add or a sub.
16721bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
16722 return isLegalAddImmediate(Immed);
16723}
16724
16725/// isLegalAddressingMode - Return true if the addressing mode represented
16726/// by AM is legal for this target, for a load/store of the specified type.
16727bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
16728 const AddrMode &AMode, Type *Ty,
16729 unsigned AS, Instruction *I) const {
16730 // AArch64 has five basic addressing modes:
16731 // reg
16732 // reg + 9-bit signed offset
16733 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
16734 // reg1 + reg2
16735 // reg + SIZE_IN_BYTES * reg
16736
16737 // No global is ever allowed as a base.
16738 if (AMode.BaseGV)
16739 return false;
16740
16741 // No reg+reg+imm addressing.
16742 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
16743 return false;
16744
16745 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
16746 // `2*ScaledReg` into `BaseReg + ScaledReg`
16747 AddrMode AM = AMode;
16748 if (AM.Scale && !AM.HasBaseReg) {
16749 if (AM.Scale == 1) {
16750 AM.HasBaseReg = true;
16751 AM.Scale = 0;
16752 } else if (AM.Scale == 2) {
16753 AM.HasBaseReg = true;
16754 AM.Scale = 1;
16755 } else {
16756 return false;
16757 }
16758 }
16759
16760 // A base register is required in all addressing modes.
16761 if (!AM.HasBaseReg)
16762 return false;
16763
16764 if (Ty->isScalableTy()) {
16765 if (isa<ScalableVectorType>(Ty)) {
16766 // See if we have a foldable vscale-based offset, for vector types which
16767 // are either legal or smaller than the minimum; more work will be
16768 // required if we need to consider addressing for types which need
16769 // legalization by splitting.
16770 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
16771 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
16772 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
16773 isPowerOf2_64(VecNumBytes))
16774 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
16775
16776 uint64_t VecElemNumBytes =
16777 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
16778 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
16779 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
16780 }
16781
16782 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
16783 }
16784
16785 // No scalable offsets allowed for non-scalable types.
16786 if (AM.ScalableOffset)
16787 return false;
16788
16789 // check reg + imm case:
16790 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
16791 uint64_t NumBytes = 0;
16792 if (Ty->isSized()) {
16793 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
16794 NumBytes = NumBits / 8;
16795 if (!isPowerOf2_64(NumBits))
16796 NumBytes = 0;
16797 }
16798
16799 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
16800 AM.Scale);
16801}
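// Illustration (informal, not part of the original implementation), for an
// i64 access:
//   [x0]              reg                        -> legal
//   [x0, #-256]       reg + 9-bit signed offset  -> legal
//   [x0, #32760]      reg + 8 * uimm12           -> legal
//   [x0, x1]          reg1 + reg2                -> legal
//   [x0, x1, lsl #3]  reg + 8 * reg              -> legal
// whereas a base register plus a scaled register plus an immediate is
// rejected by the reg+reg+imm check above.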
16802
16803// Check whether the two offsets belong to the same imm24 range and share the
16804// same high 12 bits; if so, the high part can be covered by the immediate of an add.
16805int64_t
16806AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
16807 int64_t MaxOffset) const {
16808 int64_t HighPart = MinOffset & ~0xfffULL;
16809 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
16810 // Rebase the value to an integer multiple of imm12.
16811 return HighPart;
16812 }
16813
16814 return 0;
16815}
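// Illustration (informal, not part of the original implementation): with
// MinOffset = 4100 and MaxOffset = 4180, both offsets have the same high
// 12 bits (4100 >> 12 == 4180 >> 12 == 1), so HighPart = 4096 is returned
// and the accesses can be rebased to (base + 4096) + 4 and (base + 4096) + 84,
// where the shared +4096 is a legal add immediate and the low parts fit the
// load/store offset forms.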
16816
16817bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
16818 // Consider splitting large offset of struct or array.
16819 return true;
16820}
16821
16822bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
16823 const MachineFunction &MF, EVT VT) const {
16824 VT = VT.getScalarType();
16825
16826 if (!VT.isSimple())
16827 return false;
16828
16829 switch (VT.getSimpleVT().SimpleTy) {
16830 case MVT::f16:
16831 return Subtarget->hasFullFP16();
16832 case MVT::f32:
16833 case MVT::f64:
16834 return true;
16835 default:
16836 break;
16837 }
16838
16839 return false;
16840}
16841
16842bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
16843 Type *Ty) const {
16844 switch (Ty->getScalarType()->getTypeID()) {
16845 case Type::FloatTyID:
16846 case Type::DoubleTyID:
16847 return true;
16848 default:
16849 return false;
16850 }
16851}
16852
16853bool AArch64TargetLowering::generateFMAsInMachineCombiner(
16854 EVT VT, CodeGenOptLevel OptLevel) const {
16855 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
16856 !useSVEForFixedLengthVectorVT(VT);
16857}
16858
16859const MCPhysReg *
16860AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
16861 // LR is a callee-save register, but we must treat it as clobbered by any call
16862 // site. Hence we include LR in the scratch registers, which are in turn added
16863 // as implicit-defs for stackmaps and patchpoints.
16864 static const MCPhysReg ScratchRegs[] = {
16865 AArch64::X16, AArch64::X17, AArch64::LR, 0
16866 };
16867 return ScratchRegs;
16868}
16869
16870ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
16871 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
16872 return RCRegs;
16873}
16874
16875bool
16876AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
16877 CombineLevel Level) const {
16878 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
16879 N->getOpcode() == ISD::SRL) &&
16880 "Expected shift op");
16881
16882 SDValue ShiftLHS = N->getOperand(0);
16883 EVT VT = N->getValueType(0);
16884
16885 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
16886 // combine it with shift 'N' to let it be lowered to UBFX except:
16887 // ((x >> C) & mask) << C.
16888 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
16889 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
16890 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
16891 if (isMask_64(TruncMask)) {
16892 SDValue AndLHS = ShiftLHS.getOperand(0);
16893 if (AndLHS.getOpcode() == ISD::SRL) {
16894 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
16895 if (N->getOpcode() == ISD::SHL)
16896 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
16897 return SRLC->getZExtValue() == SHLC->getZExtValue();
16898 return false;
16899 }
16900 }
16901 }
16902 }
16903 return true;
16904}
16905
16906bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
16907 const SDNode *N) const {
16908 assert(N->getOpcode() == ISD::XOR &&
16909 (N->getOperand(0).getOpcode() == ISD::SHL ||
16910 N->getOperand(0).getOpcode() == ISD::SRL) &&
16911 "Expected XOR(SHIFT) pattern");
16912
16913 // Only commute if the entire NOT mask is a hidden shifted mask.
16914 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
16915 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16916 if (XorC && ShiftC) {
16917 unsigned MaskIdx, MaskLen;
16918 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
16919 unsigned ShiftAmt = ShiftC->getZExtValue();
16920 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
16921 if (N->getOperand(0).getOpcode() == ISD::SHL)
16922 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
16923 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
16924 }
16925 }
16926
16927 return false;
16928}
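// Illustration (informal, not part of the original implementation), for i32
// values:
//   xor (shl x, 8), 0xffffff00 -> mask index 8, length 24 == 32-8 -> commute
//   xor (srl x, 8), 0x00ffffff -> mask index 0, length 24 == 32-8 -> commute
//   xor (shl x, 8), 0x0000ff00 -> mask does not cover the whole shifted range,
//                                 so the xor is left where it is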
16929
16930bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
16931 const SDNode *N, CombineLevel Level) const {
16932 assert(((N->getOpcode() == ISD::SHL &&
16933 N->getOperand(0).getOpcode() == ISD::SRL) ||
16934 (N->getOpcode() == ISD::SRL &&
16935 N->getOperand(0).getOpcode() == ISD::SHL)) &&
16936 "Expected shift-shift mask");
16937 // Don't allow multiuse shift folding with the same shift amount.
16938 if (!N->getOperand(0)->hasOneUse())
16939 return false;
16940
16941 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
16942 EVT VT = N->getValueType(0);
16943 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
16944 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16945 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
16946 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
16947 }
16948
16949 return true;
16950}
16951
16952bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
16953 unsigned BinOpcode, EVT VT) const {
16954 return VT.isScalableVector() && isTypeLegal(VT);
16955}
16956
16957bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
16958 Type *Ty) const {
16959 assert(Ty->isIntegerTy());
16960
16961 unsigned BitSize = Ty->getPrimitiveSizeInBits();
16962 if (BitSize == 0)
16963 return false;
16964
16965 int64_t Val = Imm.getSExtValue();
16966 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
16967 return true;
16968
16969 if ((int64_t)Val < 0)
16970 Val = ~Val;
16971 if (BitSize == 32)
16972 Val &= (1LL << 32) - 1;
16973
16974 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
16975 // MOVZ is free so return true for one or fewer MOVK.
16976 return Shift < 3;
16977}
16978
16979bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
16980 unsigned Index) const {
16981 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
16982 return false;
16983
16984 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
16985}
16986
16987/// Turn vector tests of the signbit in the form of:
16988/// xor (sra X, elt_size(X)-1), -1
16989/// into:
16990/// cmge X, X, #0
16991static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
16992 const AArch64Subtarget *Subtarget) {
16993 EVT VT = N->getValueType(0);
16994 if (!Subtarget->hasNEON() || !VT.isVector())
16995 return SDValue();
16996
16997 // There must be a shift right algebraic before the xor, and the xor must be a
16998 // 'not' operation.
16999 SDValue Shift = N->getOperand(0);
17000 SDValue Ones = N->getOperand(1);
17001 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
17002 !ISD::isBuildVectorAllOnes(Ones.getNode()))
17003 return SDValue();
17004
17005 // The shift should be smearing the sign bit across each vector element.
17006 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
17007 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
17008 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
17009 return SDValue();
17010
17011 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
17012}
17013
17014// Given a vecreduce_add node, detect the below pattern and convert it to the
17015// node sequence with UABDL, [S|U]ABD and UADDLP.
17016//
17017// i32 vecreduce_add(
17018// v16i32 abs(
17019// v16i32 sub(
17020// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
17021// =================>
17022// i32 vecreduce_add(
17023// v4i32 UADDLP(
17024// v8i16 add(
17025// v8i16 zext(
17026// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
17027// v8i16 zext(
17028// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
17029static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
17030 SelectionDAG &DAG) {
17031 // Assumed i32 vecreduce_add
17032 if (N->getValueType(0) != MVT::i32)
17033 return SDValue();
17034
17035 SDValue VecReduceOp0 = N->getOperand(0);
17036 unsigned Opcode = VecReduceOp0.getOpcode();
17037 // Assumed v16i32 abs
17038 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
17039 return SDValue();
17040
17041 SDValue ABS = VecReduceOp0;
17042 // Assumed v16i32 sub
17043 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
17044 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
17045 return SDValue();
17046
17047 SDValue SUB = ABS->getOperand(0);
17048 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
17049 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
17050 // Assumed v16i32 type
17051 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
17052 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
17053 return SDValue();
17054
17055 // Assumed zext or sext
17056 bool IsZExt = false;
17057 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
17058 IsZExt = true;
17059 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
17060 IsZExt = false;
17061 } else
17062 return SDValue();
17063
17064 SDValue EXT0 = SUB->getOperand(0);
17065 SDValue EXT1 = SUB->getOperand(1);
17066 // Assumed zext's operand has v16i8 type
17067 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
17068 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
17069 return SDValue();
17070
17071 // Pattern is detected. Let's convert it to a sequence of nodes.
17072 SDLoc DL(N);
17073
17074 // First, create the node pattern of UABD/SABD.
17075 SDValue UABDHigh8Op0 =
17076 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
17077 DAG.getConstant(8, DL, MVT::i64));
17078 SDValue UABDHigh8Op1 =
17079 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
17080 DAG.getConstant(8, DL, MVT::i64));
17081 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
17082 UABDHigh8Op0, UABDHigh8Op1);
17083 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
17084
17085 // Second, create the node pattern of UABAL.
17086 SDValue UABDLo8Op0 =
17087 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
17088 DAG.getConstant(0, DL, MVT::i64));
17089 SDValue UABDLo8Op1 =
17090 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
17091 DAG.getConstant(0, DL, MVT::i64));
17092 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
17093 UABDLo8Op0, UABDLo8Op1);
17094 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
17095 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
17096
17097 // Third, create the node of UADDLP.
17098 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
17099
17100 // Fourth, create the node of VECREDUCE_ADD.
17101 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
17102}
17103
17104// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
17105// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
17106// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
17107// If we have vectors larger than v16i8, we extract v16i8 subvectors,
17108// follow the same steps as above to get DOT instructions, concatenate them,
17109// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
17110static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
17111 const AArch64Subtarget *ST) {
17112 if (!ST->hasDotProd())
17113 return performVecReduceAddCombineWithUADDLP(N, DAG);
17114
17115 SDValue Op0 = N->getOperand(0);
17116 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
17117 Op0.getValueType().getVectorElementType() != MVT::i32)
17118 return SDValue();
17119
17120 unsigned ExtOpcode = Op0.getOpcode();
17121 SDValue A = Op0;
17122 SDValue B;
17123 if (ExtOpcode == ISD::MUL) {
17124 A = Op0.getOperand(0);
17125 B = Op0.getOperand(1);
17126 if (A.getOpcode() != B.getOpcode() ||
17127 A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
17128 return SDValue();
17129 ExtOpcode = A.getOpcode();
17130 }
17131 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
17132 return SDValue();
17133
17134 EVT Op0VT = A.getOperand(0).getValueType();
17135 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
17136 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
17137 if (!IsValidElementCount || !IsValidSize)
17138 return SDValue();
17139
17140 SDLoc DL(Op0);
17141 // For non-mla reductions B can be set to 1. For MLA we take the operand of
17142 // the extend B.
17143 if (!B)
17144 B = DAG.getConstant(1, DL, Op0VT);
17145 else
17146 B = B.getOperand(0);
17147
17148 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
17149 unsigned NumOfVecReduce;
17150 EVT TargetType;
17151 if (IsMultipleOf16) {
17152 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
17153 TargetType = MVT::v4i32;
17154 } else {
17155 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
17156 TargetType = MVT::v2i32;
17157 }
17158 auto DotOpcode =
17159 (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
17160 // Handle the case where we need to generate only one Dot operation.
17161 if (NumOfVecReduce == 1) {
17162 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
17163 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
17164 A.getOperand(0), B);
17165 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17166 }
17167 // Generate Dot instructions that are multiple of 16.
17168 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
17169 SmallVector<SDValue, 4> SDotVec16;
17170 unsigned I = 0;
17171 for (; I < VecReduce16Num; I += 1) {
17172 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
17173 SDValue Op0 =
17174 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
17175 DAG.getConstant(I * 16, DL, MVT::i64));
17176 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
17177 DAG.getConstant(I * 16, DL, MVT::i64));
17178 SDValue Dot =
17179 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
17180 SDotVec16.push_back(Dot);
17181 }
17182 // Concatenate dot operations.
17183 EVT SDot16EVT =
17184 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
17185 SDValue ConcatSDot16 =
17186 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
17187 SDValue VecReduceAdd16 =
17188 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
17189 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
17190 if (VecReduce8Num == 0)
17191 return VecReduceAdd16;
17192
17193 // Generate the remainder Dot operation that is multiple of 8.
17194 SmallVector<SDValue, 4> SDotVec8;
17195 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
17196 SDValue Vec8Op0 =
17197 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
17198 DAG.getConstant(I * 16, DL, MVT::i64));
17199 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
17200 DAG.getConstant(I * 16, DL, MVT::i64));
17201 SDValue Dot =
17202 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
17203 SDValue VecReduceAdd8 =
17204 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17205 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
17206 VecReduceAdd8);
17207}
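// Illustration (informal, not part of the original implementation):
//   i32 vecreduce_add(zext(v16i8 a to v16i32))
// becomes
//   i32 vecreduce_add(v4i32 udot(zeroinitializer, a, splat(i8 1)))
// and a v32i8 input is split into two v16i8 udots whose v4i32 results are
// concatenated before the final vecreduce_add, as described above.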
17208
17209// Given an (integer) vecreduce, we know the order of the inputs does not
17210// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
17211// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
17212// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
17213static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
17214 auto DetectAddExtract = [&](SDValue A) {
17215 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
17216 // UADDLP(x) if found.
17217 assert(A.getOpcode() == ISD::ADD);
17218 EVT VT = A.getValueType();
17219 SDValue Op0 = A.getOperand(0);
17220 SDValue Op1 = A.getOperand(1);
17221 if (Op0.getOpcode() != Op1.getOpcode() ||
17222 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
17223 Op0.getOpcode() != ISD::SIGN_EXTEND))
17224 return SDValue();
17225 SDValue Ext0 = Op0.getOperand(0);
17226 SDValue Ext1 = Op1.getOperand(0);
17227 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17228 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17229 Ext0.getOperand(0) != Ext1.getOperand(0))
17230 return SDValue();
17231 // Check that the source type is twice the width of the add type, and that
17232 // the extracts come from the upper/lower halves of the same source.
17233 if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
17234 VT.getVectorNumElements() * 2)
17235 return SDValue();
17236 if ((Ext0.getConstantOperandVal(1) != 0 ||
17237 Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
17238 (Ext1.getConstantOperandVal(1) != 0 ||
17239 Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
17240 return SDValue();
17241 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
17242 : AArch64ISD::SADDLP;
17243 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
17244 };
17245
17246 if (SDValue R = DetectAddExtract(A))
17247 return R;
17248
17249 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
17250 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
17251 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17252 A.getOperand(1));
17253 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
17254 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
17255 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17256 A.getOperand(0));
17257 return SDValue();
17258}
17259
17260// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
17261// UADDLV(concat), where the concat represents the 64-bit zext sources.
17262static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
17263 // Look for add(zext(64-bit source), zext(64-bit source)), returning
17264 // UADDLV(concat(zext, zext)) if found.
17265 assert(A.getOpcode() == ISD::ADD);
17266 EVT VT = A.getValueType();
17267 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17268 return SDValue();
17269 SDValue Op0 = A.getOperand(0);
17270 SDValue Op1 = A.getOperand(1);
17271 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
17272 return SDValue();
17273 SDValue Ext0 = Op0.getOperand(0);
17274 SDValue Ext1 = Op1.getOperand(0);
17275 EVT ExtVT0 = Ext0.getValueType();
17276 EVT ExtVT1 = Ext1.getValueType();
17277 // Check zext VTs are the same and 64-bit length.
17278 if (ExtVT0 != ExtVT1 ||
17279 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
17280 return SDValue();
17281 // Get VT for concat of zext sources.
17282 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
17283 SDValue Concat =
17284 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
17285
17286 switch (VT.getSimpleVT().SimpleTy) {
17287 case MVT::v2i64:
17288 case MVT::v4i32:
17289 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
17290 case MVT::v8i16: {
17291 SDValue Uaddlv =
17292 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
17293 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
17294 }
17295 default:
17296 llvm_unreachable("Unhandled vector type");
17297 }
17298}
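// Illustration (informal, not part of the original implementation): for
//   UADDV(add(zext v2i32 a to v2i64, zext v2i32 b to v2i64))
// the add operand is rewritten here as UADDLV(concat_vectors(a, b) : v4i32),
// which accumulates all four 32-bit lanes into one 64-bit result, so the
// outer UADDV sees the already-reduced value.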
17299
17300static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
17301 SDValue A = N->getOperand(0);
17302 if (A.getOpcode() == ISD::ADD) {
17303 if (SDValue R = performUADDVAddCombine(A, DAG))
17304 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
17305 else if (SDValue R = performUADDVZextCombine(A, DAG))
17306 return R;
17307 }
17308 return SDValue();
17309}
17310
17311static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
17312 TargetLowering::DAGCombinerInfo &DCI,
17313 const AArch64Subtarget *Subtarget) {
17314 if (DCI.isBeforeLegalizeOps())
17315 return SDValue();
17316
17317 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
17318}
17319
17320SDValue
17321AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
17322 SelectionDAG &DAG,
17323 SmallVectorImpl<SDNode *> &Created) const {
17324 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17325 if (isIntDivCheap(N->getValueType(0), Attr))
17326 return SDValue(N,0); // Lower SDIV as SDIV
17327
17328 EVT VT = N->getValueType(0);
17329
17330 // For scalable and fixed types, mark them as cheap so we can handle it much
17331 // later. This allows us to handle larger than legal types.
17332 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17333 return SDValue(N, 0);
17334
17335 // fold (sdiv X, pow2)
17336 if ((VT != MVT::i32 && VT != MVT::i64) ||
17337 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17338 return SDValue();
17339
17340 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
17341}
17342
17343SDValue
17344AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
17345 SelectionDAG &DAG,
17346 SmallVectorImpl<SDNode *> &Created) const {
17347 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17348 if (isIntDivCheap(N->getValueType(0), Attr))
17349 return SDValue(N, 0); // Lower SREM as SREM
17350
17351 EVT VT = N->getValueType(0);
17352
17353 // For scalable and fixed types, mark them as cheap so we can handle it much
17354 // later. This allows us to handle larger than legal types.
17355 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17356 return SDValue(N, 0);
17357
17358 // fold (srem X, pow2)
17359 if ((VT != MVT::i32 && VT != MVT::i64) ||
17360 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17361 return SDValue();
17362
17363 unsigned Lg2 = Divisor.countr_zero();
17364 if (Lg2 == 0)
17365 return SDValue();
17366
17367 SDLoc DL(N);
17368 SDValue N0 = N->getOperand(0);
17369 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
17370 SDValue Zero = DAG.getConstant(0, DL, VT);
17371 SDValue CCVal, CSNeg;
17372 if (Lg2 == 1) {
17373 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
17374 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17375 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
17376
17377 Created.push_back(Cmp.getNode());
17378 Created.push_back(And.getNode());
17379 } else {
17380 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
17381 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17382
17383 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
17384 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17385 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
17386 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
17387 Negs.getValue(1));
17388
17389 Created.push_back(Negs.getNode());
17390 Created.push_back(AndPos.getNode());
17391 Created.push_back(AndNeg.getNode());
17392 }
17393
17394 return CSNeg;
17395}
17396
17397static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
17398 switch(getIntrinsicID(S.getNode())) {
17399 default:
17400 break;
17401 case Intrinsic::aarch64_sve_cntb:
17402 return 8;
17403 case Intrinsic::aarch64_sve_cnth:
17404 return 16;
17405 case Intrinsic::aarch64_sve_cntw:
17406 return 32;
17407 case Intrinsic::aarch64_sve_cntd:
17408 return 64;
17409 }
17410 return {};
17411}
17412
17413/// Calculates what the pre-extend type is, based on the extension
17414/// operation node provided by \p Extend.
17415///
17416/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
17417/// pre-extend type is pulled directly from the operand, while other extend
17418/// operations need a bit more inspection to get this information.
17419///
17420/// \param Extend The SDNode from the DAG that represents the extend operation
17421///
17422/// \returns The type representing the \p Extend source type, or \p MVT::Other
17423/// if no valid type can be determined
17424static EVT calculatePreExtendType(SDValue Extend) {
17425 switch (Extend.getOpcode()) {
17426 case ISD::SIGN_EXTEND:
17427 case ISD::ZERO_EXTEND:
17428 return Extend.getOperand(0).getValueType();
17429 case ISD::AssertSext:
17430 case ISD::AssertZext:
17431 case ISD::SIGN_EXTEND_INREG: {
17432 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
17433 if (!TypeNode)
17434 return MVT::Other;
17435 return TypeNode->getVT();
17436 }
17437 case ISD::AND: {
17438 ConstantSDNode *Constant =
17439 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
17440 if (!Constant)
17441 return MVT::Other;
17442
17443 uint32_t Mask = Constant->getZExtValue();
17444
17445 if (Mask == UCHAR_MAX)
17446 return MVT::i8;
17447 else if (Mask == USHRT_MAX)
17448 return MVT::i16;
17449 else if (Mask == UINT_MAX)
17450 return MVT::i32;
17451
17452 return MVT::Other;
17453 }
17454 default:
17455 return MVT::Other;
17456 }
17457}
17458
17459/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
17460/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
17461/// SExt/ZExt rather than the scalar SExt/ZExt
17462static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
17463 EVT VT = BV.getValueType();
17464 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
17465 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
17466 return SDValue();
17467
17468 // Use the first item in the buildvector/shuffle to get the size of the
17469 // extend, and make sure it looks valid.
17470 SDValue Extend = BV->getOperand(0);
17471 unsigned ExtendOpcode = Extend.getOpcode();
17472 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
17473 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
17474 ExtendOpcode == ISD::AssertSext;
17475 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
17476 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
17477 return SDValue();
17478 // Shuffle inputs are vectors; limit to SIGN_EXTEND and ZERO_EXTEND so that
17479 // calculatePreExtendType will work without issue.
17480 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
17481 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
17482 return SDValue();
17483
17484 // Restrict valid pre-extend data type
17485 EVT PreExtendType = calculatePreExtendType(Extend);
17486 if (PreExtendType == MVT::Other ||
17487 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
17488 return SDValue();
17489
17490 // Make sure all other operands are equally extended
17491 for (SDValue Op : drop_begin(BV->ops())) {
17492 if (Op.isUndef())
17493 continue;
17494 unsigned Opc = Op.getOpcode();
17495 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
17496 Opc == ISD::AssertSext;
17497 if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
17498 return SDValue();
17499 }
17500
17501 SDValue NBV;
17502 SDLoc DL(BV);
17503 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
17504 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
17505 EVT PreExtendLegalType =
17506 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
17507 SmallVector<SDValue, 8> NewOps;
17508 for (SDValue Op : BV->ops())
17509 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
17510 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
17511 PreExtendLegalType));
17512 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
17513 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
17514 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
17515 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
17516 BV.getOperand(1).isUndef()
17517 ? DAG.getUNDEF(PreExtendVT)
17518 : BV.getOperand(1).getOperand(0),
17519 cast<ShuffleVectorSDNode>(BV)->getMask());
17520 }
17521 return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
17522}
17523
17524/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
17525/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
17526static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
17527 // If the value type isn't a vector, none of the operands are going to be dups
17528 EVT VT = Mul->getValueType(0);
17529 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17530 return SDValue();
17531
17532 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
17533 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
17534
17535 // Neither operands have been changed, don't make any further changes
17536 if (!Op0 && !Op1)
17537 return SDValue();
17538
17539 SDLoc DL(Mul);
17540 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
17541 Op1 ? Op1 : Mul->getOperand(1));
17542}
17543
17544// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
17545// Same for other types with equivalent constants.
17546static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
17547 EVT VT = N->getValueType(0);
17548 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
17549 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
17550 return SDValue();
17551 if (N->getOperand(0).getOpcode() != ISD::AND ||
17552 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
17553 return SDValue();
17554
17555 SDValue And = N->getOperand(0);
17556 SDValue Srl = And.getOperand(0);
17557
17558 APInt V1, V2, V3;
17559 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
17560 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
17561 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
17562 return SDValue();
17563
17564 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
17565 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
17566 V3 != (HalfSize - 1))
17567 return SDValue();
17568
17569 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
17570 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
17571 VT.getVectorElementCount() * 2);
17572
17573 SDLoc DL(N);
17574 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
17575 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
17576 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
17577}
17578
17579static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
17580 TargetLowering::DAGCombinerInfo &DCI,
17581 const AArch64Subtarget *Subtarget) {
17582
17583 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
17584 return Ext;
17585 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
17586 return Ext;
17587
17588 if (DCI.isBeforeLegalizeOps())
17589 return SDValue();
17590
17591 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y so that, in the
17592 // MachineCombiner pass, add+mul can be combined into madd.
17593 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
17594 SDLoc DL(N);
17595 EVT VT = N->getValueType(0);
17596 SDValue N0 = N->getOperand(0);
17597 SDValue N1 = N->getOperand(1);
17598 SDValue MulOper;
17599 unsigned AddSubOpc;
17600
17601 auto IsAddSubWith1 = [&](SDValue V) -> bool {
17602 AddSubOpc = V->getOpcode();
17603 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
17604 SDValue Opnd = V->getOperand(1);
17605 MulOper = V->getOperand(0);
17606 if (AddSubOpc == ISD::SUB)
17607 std::swap(Opnd, MulOper);
17608 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
17609 return C->isOne();
17610 }
17611 return false;
17612 };
17613
17614 if (IsAddSubWith1(N0)) {
17615 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
17616 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
17617 }
17618
17619 if (IsAddSubWith1(N1)) {
17620 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
17621 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
17622 }
17623
17624 // The below optimizations require a constant RHS.
17625 if (!isa<ConstantSDNode>(N1))
17626 return SDValue();
17627
17628 ConstantSDNode *C = cast<ConstantSDNode>(N1);
17629 const APInt &ConstValue = C->getAPIntValue();
17630
17631 // Allow the scaling to be folded into the `cnt` instruction by preventing
17632 // the scaling from being obscured here. This makes it easier to pattern match.
17633 if (IsSVECntIntrinsic(N0) ||
17634 (N0->getOpcode() == ISD::TRUNCATE &&
17635 (IsSVECntIntrinsic(N0->getOperand(0)))))
17636 if (ConstValue.sge(1) && ConstValue.sle(16))
17637 return SDValue();
17638
17639 // Multiplication of a power of two plus/minus one can be done more
17640 // cheaply as shift+add/sub. For now, this is true unilaterally. If
17641 // future CPUs have a cheaper MADD instruction, this may need to be
17642 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
17643 // 64-bit is 5 cycles, so this is always a win.
17644 // More aggressively, some multiplications N0 * C can be lowered to
17645 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
17646 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
17647 // TODO: lower more cases.
17648
17649 // TrailingZeroes is used to test if the mul can be lowered to
17650 // shift+add+shift.
17651 unsigned TrailingZeroes = ConstValue.countr_zero();
17652 if (TrailingZeroes) {
17653 // Conservatively do not lower to shift+add+shift if the mul might be
17654 // folded into smul or umul.
17655 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
17656 isZeroExtended(N0, DAG)))
17657 return SDValue();
17658 // Conservatively do not lower to shift+add+shift if the mul might be
17659 // folded into madd or msub.
17660 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
17661 N->use_begin()->getOpcode() == ISD::SUB))
17662 return SDValue();
17663 }
17664 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
17665 // and shift+add+shift.
17666 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
17667 unsigned ShiftAmt;
17668
17669 auto Shl = [&](SDValue N0, unsigned N1) {
17670 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
17671 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
17672 };
17673 auto Add = [&](SDValue N0, SDValue N1) {
17674 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
17675 };
17676 auto Sub = [&](SDValue N0, SDValue N1) {
17677 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
17678 };
17679 auto Negate = [&](SDValue N) {
17680 SDValue Zero = DAG.getConstant(0, DL, VT);
17681 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
17682 };
17683
17684 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
17685 // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1), as
17686 // the (2^N - 1) can't be executed with a single instruction.
17687 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
17688 unsigned BitWidth = C.getBitWidth();
17689 for (unsigned i = 1; i < BitWidth / 2; i++) {
17690 APInt Rem;
17691 APInt X(BitWidth, (1 << i) + 1);
17692 APInt::sdivrem(C, X, N, Rem);
17693 APInt NVMinus1 = N - 1;
17694 if (Rem == 0 && NVMinus1.isPowerOf2()) {
17695 M = X;
17696 return true;
17697 }
17698 }
17699 return false;
17700 };
17701
17702 // Can the const C be decomposed into ((2^M + 1) * 2^N + 1), eg:
17703 // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1, as
17704 // the (2^N - 1) can't be executed with a single instruction.
17705 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
17706 APInt CVMinus1 = C - 1;
17707 if (CVMinus1.isNegative())
17708 return false;
17709 unsigned TrailingZeroes = CVMinus1.countr_zero();
17710 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
17711 if (SCVMinus1.isPowerOf2()) {
17712 unsigned BitWidth = SCVMinus1.getBitWidth();
17713 M = APInt(BitWidth, SCVMinus1.logBase2());
17714 N = APInt(BitWidth, TrailingZeroes);
17715 return true;
17716 }
17717 return false;
17718 };
17719
17720 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
17721 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
17722 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
17723 APInt CVMinus1 = C - 1;
17724 if (CVMinus1.isNegative())
17725 return false;
17726 unsigned TrailingZeroes = CVMinus1.countr_zero();
17727 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
17728 if (CVPlus1.isPowerOf2()) {
17729 unsigned BitWidth = CVPlus1.getBitWidth();
17730 M = APInt(BitWidth, CVPlus1.logBase2());
17731 N = APInt(BitWidth, TrailingZeroes);
17732 return true;
17733 }
17734 return false;
17735 };
17736
17737 if (ConstValue.isNonNegative()) {
17738 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
17739 // (mul x, 2^N - 1) => (sub (shl x, N), x)
17740 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
17741 // (mul x, (2^M + 1) * (2^N + 1))
17742 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
17743 // (mul x, (2^M + 1) * 2^N + 1)
17744 // => MV = (add (shl x, M), x); (add (shl MV, N), x)
17745 // (mul x, 1 - (1 - 2^M) * 2^N)
17746 // => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
17747 APInt SCVMinus1 = ShiftedConstValue - 1;
17748 APInt SCVPlus1 = ShiftedConstValue + 1;
17749 APInt CVPlus1 = ConstValue + 1;
17750 APInt CVM, CVN;
17751 if (SCVMinus1.isPowerOf2()) {
17752 ShiftAmt = SCVMinus1.logBase2();
17753 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
17754 } else if (CVPlus1.isPowerOf2()) {
17755 ShiftAmt = CVPlus1.logBase2();
17756 return Sub(Shl(N0, ShiftAmt), N0);
17757 } else if (SCVPlus1.isPowerOf2()) {
17758 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17759 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
17760 }
17761 if (Subtarget->hasALULSLFast() &&
17762 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
17763 APInt CVMMinus1 = CVM - 1;
17764 APInt CVNMinus1 = CVN - 1;
17765 unsigned ShiftM1 = CVMMinus1.logBase2();
17766 unsigned ShiftN1 = CVNMinus1.logBase2();
17767 // ALULSLFast implies that shifts by up to 4 places are fast
17768 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
17769 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
17770 return Add(Shl(MVal, ShiftN1), MVal);
17771 }
17772 }
17773 if (Subtarget->hasALULSLFast() &&
17774 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
17775 unsigned ShiftM = CVM.getZExtValue();
17776 unsigned ShiftN = CVN.getZExtValue();
17777 // ALULSLFast implies that shifts by up to 4 places are fast
17778 if (ShiftM <= 4 && ShiftN <= 4) {
17779 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
17780 return Add(Shl(MVal, CVN.getZExtValue()), N0);
17781 }
17782 }
17783
17784 if (Subtarget->hasALULSLFast() &&
17785 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
17786 unsigned ShiftM = CVM.getZExtValue();
17787 unsigned ShiftN = CVN.getZExtValue();
17788 // ALULSLFast implies that shifts by up to 4 places are fast
17789 if (ShiftM <= 4 && ShiftN <= 4) {
17790 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
17791 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
17792 }
17793 }
17794 } else {
17795 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
17796 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
17797 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
17798 APInt SCVPlus1 = -ShiftedConstValue + 1;
17799 APInt CVNegPlus1 = -ConstValue + 1;
17800 APInt CVNegMinus1 = -ConstValue - 1;
17801 if (CVNegPlus1.isPowerOf2()) {
17802 ShiftAmt = CVNegPlus1.logBase2();
17803 return Sub(N0, Shl(N0, ShiftAmt));
17804 } else if (CVNegMinus1.isPowerOf2()) {
17805 ShiftAmt = CVNegMinus1.logBase2();
17806 return Negate(Add(Shl(N0, ShiftAmt), N0));
17807 } else if (SCVPlus1.isPowerOf2()) {
17808 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17809 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
17810 }
17811 }
17812
17813 return SDValue();
17814}
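// Illustration (informal, not part of the original implementation), a few
// of the decompositions implemented above:
//   x * 6  = (2+1)*2     -> (shl (add (shl x, 1), x), 1)
//   x * 15 = 2^4 - 1     -> (sub (shl x, 4), x)
//   x * 45 = (1+4)*(1+8) -> MV = (add (shl x, 2), x); (add (shl MV, 3), MV)
//                           (only on subtargets with the ALULSLFast feature)
//   x * -7 = -(2^3 - 1)  -> (sub x, (shl x, 3))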
17815
17816static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
17817 SelectionDAG &DAG) {
17818 // Take advantage of vector comparisons producing 0 or -1 in each lane to
17819 // optimize away operation when it's from a constant.
17820 //
17821 // The general transformation is:
17822 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
17823 // AND(VECTOR_CMP(x,y), constant2)
17824 // constant2 = UNARYOP(constant)
17825
17826 // Early exit if this isn't a vector operation, the operand of the
17827 // unary operation isn't a bitwise AND, or if the sizes of the operations
17828 // aren't the same.
17829 EVT VT = N->getValueType(0);
17830 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
17831 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
17832 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
17833 return SDValue();
17834
17835 // Now check that the other operand of the AND is a constant. We could
17836 // make the transformation for non-constant splats as well, but it's unclear
17837 // that would be a benefit as it would not eliminate any operations, just
17838 // perform one more step in scalar code before moving to the vector unit.
17839 if (BuildVectorSDNode *BV =
17840 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
17841 // Bail out if the vector isn't a constant.
17842 if (!BV->isConstant())
17843 return SDValue();
17844
17845 // Everything checks out. Build up the new and improved node.
17846 SDLoc DL(N);
17847 EVT IntVT = BV->getValueType(0);
17848 // Create a new constant of the appropriate type for the transformed
17849 // DAG.
17850 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
17851 // The AND node needs bitcasts to/from an integer vector type around it.
17852 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
17853 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
17854 N->getOperand(0)->getOperand(0), MaskConst);
17855 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
17856 return Res;
17857 }
17858
17859 return SDValue();
17860}
17861
17862static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
17863 const AArch64Subtarget *Subtarget) {
17864 // First try to optimize away the conversion when it's conditionally from
17865 // a constant. Vectors only.
17866 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
17867 return Res;
17868
17869 EVT VT = N->getValueType(0);
17870 if (VT != MVT::f32 && VT != MVT::f64)
17871 return SDValue();
17872
17873 // Only optimize when the source and destination types have the same width.
17874 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
17875 return SDValue();
17876
17877 // If the result of an integer load is only used by an integer-to-float
17878 // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
17879 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
17880 SDValue N0 = N->getOperand(0);
17881 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
17882 N0.hasOneUse() &&
17883 // Do not change the width of a volatile load.
17884 !cast<LoadSDNode>(N0)->isVolatile()) {
17885 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
17886 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
17887 LN0->getPointerInfo(), LN0->getAlign(),
17888 LN0->getMemOperand()->getFlags());
17889
17890 // Make sure successors of the original load stay after it by updating them
17891 // to use the new Chain.
17892 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
17893
17894 unsigned Opcode =
17895 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
17896 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
17897 }
17898
17899 return SDValue();
17900}
17901
17902/// Fold a floating-point multiply by power of two into floating-point to
17903/// fixed-point conversion.
17904static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
17905 TargetLowering::DAGCombinerInfo &DCI,
17906 const AArch64Subtarget *Subtarget) {
17907 if (!Subtarget->isNeonAvailable())
17908 return SDValue();
17909
17910 if (!N->getValueType(0).isSimple())
17911 return SDValue();
17912
17913 SDValue Op = N->getOperand(0);
17914 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
17915 return SDValue();
17916
17917 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
17918 return SDValue();
17919
17920 SDValue ConstVec = Op->getOperand(1);
17921 if (!isa<BuildVectorSDNode>(ConstVec))
17922 return SDValue();
17923
17924 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
17925 uint32_t FloatBits = FloatTy.getSizeInBits();
17926 if (FloatBits != 32 && FloatBits != 64 &&
17927 (FloatBits != 16 || !Subtarget->hasFullFP16()))
17928 return SDValue();
17929
17930 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
17931 uint32_t IntBits = IntTy.getSizeInBits();
17932 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17933 return SDValue();
17934
17935 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
17936 if (IntBits > FloatBits)
17937 return SDValue();
17938
17939 BitVector UndefElements;
17940 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17941 int32_t Bits = IntBits == 64 ? 64 : 32;
17942 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
17943 if (C == -1 || C == 0 || C > Bits)
17944 return SDValue();
17945
17946 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
17947 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
17948 return SDValue();
17949
17950 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
17951 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
17952 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
17953 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
17954 return SDValue();
17955 }
17956
17957 SDLoc DL(N);
17958 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
17959 N->getOpcode() == ISD::FP_TO_SINT_SAT);
17960 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
17961 : Intrinsic::aarch64_neon_vcvtfp2fxu;
17962 SDValue FixConv =
17963 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
17964 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
17965 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
17966 // We can handle smaller integers by generating an extra trunc.
17967 if (IntBits < FloatBits)
17968 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
17969
17970 return FixConv;
17971}
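// Illustration (informal, not part of the original implementation):
//   fp_to_sint(fmul v4f32 %x, splat(16.0))
// is a multiply by 2^4, so it is folded here into the fixed-point convert
// intrinsic aarch64.neon.vcvtfp2fxs(%x, 4), i.e. a single
//   fcvtzs v0.4s, v0.4s, #4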
17972
17973static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
17974 const AArch64TargetLowering &TLI) {
17975 EVT VT = N->getValueType(0);
17976 SelectionDAG &DAG = DCI.DAG;
17977 SDLoc DL(N);
17978 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
17979
17980 if (!VT.isVector())
17981 return SDValue();
17982
17983 // The combining code works for NEON, SVE2 and SME.
17984 if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) ||
17985 (VT.isScalableVector() && !Subtarget.hasSVE2()))
17986 return SDValue();
17987
17988 SDValue N0 = N->getOperand(0);
17989 if (N0.getOpcode() != ISD::AND)
17990 return SDValue();
17991
17992 SDValue N1 = N->getOperand(1);
17993 if (N1.getOpcode() != ISD::AND)
17994 return SDValue();
17995
17996 // InstCombine does (not (neg a)) => (add a -1).
17997 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
17998 // Loop over all combinations of AND operands.
17999 for (int i = 1; i >= 0; --i) {
18000 for (int j = 1; j >= 0; --j) {
18001 SDValue O0 = N0->getOperand(i);
18002 SDValue O1 = N1->getOperand(j);
18003 SDValue Sub, Add, SubSibling, AddSibling;
18004
18005 // Find a SUB and an ADD operand, one from each AND.
18006 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
18007 Sub = O0;
18008 Add = O1;
18009 SubSibling = N0->getOperand(1 - i);
18010 AddSibling = N1->getOperand(1 - j);
18011 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
18012 Add = O0;
18013 Sub = O1;
18014 AddSibling = N0->getOperand(1 - i);
18015 SubSibling = N1->getOperand(1 - j);
18016 } else
18017 continue;
18018
18019 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
18020 continue;
18021
18022 // The all-ones constant is always the right-hand operand of the Add.
18023 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
18024 continue;
18025
18026 if (Sub.getOperand(1) != Add.getOperand(0))
18027 continue;
18028
18029 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
18030 }
18031 }
18032
18033 // (or (and a b) (and (not a) c)) => (bsl a b c)
18034 // We only have to look for constant vectors here since the general, variable
18035 // case can be handled in TableGen.
18036 unsigned Bits = VT.getScalarSizeInBits();
18037 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
18038 for (int i = 1; i >= 0; --i)
18039 for (int j = 1; j >= 0; --j) {
18040 APInt Val1, Val2;
18041
18042 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
18043 ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
18044 (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
18045 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
18046 N0->getOperand(1 - i), N1->getOperand(1 - j));
18047 }
18048 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
18049 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
18050 if (!BVN0 || !BVN1)
18051 continue;
18052
18053 bool FoundMatch = true;
18054 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
18055 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
18056 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
18057 if (!CN0 || !CN1 ||
18058 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
18059 FoundMatch = false;
18060 break;
18061 }
18062 }
18063 if (FoundMatch)
18064 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
18065 N0->getOperand(1 - i), N1->getOperand(1 - j));
18066 }
18067
18068 return SDValue();
18069}
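// Illustration (informal, not part of the original implementation): with
// v4i32 operands,
//   or (and %a, splat(0x0000ffff)), (and %b, splat(0xffff0000))
// matches the constant-mask case above and becomes
//   BSP(splat(0x0000ffff), %a, %b)
// i.e. a bit select that takes the low half of each lane from %a and the
// high half from %b.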
18070
18071// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
18072// convert to csel(ccmp(.., cc0)), depending on cc1:
18073
18074// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18075// =>
18076// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
18077//
18078// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18079// =>
18080// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
18081static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
18082 EVT VT = N->getValueType(0);
18083 SDValue CSel0 = N->getOperand(0);
18084 SDValue CSel1 = N->getOperand(1);
18085
18086 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
18087 CSel1.getOpcode() != AArch64ISD::CSEL)
18088 return SDValue();
18089
18090 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
18091 return SDValue();
18092
18093 if (!isNullConstant(CSel0.getOperand(0)) ||
18094 !isOneConstant(CSel0.getOperand(1)) ||
18095 !isNullConstant(CSel1.getOperand(0)) ||
18096 !isOneConstant(CSel1.getOperand(1)))
18097 return SDValue();
18098
18099 SDValue Cmp0 = CSel0.getOperand(3);
18100 SDValue Cmp1 = CSel1.getOperand(3);
18101 AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
18102 AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
18103 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
18104 return SDValue();
18105 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
18106 Cmp0.getOpcode() == AArch64ISD::SUBS) {
18107 std::swap(Cmp0, Cmp1);
18108 std::swap(CC0, CC1);
18109 }
18110
18111 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
18112 return SDValue();
18113
18114 SDLoc DL(N);
18115 SDValue CCmp, Condition;
18116 unsigned NZCV;
18117
18118 if (N->getOpcode() == ISD::AND) {
18119 AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
18120 Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
18121 NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
18122 } else {
18123 AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
18124 Condition = DAG.getConstant(CC0, DL, MVT_CC);
18125 NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
18126 }
18127
18128 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
18129
18130 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
18131 if (Op1 && Op1->getAPIntValue().isNegative() &&
18132 Op1->getAPIntValue().sgt(-32)) {
18133 // CCMP accepts constants in the range [0, 31]. If Op1 is a constant
18134 // in the range [-31, -1], we can select CCMN instead to avoid the
18135 // extra mov.
18136 SDValue AbsOp1 =
18137 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
18138 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
18139 NZCVOp, Condition, Cmp0);
18140 } else {
18141 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
18142 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
18143 }
18144 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
18145 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
18146 CCmp);
18147}
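// Illustration (informal, not part of the original implementation): a chain
// such as
//   and (cset eq, (cmp x0, x1)), (cset lt, (cmp x2, x3))
// becomes, roughly,
//   cmp x0, x1 ; ccmp x2, x3, #<nzcv>, eq ; cset w0, lt
// where the ccmp only performs the second compare when the first condition
// held and otherwise forces NZCV to a value that fails "lt".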
18148
18149static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18150 const AArch64Subtarget *Subtarget,
18151 const AArch64TargetLowering &TLI) {
18152 SelectionDAG &DAG = DCI.DAG;
18153 EVT VT = N->getValueType(0);
18154
18155 if (SDValue R = performANDORCSELCombine(N, DAG))
18156 return R;
18157
18158 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18159 return SDValue();
18160
18161 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
18162 return Res;
18163
18164 return SDValue();
18165}
18166
18167static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
18168 if (!MemVT.getVectorElementType().isSimple())
18169 return false;
18170
18171 uint64_t MaskForTy = 0ull;
18172 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
18173 case MVT::i8:
18174 MaskForTy = 0xffull;
18175 break;
18176 case MVT::i16:
18177 MaskForTy = 0xffffull;
18178 break;
18179 case MVT::i32:
18180 MaskForTy = 0xffffffffull;
18181 break;
18182 default:
18183 return false;
18184 break;
18185 }
18186
18187 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
18188 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
18189 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
18190
18191 return false;
18192}
18193
18194static SDValue performReinterpretCastCombine(SDNode *N) {
18195 SDValue LeafOp = SDValue(N, 0);
18196 SDValue Op = N->getOperand(0);
18197 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
18198 LeafOp.getValueType() != Op.getValueType())
18199 Op = Op->getOperand(0);
18200 if (LeafOp.getValueType() == Op.getValueType())
18201 return Op;
18202 return SDValue();
18203}
18204
18205static SDValue performSVEAndCombine(SDNode *N,
18206 TargetLowering::DAGCombinerInfo &DCI) {
18207 SelectionDAG &DAG = DCI.DAG;
18208 SDValue Src = N->getOperand(0);
18209 unsigned Opc = Src->getOpcode();
18210
18211 // Zero/any extend of an unsigned unpack
18212 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
18213 SDValue UnpkOp = Src->getOperand(0);
18214 SDValue Dup = N->getOperand(1);
18215
18216 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
18217 return SDValue();
18218
18219 SDLoc DL(N);
18220 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
18221 if (!C)
18222 return SDValue();
18223
18224 uint64_t ExtVal = C->getZExtValue();
18225
18226 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
18227 return ((ExtVal == 0xFF && VT == MVT::i8) ||
18228 (ExtVal == 0xFFFF && VT == MVT::i16) ||
18229 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
18230 };
18231
18232 // If the mask is fully covered by the unpack, we don't need to push
18233 // a new AND onto the operand
18234 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
18235 if (MaskAndTypeMatch(EltTy))
18236 return Src;
18237
18238 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
18239 // to see if the mask is all-ones of size MemTy.
18240 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
18241 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
18242 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
18243 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
18244 if (MaskAndTypeMatch(EltTy))
18245 return Src;
18246 }
18247
18248 // Truncate to prevent a DUP with an over-wide constant
18249 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
18250
18251 // Otherwise, make sure we propagate the AND to the operand
18252 // of the unpack
18253 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
18254 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
18255
18256 SDValue And = DAG.getNode(ISD::AND, DL,
18257 UnpkOp->getValueType(0), UnpkOp, Dup);
18258
18259 return DAG.getNode(Opc, DL, N->getValueType(0), And);
18260 }
18261
18262 if (DCI.isBeforeLegalizeOps())
18263 return SDValue();
18264
18265 // If either side of the AND is an all-active predicate, the AND is
18266 // redundant; just return the other operand.
18267 if (isAllActivePredicate(DAG, N->getOperand(0)))
18268 return N->getOperand(1);
18269 if (isAllActivePredicate(DAG, N->getOperand(1)))
18270 return N->getOperand(0);
18271
18273 return SDValue();
18274
18275 SDValue Mask = N->getOperand(1);
18276
18277 if (!Src.hasOneUse())
18278 return SDValue();
18279
18280 EVT MemVT;
18281
18282 // SVE load instructions perform an implicit zero-extend, which makes them
18283 // perfect candidates for combining.
18284 switch (Opc) {
18288 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
18289 break;
18305 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
18306 break;
18307 default:
18308 return SDValue();
18309 }
18310
18311 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
18312 return Src;
18313
18314 return SDValue();
18315}
18316
18317// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
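// For example (a rough sketch; registers and condition codes are illustrative),
// for C code such as "a < b && c < d" on floats this allows
//   fcmp  s0, s1
//   fccmp s2, s3, #<nzcv>, <cc>
//   cset  w0, <cc>
// instead of materializing both i1 results and AND-ing them together.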
18320
18321 // This function performs an optimization on a specific pattern involving
18322 // an AND operation and SETCC (Set Condition Code) node.
18323
18324 SDValue SetCC = N->getOperand(0);
18325 EVT VT = N->getValueType(0);
18326 SelectionDAG &DAG = DCI.DAG;
18327
18328 // If the current node (N) is used by any SELECT instruction, return an
18329 // empty SDValue and skip the optimization, since applying it there could
18330 // produce incorrect results.
18331 for (auto U : N->uses())
18332 if (U->getOpcode() == ISD::SELECT)
18333 return SDValue();
18334
18335 // Check if the operand is a SETCC node with floating-point comparison
18336 if (SetCC.getOpcode() == ISD::SETCC &&
18337 SetCC.getOperand(0).getValueType() == MVT::f32) {
18338
18339 SDValue Cmp;
18341
18342 // Check if the DAG is after legalization and if we can emit the conjunction
18343 if (!DCI.isBeforeLegalize() &&
18344 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
18345
18347
18348 SDLoc DL(N);
18349 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
18350 DAG.getConstant(0, DL, VT),
18351 DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
18352 }
18353 }
18354 return SDValue();
18355}
18356
18359 SelectionDAG &DAG = DCI.DAG;
18360 SDValue LHS = N->getOperand(0);
18361 SDValue RHS = N->getOperand(1);
18362 EVT VT = N->getValueType(0);
18363
18364 if (SDValue R = performANDORCSELCombine(N, DAG))
18365 return R;
18366
18367 if (SDValue R = performANDSETCCCombine(N,DCI))
18368 return R;
18369
18370 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18371 return SDValue();
18372
18373 if (VT.isScalableVector())
18374 return performSVEAndCombine(N, DCI);
18375
18376 // The combining code below works only for NEON vectors. In particular, it
18377 // does not work for SVE when dealing with vectors wider than 128 bits.
18378 if (!VT.is64BitVector() && !VT.is128BitVector())
18379 return SDValue();
18380
18381 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
18382 if (!BVN)
18383 return SDValue();
18384
18385 // AND does not accept an immediate, so check if we can use a BIC immediate
18386 // instruction instead. We do this here instead of using a (and x, (mvni imm))
18387 // pattern in isel, because some immediates may be lowered to the preferred
18388 // (and x, (movi imm)) form, even though an mvni representation also exists.
18389 APInt DefBits(VT.getSizeInBits(), 0);
18390 APInt UndefBits(VT.getSizeInBits(), 0);
18391 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
18392 SDValue NewOp;
18393
18394 // Any bits known to already be 0 need not be cleared again, which can help
18395 // reduce the size of the immediate to one supported by the instruction.
18396 KnownBits Known = DAG.computeKnownBits(LHS);
18397 APInt ZeroSplat(VT.getSizeInBits(), 0);
18398 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
18399 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
18400 << (Known.Zero.getBitWidth() * I);
18401
18402 DefBits = ~(DefBits | ZeroSplat);
18403 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18404 DefBits, &LHS)) ||
18405 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18406 DefBits, &LHS)))
18407 return NewOp;
18408
18409 UndefBits = ~(UndefBits | ZeroSplat);
18410 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18411 UndefBits, &LHS)) ||
18412 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18413 UndefBits, &LHS)))
18414 return NewOp;
18415 }
18416
18417 return SDValue();
18418}
18419
18422 SelectionDAG &DAG = DCI.DAG;
18423 SDValue LHS = N->getOperand(0);
18424 SDValue RHS = N->getOperand(1);
18425 EVT VT = N->getValueType(0);
18426 SDLoc DL(N);
18427
18428 if (!N->getFlags().hasAllowReassociation())
18429 return SDValue();
18430
18431 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
18432 auto ReassocComplex = [&](SDValue A, SDValue B) {
18433 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
18434 return SDValue();
18435 unsigned Opc = A.getConstantOperandVal(0);
18436 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
18437 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
18438 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
18439 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
18440 return SDValue();
18441 SDValue VCMLA = DAG.getNode(
18442 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
18443 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
18444 A.getOperand(2), A.getOperand(3));
18445 VCMLA->setFlags(A->getFlags());
18446 return VCMLA;
18447 };
18448 if (SDValue R = ReassocComplex(LHS, RHS))
18449 return R;
18450 if (SDValue R = ReassocComplex(RHS, LHS))
18451 return R;
18452
18453 return SDValue();
18454}
18455
18456static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
18457 switch (Opcode) {
18458 case ISD::STRICT_FADD:
18459 case ISD::FADD:
18460 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
18461 case ISD::ADD:
18462 return VT == MVT::i64;
18463 default:
18464 return false;
18465 }
18466}
18467
18468static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
18470
18472 if ((N.getOpcode() == ISD::SETCC) ||
18473 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
18474 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
18475 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
18476 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
18477 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
18478 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
18479 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
18480 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
18481 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
18482 // get_active_lane_mask is lowered to a whilelo instruction.
18483 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
18484 return true;
18485
18486 return false;
18487}
18488
18489// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
18490// ... into: "ptrue p, all" + PTEST
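// For example (a rough sketch; the names below are illustrative):
//   %p = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(...)
//   %b = extractelement <vscale x 4 x i1> %p, i64 0
// can be selected as a PTEST of %p against an all-true governing predicate
// plus a CSET on the FIRST_ACTIVE condition, instead of unpacking the
// predicate just to read lane 0.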
18491static SDValue
18494 const AArch64Subtarget *Subtarget) {
18495 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18496 // Make sure PTEST can be legalised with illegal types.
18497 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18498 return SDValue();
18499
18500 SDValue N0 = N->getOperand(0);
18501 EVT VT = N0.getValueType();
18502
18503 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
18504 !isNullConstant(N->getOperand(1)))
18505 return SDValue();
18506
18507 // Restrict the DAG combine to cases where we're extracting from a
18508 // flag-setting operation.
18509 if (!isPredicateCCSettingOp(N0))
18510 return SDValue();
18511
18512 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
18513 SelectionDAG &DAG = DCI.DAG;
18514 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
18515 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
18516}
18517
18518// Materialize : Idx = (add (mul vscale, NumEls), -1)
18519// i1 = extract_vector_elt t37, Constant:i64<Idx>
18520// ... into: "ptrue p, all" + PTEST
18521static SDValue
18524 const AArch64Subtarget *Subtarget) {
18525 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18526 // Make sure PTEST can be legalised with illegal types.
18527 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18528 return SDValue();
18529
18530 SDValue N0 = N->getOperand(0);
18531 EVT OpVT = N0.getValueType();
18532
18533 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
18534 return SDValue();
18535
18536 // Idx == (add (mul vscale, NumEls), -1)
18537 SDValue Idx = N->getOperand(1);
18538 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
18539 return SDValue();
18540
18541 SDValue VS = Idx.getOperand(0);
18542 if (VS.getOpcode() != ISD::VSCALE)
18543 return SDValue();
18544
18545 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
18546 if (VS.getConstantOperandVal(0) != NumEls)
18547 return SDValue();
18548
18549 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
18550 SelectionDAG &DAG = DCI.DAG;
18551 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
18552 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
18553}
18554
18555static SDValue
18557 const AArch64Subtarget *Subtarget) {
18558 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18559 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
18560 return Res;
18561 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
18562 return Res;
18563
18564 SelectionDAG &DAG = DCI.DAG;
18565 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18566
18567 EVT VT = N->getValueType(0);
18568 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
18569 bool IsStrict = N0->isStrictFPOpcode();
18570
18571 // extract(dup x) -> x
18572 if (N0.getOpcode() == AArch64ISD::DUP)
18573 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
18574 : N0.getOperand(0);
18575
18576 // Rewrite for pairwise fadd pattern
18577 // (f32 (extract_vector_elt
18578 // (fadd (vXf32 Other)
18579 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
18580 // ->
18581 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
18582 // (extract_vector_elt (vXf32 Other) 1))
18583 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
18584 // we can only do this when it's used only by the extract_vector_elt.
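  // For example (a rough sketch; register choices are illustrative), a
  // horizontal add of the two lanes of a v2f32 can then be matched to
  //   faddp s0, v0.2s
  // instead of a shuffle/dup followed by a scalar fadd.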
18585 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
18586 (!IsStrict || N0.hasOneUse())) {
18587 SDLoc DL(N0);
18588 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
18589 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
18590
18591 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
18592 SDValue Other = N00;
18593
18594 // And handle the commutative case.
18595 if (!Shuffle) {
18596 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
18597 Other = N01;
18598 }
18599
18600 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
18601 Other == Shuffle->getOperand(0)) {
18602 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18603 DAG.getConstant(0, DL, MVT::i64));
18604 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18605 DAG.getConstant(1, DL, MVT::i64));
18606 if (!IsStrict)
18607 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
18608
18609 // For strict_fadd we need uses of the final extract_vector to be replaced
18610 // with the strict_fadd, but we also need uses of the chain output of the
18611 // original strict_fadd to use the chain output of the new strict_fadd as
18612 // otherwise it may not be deleted.
18613 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
18614 {VT, MVT::Other},
18615 {N0->getOperand(0), Extract1, Extract2});
18616 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
18617 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
18618 return SDValue(N, 0);
18619 }
18620 }
18621
18622 return SDValue();
18623}
18624
18627 SelectionDAG &DAG) {
18628 SDLoc dl(N);
18629 EVT VT = N->getValueType(0);
18630 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18631 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
18632
18633 if (VT.isScalableVector())
18634 return SDValue();
18635
18636 // Optimize concat_vectors of truncated vectors, where the intermediate
18637 // type is illegal, to avoid said illegality, e.g.,
18638 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
18639 // (v2i16 (truncate (v2i64)))))
18640 // ->
18641 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
18642 // (v4i32 (bitcast (v2i64))),
18643 // <0, 2, 4, 6>)))
18644 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
18645 // on both input and result type, so we might generate worse code.
18646 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
18647 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18648 N1Opc == ISD::TRUNCATE) {
18649 SDValue N00 = N0->getOperand(0);
18650 SDValue N10 = N1->getOperand(0);
18651 EVT N00VT = N00.getValueType();
18652
18653 if (N00VT == N10.getValueType() &&
18654 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
18655 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
18656 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
18658 for (size_t i = 0; i < Mask.size(); ++i)
18659 Mask[i] = i * 2;
18660 return DAG.getNode(ISD::TRUNCATE, dl, VT,
18661 DAG.getVectorShuffle(
18662 MidVT, dl,
18663 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
18664 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
18665 }
18666 }
18667
18668 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
18669 N->getOperand(0).getValueType() == MVT::v2i16 ||
18670 N->getOperand(0).getValueType() == MVT::v2i8) {
18671 EVT SrcVT = N->getOperand(0).getValueType();
18672 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
18673 // loads to prevent having to go through the v4i8 load legalization that
18674 // needs to extend each element into a larger type.
18675 if (N->getNumOperands() % 2 == 0 &&
18676 all_of(N->op_values(), [SrcVT](SDValue V) {
18677 if (V.getValueType() != SrcVT)
18678 return false;
18679 if (V.isUndef())
18680 return true;
18681 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
18682 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
18683 LD->getExtensionType() == ISD::NON_EXTLOAD;
18684 })) {
18685 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
18686 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
18688
18689 for (unsigned i = 0; i < N->getNumOperands(); i++) {
18690 SDValue V = N->getOperand(i);
18691 if (V.isUndef())
18692 Ops.push_back(DAG.getUNDEF(FVT));
18693 else {
18694 LoadSDNode *LD = cast<LoadSDNode>(V);
18695 SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
18696 LD->getBasePtr(), LD->getMemOperand());
18697 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
18698 Ops.push_back(NewLoad);
18699 }
18700 }
18701 return DAG.getBitcast(N->getValueType(0),
18702 DAG.getBuildVector(NVT, dl, Ops));
18703 }
18704 }
18705
18706 // Canonicalise concat_vectors to replace concatenations of truncated nots
18707 // with nots of concatenated truncates. This in some cases allows for multiple
18708 // redundant negations to be eliminated.
18709 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
18710 // (v4i16 (truncate (not (v4i32)))))
18711 // ->
18712 // (not (concat_vectors (v4i16 (truncate (v4i32))),
18713 // (v4i16 (truncate (v4i32)))))
18714 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18715 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
18716 N->isOnlyUserOf(N1.getNode())) {
18717 auto isBitwiseVectorNegate = [](SDValue V) {
18718 return V->getOpcode() == ISD::XOR &&
18719 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
18720 };
18721 SDValue N00 = N0->getOperand(0);
18722 SDValue N10 = N1->getOperand(0);
18723 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
18724 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
18725 return DAG.getNOT(
18726 dl,
18727 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18728 DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
18729 N00->getOperand(0)),
18730 DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
18731 N10->getOperand(0))),
18732 VT);
18733 }
18734 }
18735
18736 // Wait till after everything is legalized to try this. That way we have
18737 // legal vector types and such.
18738 if (DCI.isBeforeLegalizeOps())
18739 return SDValue();
18740
18741 // Optimise concat_vectors of two identical binops with a 128-bit destination
18742 // size, combining into a binop of two concats of the source vectors, e.g.:
18743 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
18744 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
18745 DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() &&
18746 N1->hasOneUse()) {
18747 SDValue N00 = N0->getOperand(0);
18748 SDValue N01 = N0->getOperand(1);
18749 SDValue N10 = N1->getOperand(0);
18750 SDValue N11 = N1->getOperand(1);
18751
18752 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
18753 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N00, N10);
18754 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N01, N11);
18755 return DAG.getNode(N0Opc, dl, VT, Concat0, Concat1);
18756 }
18757 }
18758
18759 auto IsRSHRN = [](SDValue Shr) {
18760 if (Shr.getOpcode() != AArch64ISD::VLSHR)
18761 return false;
18762 SDValue Op = Shr.getOperand(0);
18763 EVT VT = Op.getValueType();
18764 unsigned ShtAmt = Shr.getConstantOperandVal(1);
18765 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
18766 return false;
18767
18768 APInt Imm;
18769 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
18770 Imm = APInt(VT.getScalarSizeInBits(),
18771 Op.getOperand(1).getConstantOperandVal(0)
18772 << Op.getOperand(1).getConstantOperandVal(1));
18773 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
18774 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
18775 Imm = APInt(VT.getScalarSizeInBits(),
18776 Op.getOperand(1).getConstantOperandVal(0));
18777 else
18778 return false;
18779
18780 if (Imm != 1ULL << (ShtAmt - 1))
18781 return false;
18782 return true;
18783 };
18784
18785 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
18786 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
18787 ((IsRSHRN(N1) &&
18789 N1.isUndef())) {
18790 SDValue X = N0.getOperand(0).getOperand(0);
18791 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
18792 : N1.getOperand(0).getOperand(0);
18793 EVT BVT =
18794 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
18795 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y);
18796 SDValue Add = DAG.getNode(
18797 ISD::ADD, dl, BVT, CC,
18798 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT));
18799 SDValue Shr =
18800 DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1));
18801 return Shr;
18802 }
18803
18804 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
18805 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
18806 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
18807 N0.getOperand(1) == N1.getOperand(1)) {
18808 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0),
18809 DAG.getUNDEF(N0.getValueType()));
18810 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1),
18811 DAG.getUNDEF(N0.getValueType()));
18812 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1);
18813 }
18814
18815 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
18816 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
18817 // canonicalise to that.
18818 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
18819 assert(VT.getScalarSizeInBits() == 64);
18820 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
18821 DAG.getConstant(0, dl, MVT::i64));
18822 }
18823
18824 // Canonicalise concat_vectors so that the right-hand vector has as few
18825 // bit-casts as possible before its real operation. The primary matching
18826 // destination for these operations will be the narrowing "2" instructions,
18827 // which depend on the operation being performed on this right-hand vector.
18828 // For example,
18829 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
18830 // becomes
18831 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
18832
18833 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
18834 return SDValue();
18835 SDValue RHS = N1->getOperand(0);
18836 MVT RHSTy = RHS.getValueType().getSimpleVT();
18837 // If the RHS is not a vector, this is not the pattern we're looking for.
18838 if (!RHSTy.isVector())
18839 return SDValue();
18840
18841 LLVM_DEBUG(
18842 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
18843
18844 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
18845 RHSTy.getVectorNumElements() * 2);
18846 return DAG.getNode(ISD::BITCAST, dl, VT,
18847 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
18848 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
18849 RHS));
18850}
18851
18852static SDValue
18854 SelectionDAG &DAG) {
18855 if (DCI.isBeforeLegalizeOps())
18856 return SDValue();
18857
18858 EVT VT = N->getValueType(0);
18859 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
18860 return SDValue();
18861
18862 SDValue V = N->getOperand(0);
18863
18864 // NOTE: This combine exists in DAGCombiner, but that version's legality check
18865 // blocks this combine because the non-const case requires custom lowering.
18866 //
18867 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
18868 if (V.getOpcode() == ISD::SPLAT_VECTOR)
18869 if (isa<ConstantSDNode>(V.getOperand(0)))
18870 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
18871
18872 return SDValue();
18873}
18874
18875static SDValue
18877 SelectionDAG &DAG) {
18878 SDLoc DL(N);
18879 SDValue Vec = N->getOperand(0);
18880 SDValue SubVec = N->getOperand(1);
18881 uint64_t IdxVal = N->getConstantOperandVal(2);
18882 EVT VecVT = Vec.getValueType();
18883 EVT SubVT = SubVec.getValueType();
18884
18885 // Only do this for legal fixed vector types.
18886 if (!VecVT.isFixedLengthVector() ||
18887 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
18888 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
18889 return SDValue();
18890
18891 // Ignore widening patterns.
18892 if (IdxVal == 0 && Vec.isUndef())
18893 return SDValue();
18894
18895 // Subvector must be half the width and an "aligned" insertion.
18896 unsigned NumSubElts = SubVT.getVectorNumElements();
18897 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
18898 (IdxVal != 0 && IdxVal != NumSubElts))
18899 return SDValue();
18900
18901 // Fold insert_subvector -> concat_vectors
18902 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
18903 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
18904 SDValue Lo, Hi;
18905 if (IdxVal == 0) {
18906 Lo = SubVec;
18907 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18908 DAG.getVectorIdxConstant(NumSubElts, DL));
18909 } else {
18910 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18911 DAG.getVectorIdxConstant(0, DL));
18912 Hi = SubVec;
18913 }
18914 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
18915}
18916
18919 SelectionDAG &DAG) {
18920 // Wait until after everything is legalized to try this. That way we have
18921 // legal vector types and such.
18922 if (DCI.isBeforeLegalizeOps())
18923 return SDValue();
18924 // Transform a scalar conversion of a value from a lane extract into a
18925 // lane extract of a vector conversion. E.g., from foo1 to foo2:
18926 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
18927 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
18928 //
18929 // The second form interacts better with instruction selection and the
18930 // register allocator to avoid cross-class register copies that aren't
18931 // coalescable due to a lane reference.
18932
18933 // Check the operand and see if it originates from a lane extract.
18934 SDValue Op1 = N->getOperand(1);
18936 return SDValue();
18937
18938 // Yep, no additional predication needed. Perform the transform.
18939 SDValue IID = N->getOperand(0);
18940 SDValue Shift = N->getOperand(2);
18941 SDValue Vec = Op1.getOperand(0);
18942 SDValue Lane = Op1.getOperand(1);
18943 EVT ResTy = N->getValueType(0);
18944 EVT VecResTy;
18945 SDLoc DL(N);
18946
18947 // The vector width should be 128 bits by the time we get here, even
18948 // if it started as 64 bits (the extract_vector handling will have
18949 // done so). Bail if it is not.
18950 if (Vec.getValueSizeInBits() != 128)
18951 return SDValue();
18952
18953 if (Vec.getValueType() == MVT::v4i32)
18954 VecResTy = MVT::v4f32;
18955 else if (Vec.getValueType() == MVT::v2i64)
18956 VecResTy = MVT::v2f64;
18957 else
18958 return SDValue();
18959
18960 SDValue Convert =
18961 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
18962 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
18963}
18964
18965// AArch64 high-vector "long" operations are formed by performing the non-high
18966// version on an extract_subvector of each operand which gets the high half:
18967//
18968// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
18969//
18970// However, there are cases which don't have an extract_high explicitly, but
18971// have another operation that can be made compatible with one for free. For
18972// example:
18973//
18974// (dupv64 scalar) --> (extract_high (dup128 scalar))
18975//
18976// This routine does the actual conversion of such DUPs, once outer routines
18977// have determined that everything else is in order.
18978// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
18979// similarly here.
18981 MVT VT = N.getSimpleValueType();
18982 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18983 N.getConstantOperandVal(1) == 0)
18984 N = N.getOperand(0);
18985
18986 switch (N.getOpcode()) {
18987 case AArch64ISD::DUP:
18992 case AArch64ISD::MOVI:
18998 break;
18999 default:
19000 // FMOV could be supported, but isn't very useful, as it would only occur
19001 // if you passed a bitcast of a floating point immediate to an eligible long
19002 // integer op (addl, smull, ...).
19003 return SDValue();
19004 }
19005
19006 if (!VT.is64BitVector())
19007 return SDValue();
19008
19009 SDLoc DL(N);
19010 unsigned NumElems = VT.getVectorNumElements();
19011 if (N.getValueType().is64BitVector()) {
19012 MVT ElementTy = VT.getVectorElementType();
19013 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
19014 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
19015 }
19016
19017 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
19018 DAG.getConstant(NumElems, DL, MVT::i64));
19019}
19020
19022 if (N.getOpcode() == ISD::BITCAST)
19023 N = N.getOperand(0);
19024 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19025 return false;
19026 if (N.getOperand(0).getValueType().isScalableVector())
19027 return false;
19028 return N.getConstantOperandAPInt(1) ==
19029 N.getOperand(0).getValueType().getVectorNumElements() / 2;
19030}
19031
19032/// Helper structure to keep track of ISD::SET_CC operands.
19037};
19038
19039/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
19041 const SDValue *Cmp;
19043};
19044
19045/// Helper structure to keep track of SetCC information.
19049};
19050
19051/// Helper structure to be able to read SetCC information. If set to
19052/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
19053/// GenericSetCCInfo.
19057};
19058
19059/// Check whether or not \p Op is a SET_CC operation, either a generic or an
19060/// AArch64 lowered one.
19062/// \p SetCCInfo is filled accordingly.
19063/// \post SetCCInfo is meaningful only when this function returns true.
19064/// \return True when Op is a kind of SET_CC operation.
19066 // If this is a setcc, this is straightforward.
19067 if (Op.getOpcode() == ISD::SETCC) {
19068 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
19069 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
19070 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
19071 SetCCInfo.IsAArch64 = false;
19072 return true;
19073 }
19074 // Otherwise, check if this is a matching csel instruction.
19075 // In other words:
19076 // - csel 1, 0, cc
19077 // - csel 0, 1, !cc
19078 if (Op.getOpcode() != AArch64ISD::CSEL)
19079 return false;
19080 // Set the information about the operands.
19081 // TODO: we want the operands of the Cmp not the csel
19082 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
19083 SetCCInfo.IsAArch64 = true;
19084 SetCCInfo.Info.AArch64.CC =
19085 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19086
19087 // Check that the operands match the constraints:
19088 // (1) Both operands must be constants.
19089 // (2) One must be 1 and the other must be 0.
19090 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
19091 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
19092
19093 // Check (1).
19094 if (!TValue || !FValue)
19095 return false;
19096
19097 // Check (2).
19098 if (!TValue->isOne()) {
19099 // Update the comparison when we are interested in !cc.
19100 std::swap(TValue, FValue);
19101 SetCCInfo.Info.AArch64.CC =
19103 }
19104 return TValue->isOne() && FValue->isZero();
19105}
19106
19107// Returns true if Op is setcc or zext of setcc.
19108static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
19109 if (isSetCC(Op, Info))
19110 return true;
19111 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
19112 isSetCC(Op->getOperand(0), Info));
19113}
19114
19115// The folding we want to perform is:
19116// (add x, [zext] (setcc cc ...) )
19117// -->
19118// (csel x, (add x, 1), !cc ...)
19119//
19120// The latter will get matched to a CSINC instruction.
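// For example (a rough sketch; register choices are illustrative), for
//   return x + (a == b);
// we can emit
//   cmp  w1, w2
//   cinc w0, w0, eq
// instead of a cset followed by a separate add.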
19122 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
19123 SDValue LHS = Op->getOperand(0);
19124 SDValue RHS = Op->getOperand(1);
19125 SetCCInfoAndKind InfoAndKind;
19126
19127 // If both operands are a SET_CC, then we don't want to perform this
19128 // folding and create another csel as this results in more instructions
19129 // (and higher register usage).
19130 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
19131 isSetCCOrZExtSetCC(RHS, InfoAndKind))
19132 return SDValue();
19133
19134 // If neither operand is a SET_CC, give up.
19135 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
19136 std::swap(LHS, RHS);
19137 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
19138 return SDValue();
19139 }
19140
19141 // FIXME: This could be generalized to work for FP comparisons.
19142 EVT CmpVT = InfoAndKind.IsAArch64
19143 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
19144 : InfoAndKind.Info.Generic.Opnd0->getValueType();
19145 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
19146 return SDValue();
19147
19148 SDValue CCVal;
19149 SDValue Cmp;
19150 SDLoc dl(Op);
19151 if (InfoAndKind.IsAArch64) {
19152 CCVal = DAG.getConstant(
19154 MVT::i32);
19155 Cmp = *InfoAndKind.Info.AArch64.Cmp;
19156 } else
19157 Cmp = getAArch64Cmp(
19158 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
19159 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
19160 dl);
19161
19162 EVT VT = Op->getValueType(0);
19163 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
19164 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
19165}
19166
19167// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
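// For example (illustrative), with a and b both v4i32:
//   add(uaddv(a), uaddv(b))     // two cross-lane reductions
// becomes
//   uaddv(add(a, b))            // one vector add plus a single reduction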
19169 EVT VT = N->getValueType(0);
19170 // Only scalar integer and vector types.
19171 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
19172 return SDValue();
19173
19174 SDValue LHS = N->getOperand(0);
19175 SDValue RHS = N->getOperand(1);
19176 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19177 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
19178 return SDValue();
19179
19180 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
19181 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
19182 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
19183 return SDValue();
19184
19185 SDValue Op1 = LHS->getOperand(0);
19186 SDValue Op2 = RHS->getOperand(0);
19187 EVT OpVT1 = Op1.getValueType();
19188 EVT OpVT2 = Op2.getValueType();
19189 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
19190 Op2.getOpcode() != AArch64ISD::UADDV ||
19191 OpVT1.getVectorElementType() != VT)
19192 return SDValue();
19193
19194 SDValue Val1 = Op1.getOperand(0);
19195 SDValue Val2 = Op2.getOperand(0);
19196 EVT ValVT = Val1->getValueType(0);
19197 SDLoc DL(N);
19198 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
19199 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
19200 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
19201 DAG.getConstant(0, DL, MVT::i64));
19202}
19203
19204/// Perform the scalar expression combine in the form of:
19205/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
19206/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
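/// For example (illustrative), with c == 5 and cc == eq:
///   CSEL(5, 1, eq) + b  ==>  CSINC(b + 5, b, eq)
/// i.e. roughly "add w8, w1, #5; csinc w0, w8, w1, eq" instead of
/// materializing the CSEL result and adding it separately.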
19208 EVT VT = N->getValueType(0);
19209 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
19210 return SDValue();
19211
19212 SDValue LHS = N->getOperand(0);
19213 SDValue RHS = N->getOperand(1);
19214
19215 // Handle commutativity.
19216 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19217 LHS.getOpcode() != AArch64ISD::CSNEG) {
19218 std::swap(LHS, RHS);
19219 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19220 LHS.getOpcode() != AArch64ISD::CSNEG) {
19221 return SDValue();
19222 }
19223 }
19224
19225 if (!LHS.hasOneUse())
19226 return SDValue();
19227
19228 AArch64CC::CondCode AArch64CC =
19229 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
19230
19231 // The CSEL should have a constant One operand, and the CSNEG should have a
19232 // One or NegOne operand.
19233 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
19234 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
19235 if (!CTVal || !CFVal)
19236 return SDValue();
19237
19238 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
19239 (CTVal->isOne() || CFVal->isOne())) &&
19240 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
19241 (CTVal->isOne() || CFVal->isAllOnes())))
19242 return SDValue();
19243
19244 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
19245 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
19246 !CFVal->isOne()) {
19247 std::swap(CTVal, CFVal);
19248 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19249 }
19250
19251 SDLoc DL(N);
19252 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
19253 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
19254 !CFVal->isAllOnes()) {
19255 APInt C = -1 * CFVal->getAPIntValue();
19256 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
19257 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
19258 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19259 }
19260
19261 // It might be neutral for larger constants, as the immediate needs to be
19262 // materialized in a register.
19263 APInt ADDC = CTVal->getAPIntValue();
19264 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19265 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
19266 return SDValue();
19267
19268 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
19269 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
19270 "Unexpected constant value");
19271
19272 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
19273 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
19274 SDValue Cmp = LHS.getOperand(3);
19275
19276 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
19277}
19278
19279// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
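// For example (a rough sketch; register choices are illustrative):
//   movi v2.4s, #0
//   udot v2.4s, v0.16b, v1.16b
//   add  v2.4s, v2.4s, v3.4s
// becomes a single instruction that accumulates directly into A:
//   udot v3.4s, v0.16b, v1.16b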
19281 EVT VT = N->getValueType(0);
19282 if (N->getOpcode() != ISD::ADD)
19283 return SDValue();
19284
19285 SDValue Dot = N->getOperand(0);
19286 SDValue A = N->getOperand(1);
19287 // Handle commutativity.
19288 auto isZeroDot = [](SDValue Dot) {
19289 return (Dot.getOpcode() == AArch64ISD::UDOT ||
19290 Dot.getOpcode() == AArch64ISD::SDOT) &&
19292 };
19293 if (!isZeroDot(Dot))
19294 std::swap(Dot, A);
19295 if (!isZeroDot(Dot))
19296 return SDValue();
19297
19298 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
19299 Dot.getOperand(2));
19300}
19301
19303 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
19304}
19305
19307 SDLoc DL(Op);
19308 EVT VT = Op.getValueType();
19309 SDValue Zero = DAG.getConstant(0, DL, VT);
19310 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
19311}
19312
19313// Try to fold
19314//
19315// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
19316//
19317// The folding helps csel to be matched with csneg without generating
19318// redundant neg instruction, which includes negation of the csel expansion
19319// of abs node lowered by lowerABS.
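// For example (a rough sketch; the condition code is illustrative), for
// -(abs(x)) the abs expands to a compare plus a csel between x and -x;
// folding the outer neg through the csel lets csneg match:
//   cmp   w0, #0
//   csneg w0, w0, w0, <cond>
// instead of a csel followed by a separate neg.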
19321 if (!isNegatedInteger(SDValue(N, 0)))
19322 return SDValue();
19323
19324 SDValue CSel = N->getOperand(1);
19325 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
19326 return SDValue();
19327
19328 SDValue N0 = CSel.getOperand(0);
19329 SDValue N1 = CSel.getOperand(1);
19330
19331 // If neither of them is a negation, the fold is not worthwhile, as it would
19332 // introduce two additional negations while removing only one.
19333 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
19334 return SDValue();
19335
19336 SDValue N0N = getNegatedInteger(N0, DAG);
19337 SDValue N1N = getNegatedInteger(N1, DAG);
19338
19339 SDLoc DL(N);
19340 EVT VT = CSel.getValueType();
19341 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
19342 CSel.getOperand(3));
19343}
19344
19345// The basic add/sub long vector instructions have variants with "2" on the end
19346// which act on the high-half of their inputs. They are normally matched by
19347// patterns like:
19348//
19349// (add (zeroext (extract_high LHS)),
19350// (zeroext (extract_high RHS)))
19351// -> uaddl2 vD, vN, vM
19352//
19353// However, if one of the extracts is something like a duplicate, this
19354// instruction can still be used profitably. This function puts the DAG into a
19355// more appropriate form for those patterns to trigger.
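// For example (a rough sketch; register choices are illustrative):
//   (add (zext (extract_high X:v16i8)), (zext (dup v8i8 s)))
// cannot use uaddl2 directly, but after rewriting the dup as
//   (extract_high (dup v16i8 s))
// the whole expression can select to a single
//   uaddl2 v0.8h, v1.16b, v2.16b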
19358 SelectionDAG &DAG = DCI.DAG;
19359 if (DCI.isBeforeLegalizeOps())
19360 return SDValue();
19361
19362 MVT VT = N->getSimpleValueType(0);
19363 if (!VT.is128BitVector()) {
19364 if (N->getOpcode() == ISD::ADD)
19365 return performSetccAddFolding(N, DAG);
19366 return SDValue();
19367 }
19368
19369 // Make sure both branches are extended in the same way.
19370 SDValue LHS = N->getOperand(0);
19371 SDValue RHS = N->getOperand(1);
19372 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
19373 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
19374 LHS.getOpcode() != RHS.getOpcode())
19375 return SDValue();
19376
19377 unsigned ExtType = LHS.getOpcode();
19378
19379 // It's only worth doing this if at least one of the inputs is already an
19380 // extract, but we don't know which it will be, so we have to try both.
19381 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
19382 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
19383 if (!RHS.getNode())
19384 return SDValue();
19385
19386 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
19387 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
19388 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
19389 if (!LHS.getNode())
19390 return SDValue();
19391
19392 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
19393 }
19394
19395 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
19396}
19397
19398static bool isCMP(SDValue Op) {
19399 return Op.getOpcode() == AArch64ISD::SUBS &&
19400 !Op.getNode()->hasAnyUseOfValue(0);
19401}
19402
19403// (CSEL 1 0 CC Cond) => CC
19404// (CSEL 0 1 CC Cond) => !CC
19405static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
19406 if (Op.getOpcode() != AArch64ISD::CSEL)
19407 return std::nullopt;
19408 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19409 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
19410 return std::nullopt;
19411 SDValue OpLHS = Op.getOperand(0);
19412 SDValue OpRHS = Op.getOperand(1);
19413 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
19414 return CC;
19415 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
19416 return getInvertedCondCode(CC);
19417
19418 return std::nullopt;
19419}
19420
19421// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
19422// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
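// For example (a rough sketch; register choices are illustrative), a 128-bit
// add legalized via ADDS/ADC may re-materialize the carry with CSET and
// re-test it with CMP before the ADC; this fold feeds the original flags
// straight back in, giving
//   adds x0, x0, x2
//   adc  x1, x1, x3
// rather than adds / cset / cmp / adc.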
19423static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
19424 SDValue CmpOp = Op->getOperand(2);
19425 if (!isCMP(CmpOp))
19426 return SDValue();
19427
19428 if (IsAdd) {
19429 if (!isOneConstant(CmpOp.getOperand(1)))
19430 return SDValue();
19431 } else {
19432 if (!isNullConstant(CmpOp.getOperand(0)))
19433 return SDValue();
19434 }
19435
19436 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
19437 auto CC = getCSETCondCode(CsetOp);
19438 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
19439 return SDValue();
19440
19441 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
19442 Op->getOperand(0), Op->getOperand(1),
19443 CsetOp.getOperand(3));
19444}
19445
19446// (ADC x 0 cond) => (CINC x HS cond)
19448 SDValue LHS = N->getOperand(0);
19449 SDValue RHS = N->getOperand(1);
19450 SDValue Cond = N->getOperand(2);
19451
19452 if (!isNullConstant(RHS))
19453 return SDValue();
19454
19455 EVT VT = N->getValueType(0);
19456 SDLoc DL(N);
19457
19458 // (CINC x cc cond) <=> (CSINC x x !cc cond)
19459 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
19460 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
19461}
19462
19463// Transform vector add(zext i8 to i32, zext i8 to i32)
19464// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19465// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
19466// extends.
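// For example (illustrative), for two v16i8 values zero-extended to v16i32:
//   add(zext v16i8 a to v16i32, zext v16i8 b to v16i32)
// becomes
//   sext(add(zext a to v16i16, zext b to v16i16) to v16i32)
// so the i8 -> i16 add can use uaddl/uaddl2 and only one widening of the
// result remains.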
19468 EVT VT = N->getValueType(0);
19469 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19470 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
19471 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
19472 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
19473 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
19474 N->getOperand(0).getOperand(0).getValueType() !=
19475 N->getOperand(1).getOperand(0).getValueType())
19476 return SDValue();
19477
19478 SDValue N0 = N->getOperand(0).getOperand(0);
19479 SDValue N1 = N->getOperand(1).getOperand(0);
19480 EVT InVT = N0.getValueType();
19481
19482 EVT S1 = InVT.getScalarType();
19483 EVT S2 = VT.getScalarType();
19484 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19485 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19486 SDLoc DL(N);
19487 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19490 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19491 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19492 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
19493 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
19494 }
19495 return SDValue();
19496}
19497
19500 SelectionDAG &DAG) {
19501 SDLoc DL(N);
19502 EVT VT = N->getValueType(0);
19503
19504 if (VT == MVT::v4f16 || VT == MVT::v4bf16) {
19505 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
19506 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
19507 if (Elt0->getOpcode() == ISD::FP_ROUND &&
19508 Elt1->getOpcode() == ISD::FP_ROUND &&
19509 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19510 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19511 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
19513 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19514 // Constant index.
19515 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19516 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19517 Elt0->getOperand(0)->getOperand(0) ==
19518 Elt1->getOperand(0)->getOperand(0) &&
19519 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
19520 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
19521 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
19522 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
19523 SDValue HighLanes;
19524 if (Elt2->getOpcode() == ISD::UNDEF &&
19525 Elt3->getOpcode() == ISD::UNDEF) {
19526 HighLanes = DAG.getUNDEF(MVT::v2f32);
19527 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
19528 Elt3->getOpcode() == ISD::FP_ROUND &&
19529 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
19530 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
19531 Elt2->getConstantOperandVal(1) ==
19532 Elt3->getConstantOperandVal(1) &&
19533 Elt2->getOperand(0)->getOpcode() ==
19535 Elt3->getOperand(0)->getOpcode() ==
19537 // Constant index.
19538 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
19539 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
19540 Elt2->getOperand(0)->getOperand(0) ==
19541 Elt3->getOperand(0)->getOperand(0) &&
19542 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
19543 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
19544 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
19545 HighLanes =
19546 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
19547 }
19548 if (HighLanes) {
19549 SDValue DoubleToSingleSticky =
19550 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
19551 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
19552 DoubleToSingleSticky, HighLanes);
19553 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
19554 Elt0->getOperand(1));
19555 }
19556 }
19557 }
19558 }
19559
19560 if (VT == MVT::v2f64) {
19561 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19562 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
19563 Elt1->getOpcode() == ISD::FP_EXTEND &&
19565 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19566 Elt0->getOperand(0)->getOperand(0) ==
19567 Elt1->getOperand(0)->getOperand(0) &&
19568 // Constant index.
19569 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19570 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19571 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
19572 Elt1->getOperand(0)->getConstantOperandVal(1) &&
19573 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19574 // ResultType's known minimum vector length.
19575 Elt0->getOperand(0)->getConstantOperandVal(1) %
19577 0) {
19578 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
19579 if (SrcVec.getValueType() == MVT::v4f16 ||
19580 SrcVec.getValueType() == MVT::v4bf16) {
19581 SDValue HalfToSingle =
19582 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
19583 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
19584 SDValue Extract = DAG.getNode(
19586 HalfToSingle, SubvectorIdx);
19587 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
19588 }
19589 }
19590 }
19591
19592 // A build vector of two extracted elements is equivalent to an
19593 // extract subvector where the inner vector is any-extended to the
19594 // extract_vector_elt VT.
19595 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
19596 // (extract_elt_iXX_to_i32 vec Idx+1))
19597 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
19598
19599 // For now, only consider the v2i32 case, which arises as a result of
19600 // legalization.
19601 if (VT != MVT::v2i32)
19602 return SDValue();
19603
19604 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19605 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
19606 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19607 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19608 // Constant index.
19609 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19610 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19611 // Both EXTRACT_VECTOR_ELT from same vector...
19612 Elt0->getOperand(0) == Elt1->getOperand(0) &&
19613 // ... and contiguous. First element's index +1 == second element's index.
19614 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
19615 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19616 // ResultType's known minimum vector length.
19617 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
19618 SDValue VecToExtend = Elt0->getOperand(0);
19619 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
19620 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
19621 return SDValue();
19622
19623 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
19624
19625 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
19626 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
19627 SubvectorIdx);
19628 }
19629
19630 return SDValue();
19631}
19632
19634 SelectionDAG &DAG) {
19635 EVT VT = N->getValueType(0);
19636 SDValue N0 = N->getOperand(0);
19637 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
19638 N0.getOpcode() == AArch64ISD::DUP) {
19639 SDValue Op = N0.getOperand(0);
19640 if (VT.getScalarType() == MVT::i32 &&
19641 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
19642 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
19643 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
19644 }
19645
19646 return SDValue();
19647}
19648
19649// Check whether a node is an extend or shift operand.
19651 unsigned Opcode = N.getOpcode();
19652 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
19653 EVT SrcVT;
19654 if (Opcode == ISD::SIGN_EXTEND_INREG)
19655 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
19656 else
19657 SrcVT = N.getOperand(0).getValueType();
19658
19659 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
19660 } else if (Opcode == ISD::AND) {
19661 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
19662 if (!CSD)
19663 return false;
19664 uint64_t AndMask = CSD->getZExtValue();
19665 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
19666 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
19667 return isa<ConstantSDNode>(N.getOperand(1));
19668 }
19669
19670 return false;
19671}
19672
19673// (N - Y) + Z --> (Z - Y) + N
19674// when N is an extend or shift operand
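// For example (a rough sketch; register choices are illustrative):
//   ((sxtw x1) - y) + z   ==>   (z - y) + (sxtw x1)
// which keeps the extend folded into the final add:
//   sub x8, x2, x0
//   add x0, x8, w1, sxtw
// instead of sign-extending w1 into a register first.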
19676 SelectionDAG &DAG) {
19677 auto IsOneUseExtend = [](SDValue N) {
19678 return N.hasOneUse() && isExtendOrShiftOperand(N);
19679 };
19680
19681 // DAGCombiner will revert the combination when Z is a constant, causing an
19682 // infinite loop, so don't enable the combination when Z is a constant.
19683 // If Z is a one-use extend or shift, we can't do the optimization either,
19684 // as it would likewise fall into an infinite loop.
19685 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
19686 return SDValue();
19687
19688 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
19689 return SDValue();
19690
19691 SDValue Shift = SUB.getOperand(0);
19692 if (!IsOneUseExtend(Shift))
19693 return SDValue();
19694
19695 SDLoc DL(N);
19696 EVT VT = N->getValueType(0);
19697
19698 SDValue Y = SUB.getOperand(1);
19699 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
19700 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
19701}
19702
19704 SelectionDAG &DAG) {
19705 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
19706 // commutative.
19707 if (N->getOpcode() != ISD::ADD)
19708 return SDValue();
19709
19710 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
19711 // shifted register is only available for i32 and i64.
19712 EVT VT = N->getValueType(0);
19713 if (VT != MVT::i32 && VT != MVT::i64)
19714 return SDValue();
19715
19716 SDLoc DL(N);
19717 SDValue LHS = N->getOperand(0);
19718 SDValue RHS = N->getOperand(1);
19719
19720 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
19721 return Val;
19722 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
19723 return Val;
19724
19725 uint64_t LHSImm = 0, RHSImm = 0;
19726 // If both operands are shifted by an immediate and the shift amount is not
19727 // greater than 4 for one operand, swap LHS and RHS to put the operand with
19728 // the smaller shift amount on the RHS.
19729 //
19730 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
19731 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
19732 // with LSL (shift > 4). For the rest of the processors, this is a no-op for
19733 // performance or correctness.
19734 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
19735 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
19736 RHSImm > 4 && LHS.hasOneUse())
19737 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
19738
19739 return SDValue();
19740}
19741
19742// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
19743// This reassociates it back to allow the creation of more mls instructions.
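// For example (a rough sketch; register choices are illustrative):
//   sub(x, add(mul(a, b), mul(c, d)))  ==>  sub(sub(x, mul(a, b)), mul(c, d))
// which can then select to two multiply-subtract instructions:
//   mls v0.4s, v1.4s, v2.4s
//   mls v0.4s, v3.4s, v4.4s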
19745 if (N->getOpcode() != ISD::SUB)
19746 return SDValue();
19747
19748 SDValue Add = N->getOperand(1);
19749 SDValue X = N->getOperand(0);
19750 if (Add.getOpcode() != ISD::ADD)
19751 return SDValue();
19752
19753 if (!Add.hasOneUse())
19754 return SDValue();
19756 return SDValue();
19757
19758 SDValue M1 = Add.getOperand(0);
19759 SDValue M2 = Add.getOperand(1);
19760 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
19761 M1.getOpcode() != AArch64ISD::UMULL)
19762 return SDValue();
19763 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
19765 return SDValue();
19766
19767 EVT VT = N->getValueType(0);
19768 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
19769 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
19770}
19771
19772// Combine into mla/mls.
19773// This works on the patterns of:
19774// add v1, (mul v2, v3)
19775// sub v1, (mul v2, v3)
19776// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
19777// It will transform the add/sub to a scalable version, so that we can
19778// make use of SVE's MLA/MLS that will be generated for that pattern
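// For example (a rough sketch; register and predicate choices are
// illustrative), an add whose operand is the low fixed-length part of an SVE
// predicated multiply can become a predicated multiply-accumulate:
//   mla z0.d, p0/m, z1.d, z2.d
// instead of a separate mul followed by an add on fixed-length vectors.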
19779static SDValue
19781 SelectionDAG &DAG = DCI.DAG;
19782 // Make sure that the types are legal
19783 if (!DCI.isAfterLegalizeDAG())
19784 return SDValue();
19785 // Before using SVE's features, check first if it's available.
19786 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
19787 return SDValue();
19788
19789 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
19790 return SDValue();
19791
19792 if (!N->getValueType(0).isFixedLengthVector())
19793 return SDValue();
19794
19795 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
19796 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19797 return SDValue();
19798
19799 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
19800 return SDValue();
19801
19802 SDValue MulValue = Op1->getOperand(0);
19803 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
19804 return SDValue();
19805
19806 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
19807 return SDValue();
19808
19809 EVT ScalableVT = MulValue.getValueType();
19810 if (!ScalableVT.isScalableVector())
19811 return SDValue();
19812
19813 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
19814 SDValue NewValue =
19815 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
19816 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
19817 };
19818
19819 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
19820 return res;
19821 else if (N->getOpcode() == ISD::ADD)
19822 return performOpt(N->getOperand(1), N->getOperand(0));
19823
19824 return SDValue();
19825}
19826
19827 // Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
19828// help, for example, to produce ssra from sshr+add.
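// Hedged example of the intent (registers and the shift amount are illustrative):
//   add(extract_vector_elt(v1i64 (sshr X, #3), 0), i64 load(p))
// is instead performed as a v1i64 vector add, so the sshr+add pair can later be
// selected as a single accumulating shift such as "ssra d0, d1, #3".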
19829 static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
19830   EVT VT = N->getValueType(0);
19831 if (VT != MVT::i64)
19832 return SDValue();
19833 SDValue Op0 = N->getOperand(0);
19834 SDValue Op1 = N->getOperand(1);
19835
19836 // At least one of the operands should be an extract, and the other should be
19837 // something that is easy to convert to v1i64 type (in this case a load).
19838 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19839 Op0.getOpcode() != ISD::LOAD)
19840 return SDValue();
19841 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19842 Op1.getOpcode() != ISD::LOAD)
19843 return SDValue();
19844
19845 SDLoc DL(N);
19846 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19847 Op0.getOperand(0).getValueType() == MVT::v1i64) {
19848 Op0 = Op0.getOperand(0);
19849 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
19850 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19851 Op1.getOperand(0).getValueType() == MVT::v1i64) {
19852 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
19853 Op1 = Op1.getOperand(0);
19854 } else
19855 return SDValue();
19856
19857 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
19858 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
19859 DAG.getConstant(0, DL, MVT::i64));
19860}
19861
19862 static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
19863   SDValue BV = peekThroughOneUseBitcasts(B);
19864   if (!BV->hasOneUse())
19865 return false;
19866 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
19867 if (!Ld || !Ld->isSimple())
19868 return false;
19869 Loads.push_back(Ld);
19870 return true;
19871 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
19872              BV.getOpcode() == ISD::CONCAT_VECTORS) {
19873     for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
19874 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
19875 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
19876 return false;
19877 Loads.push_back(Ld);
19878 }
19879 return true;
19880 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
19881 // Try to find a tree of shuffles and concats from how IR shuffles of loads
19882 // are lowered. Note that this only comes up because we do not always visit
19883 // operands before uses. After that is fixed this can be removed and in the
19884 // meantime this is fairly specific to the lowering we expect from IR.
19885 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
19886 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
19887 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
19888 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
19889 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
19890 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
19891 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
19892 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
19893 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
19894 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
19895 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
19896 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19897 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19898 B.getOperand(1).getNumOperands() != 4)
19899 return false;
19900 auto SV1 = cast<ShuffleVectorSDNode>(B);
19901 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
19902 int NumElts = B.getValueType().getVectorNumElements();
19903 int NumSubElts = NumElts / 4;
19904 for (int I = 0; I < NumSubElts; I++) {
19905 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
19906 if (SV1->getMaskElt(I) != I ||
19907 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19908 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
19909 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
19910 return false;
19911 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
19912 if (SV2->getMaskElt(I) != I ||
19913 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19914 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
19915 return false;
19916 }
19917 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
19918 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
19919 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
19920 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
19921 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
19922 !Ld2->isSimple() || !Ld3->isSimple())
19923 return false;
19924 Loads.push_back(Ld0);
19925 Loads.push_back(Ld1);
19926 Loads.push_back(Ld2);
19927 Loads.push_back(Ld3);
19928 return true;
19929 }
19930 return false;
19931}
19932
19933 static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
19934                                             SelectionDAG &DAG,
19935 unsigned &NumSubLoads) {
19936 if (!Op0.hasOneUse() || !Op1.hasOneUse())
19937 return false;
19938
19939 SmallVector<LoadSDNode *> Loads0, Loads1;
19940 if (isLoadOrMultipleLoads(Op0, Loads0) &&
19941 isLoadOrMultipleLoads(Op1, Loads1)) {
19942 if (NumSubLoads && Loads0.size() != NumSubLoads)
19943 return false;
19944 NumSubLoads = Loads0.size();
19945 return Loads0.size() == Loads1.size() &&
19946 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
19947 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
19948 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
19949 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
19950 Size / 8, 1);
19951 });
19952 }
19953
19954 if (Op0.getOpcode() != Op1.getOpcode())
19955 return false;
19956
19957 switch (Op0.getOpcode()) {
19958 case ISD::ADD:
19959 case ISD::SUB:
19960     return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
19961                                            DAG, NumSubLoads) &&
19962            areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
19963                                            DAG, NumSubLoads);
19964 case ISD::SIGN_EXTEND:
19965 case ISD::ANY_EXTEND:
19966 case ISD::ZERO_EXTEND:
19967 EVT XVT = Op0.getOperand(0).getValueType();
19968 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
19969 XVT.getScalarSizeInBits() != 32)
19970 return false;
19971     return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
19972                                            DAG, NumSubLoads);
19973 }
19974 return false;
19975}
19976
19977// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4))
19978 // into a single load of twice the size, from which we extract the bottom and
19979 // top parts so that the shl can use a shll2 instruction. The two loads in that
19980// example can also be larger trees of instructions, which are identical except
19981// for the leaves which are all loads offset from the LHS, including
19982// buildvectors of multiple loads. For example the RHS tree could be
19983// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))
19984// Whilst it can be common for the larger loads to replace LDP instructions
19985 // (which doesn't gain anything on its own), the larger loads can help create
19986// more efficient code, and in buildvectors prevent the need for ld1 lane
19987// inserts which can be slower than normal loads.
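// A small assumed example of the shape being targeted (pointers and element
// types are illustrative, not from a specific test):
//   add(zext(v8i8 load p), shl(zext(v8i8 load p+8), splat(8)))
// becomes one v16i8 load starting at p, whose low and high halves are extracted
// and extended, letting the shifted half use ushll2 and removing the second
// load.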
19988 static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
19989   EVT VT = N->getValueType(0);
19990 if (!VT.isFixedLengthVector() ||
19991 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
19992 VT.getScalarSizeInBits() != 64))
19993 return SDValue();
19994
19995 SDValue Other = N->getOperand(0);
19996 SDValue Shift = N->getOperand(1);
19997 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
19998 std::swap(Shift, Other);
19999 APInt ShiftAmt;
20000 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
20001 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
20002 return SDValue();
20003
20004 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
20005 !ISD::isExtOpcode(Other.getOpcode()) ||
20006 Shift.getOperand(0).getOperand(0).getValueType() !=
20007 Other.getOperand(0).getValueType() ||
20008 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
20009 return SDValue();
20010
20011 SDValue Op0 = Other.getOperand(0);
20012 SDValue Op1 = Shift.getOperand(0).getOperand(0);
20013
20014 unsigned NumSubLoads = 0;
20015 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
20016 return SDValue();
20017
20018 // Attempt to rule out some unprofitable cases using heuristics (some working
20019   // around suboptimal code generation), notably if the extend would not be able
20020   // to use ushll2 instructions as the types are not large enough. Otherwise zips
20021   // will need to be created, which can increase the instruction count.
20022 unsigned NumElts = Op0.getValueType().getVectorNumElements();
20023 unsigned NumSubElts = NumElts / NumSubLoads;
20024 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
20025 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
20026 Op0.getValueType().getSizeInBits() < 128 &&
20027        !DAG.getTargetLoweringInfo().isTypeLegal(Op0.getValueType())))
20028     return SDValue();
20029
20030 // Recreate the tree with the new combined loads.
20031 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
20032 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
20033 EVT DVT =
20034             Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());
20035 
20036 SmallVector<LoadSDNode *> Loads0, Loads1;
20037 if (isLoadOrMultipleLoads(Op0, Loads0) &&
20038 isLoadOrMultipleLoads(Op1, Loads1)) {
20039 EVT LoadVT = EVT::getVectorVT(
20040 *DAG.getContext(), Op0.getValueType().getScalarType(),
20041 Op0.getValueType().getVectorNumElements() / Loads0.size());
20042 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
20043
20044 SmallVector<SDValue> NewLoads;
20045 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
20046 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
20047 L0->getBasePtr(), L0->getPointerInfo(),
20048 L0->getOriginalAlign());
20049 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
20050 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
20051 NewLoads.push_back(Load);
20052 }
20053 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
20054 }
20055
20056     SmallVector<SDValue> Ops;
20057     for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
20058 Ops.push_back(GenCombinedTree(O0, O1, DAG));
20059 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
20060 };
20061 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
20062
20063 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
20064 int Hi = NumSubElts, Lo = 0;
20065 for (unsigned i = 0; i < NumSubLoads; i++) {
20066 for (unsigned j = 0; j < NumSubElts; j++) {
20067 LowMask[i * NumSubElts + j] = Lo++;
20068 HighMask[i * NumSubElts + j] = Hi++;
20069 }
20070 Lo += NumSubElts;
20071 Hi += NumSubElts;
20072 }
20073 SDLoc DL(N);
20074 SDValue Ext0, Ext1;
20075   // Extract the top and bottom lanes, then extend the result. Alternatively,
20076   // extend the result and then extract the lanes if the two operands match, as
20077   // that produces slightly smaller code.
20078 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
20079     SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
20080                                NewOp, DAG.getConstant(0, DL, MVT::i64));
20081 SDValue SubH =
20082 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
20083 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20084 SDValue Extr0 =
20085 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
20086 SDValue Extr1 =
20087 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
20088 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
20089 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
20090 } else {
20091     EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
20092     SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
20093 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20094 DAG.getConstant(0, DL, MVT::i64));
20095 SDValue SubH =
20096 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20097 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20098 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
20099 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
20100 }
20101 SDValue NShift =
20102 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
20103 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
20104}
20105
20106 static SDValue performAddSubCombine(SDNode *N,
20107                                     TargetLowering::DAGCombinerInfo &DCI) {
20108   // Try to change sum of two reductions.
20109 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
20110 return Val;
20111 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
20112 return Val;
20113 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
20114 return Val;
20115 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
20116 return Val;
20117   if (SDValue Val = performVectorAddSubExtCombine(N, DCI.DAG))
20118     return Val;
20119   if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
20120     return Val;
20121 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
20122 return Val;
20123 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
20124 return Val;
20125 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
20126 return Val;
20127
20128 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
20129 return Val;
20130
20131 return performAddSubLongCombine(N, DCI);
20132}
20133
20134// Massage DAGs which we can use the high-half "long" operations on into
20135// something isel will recognize better. E.g.
20136//
20137// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
20138// (aarch64_neon_umull (extract_high (v2i64 vec)))
20139// (extract_high (v2i64 (dup128 scalar)))))
20140//
20141 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
20142                                        TargetLowering::DAGCombinerInfo &DCI,
20143                                        SelectionDAG &DAG) {
20144 if (DCI.isBeforeLegalizeOps())
20145 return SDValue();
20146
20147 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
20148 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
20149 assert(LHS.getValueType().is64BitVector() &&
20150 RHS.getValueType().is64BitVector() &&
20151 "unexpected shape for long operation");
20152
20153 // Either node could be a DUP, but it's not worth doing both of them (you'd
20154 // just as well use the non-high version) so look for a corresponding extract
20155 // operation on the other "wing".
20156   if (isEssentiallyExtractHighSubvector(LHS)) {
20157     RHS = tryExtendDUPToExtractHigh(RHS, DAG);
20158     if (!RHS.getNode())
20159 return SDValue();
20160   } else if (isEssentiallyExtractHighSubvector(RHS)) {
20161     LHS = tryExtendDUPToExtractHigh(LHS, DAG);
20162     if (!LHS.getNode())
20163 return SDValue();
20164 } else
20165 return SDValue();
20166
20167 if (IID == Intrinsic::not_intrinsic)
20168 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
20169
20170 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
20171 N->getOperand(0), LHS, RHS);
20172}
20173
20174static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
20175 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
20176 unsigned ElemBits = ElemTy.getSizeInBits();
20177
20178 int64_t ShiftAmount;
20179 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
20180 APInt SplatValue, SplatUndef;
20181 unsigned SplatBitSize;
20182 bool HasAnyUndefs;
20183 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
20184 HasAnyUndefs, ElemBits) ||
20185 SplatBitSize != ElemBits)
20186 return SDValue();
20187
20188 ShiftAmount = SplatValue.getSExtValue();
20189 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
20190 ShiftAmount = CVN->getSExtValue();
20191 } else
20192 return SDValue();
20193
20194 // If the shift amount is zero, remove the shift intrinsic.
20195 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
20196 return N->getOperand(1);
20197
20198 unsigned Opcode;
20199 bool IsRightShift;
20200 switch (IID) {
20201 default:
20202 llvm_unreachable("Unknown shift intrinsic");
20203 case Intrinsic::aarch64_neon_sqshl:
20204 Opcode = AArch64ISD::SQSHL_I;
20205 IsRightShift = false;
20206 break;
20207 case Intrinsic::aarch64_neon_uqshl:
20208 Opcode = AArch64ISD::UQSHL_I;
20209 IsRightShift = false;
20210 break;
20211 case Intrinsic::aarch64_neon_srshl:
20212 Opcode = AArch64ISD::SRSHR_I;
20213 IsRightShift = true;
20214 break;
20215 case Intrinsic::aarch64_neon_urshl:
20216 Opcode = AArch64ISD::URSHR_I;
20217 IsRightShift = true;
20218 break;
20219 case Intrinsic::aarch64_neon_sqshlu:
20220 Opcode = AArch64ISD::SQSHLU_I;
20221 IsRightShift = false;
20222 break;
20223 case Intrinsic::aarch64_neon_sshl:
20224 case Intrinsic::aarch64_neon_ushl:
20225 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
20226 // left shift for positive shift amounts. For negative shifts we can use a
20227     // VASHR/VLSHR as appropriate.
20228 if (ShiftAmount < 0) {
20229 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
20230                                                    : AArch64ISD::VLSHR;
20231       ShiftAmount = -ShiftAmount;
20232 } else
20233 Opcode = AArch64ISD::VSHL;
20234 IsRightShift = false;
20235 break;
20236 }
20237
20238 EVT VT = N->getValueType(0);
20239 SDValue Op = N->getOperand(1);
20240 SDLoc dl(N);
20241 if (VT == MVT::i64) {
20242 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
20243 VT = MVT::v1i64;
20244 }
20245
20246 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
20247 Op = DAG.getNode(Opcode, dl, VT, Op,
20248 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
20249 if (N->getValueType(0) == MVT::i64)
20250 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20251 DAG.getConstant(0, dl, MVT::i64));
20252 return Op;
20253 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
20254 Op = DAG.getNode(Opcode, dl, VT, Op,
20255 DAG.getConstant(ShiftAmount, dl, MVT::i32));
20256 if (N->getValueType(0) == MVT::i64)
20257 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20258 DAG.getConstant(0, dl, MVT::i64));
20259 return Op;
20260 }
20261
20262 return SDValue();
20263}
20264
20265// The CRC32[BH] instructions ignore the high bits of their data operand. Since
20266// the intrinsics must be legal and take an i32, this means there's almost
20267// certainly going to be a zext in the DAG which we can eliminate.
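// Minimal sketch (operand names illustrative): for crc32b the data operand only
// uses its low 8 bits, so
//   llvm.aarch64.crc32b(crc, and(w, 0xff))  -->  llvm.aarch64.crc32b(crc, w)
// and the masking zero-extension is dropped.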
20268static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
20269 SDValue AndN = N->getOperand(2);
20270 if (AndN.getOpcode() != ISD::AND)
20271 return SDValue();
20272
20273 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
20274 if (!CMask || CMask->getZExtValue() != Mask)
20275 return SDValue();
20276
20277 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
20278 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
20279}
20280
20281 static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
20282                                            SelectionDAG &DAG) {
20283 SDLoc dl(N);
20284 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
20285 DAG.getNode(Opc, dl,
20286 N->getOperand(1).getSimpleValueType(),
20287 N->getOperand(1)),
20288 DAG.getConstant(0, dl, MVT::i64));
20289}
20290
20291 static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
20292   SDLoc DL(N);
20293 SDValue Op1 = N->getOperand(1);
20294 SDValue Op2 = N->getOperand(2);
20295 EVT ScalarTy = Op2.getValueType();
20296 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20297 ScalarTy = MVT::i32;
20298
20299   // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
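  // For intuition (values illustrative): index_vector(2, 3) produces
  //   <2, 5, 8, 11, ...> == splat(2) + <0, 1, 2, 3, ...> * splat(3).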
20300 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
20301 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
20302 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
20303 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
20304 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
20305}
20306
20307 static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
20308   SDLoc dl(N);
20309 SDValue Scalar = N->getOperand(3);
20310 EVT ScalarTy = Scalar.getValueType();
20311
20312 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20313 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
20314
20315 SDValue Passthru = N->getOperand(1);
20316 SDValue Pred = N->getOperand(2);
20317 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
20318 Pred, Scalar, Passthru);
20319}
20320
20321 static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
20322   SDLoc dl(N);
20323 LLVMContext &Ctx = *DAG.getContext();
20324 EVT VT = N->getValueType(0);
20325
20326 assert(VT.isScalableVector() && "Expected a scalable vector.");
20327
20328 // Current lowering only supports the SVE-ACLE types.
20329   if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
20330     return SDValue();
20331
20332 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
20333 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
20334 EVT ByteVT =
20335 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
20336
20337   // Convert everything to the domain of EXT (i.e. bytes).
20338 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
20339 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
20340 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
20341 DAG.getConstant(ElemSize, dl, MVT::i32));
20342
20343 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
20344 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
20345}
20346
20347 static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
20348                                         TargetLowering::DAGCombinerInfo &DCI,
20349                                         SelectionDAG &DAG) {
20350 if (DCI.isBeforeLegalize())
20351 return SDValue();
20352
20353 SDValue Comparator = N->getOperand(3);
20354 if (Comparator.getOpcode() == AArch64ISD::DUP ||
20355 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
20356 unsigned IID = getIntrinsicID(N);
20357 EVT VT = N->getValueType(0);
20358 EVT CmpVT = N->getOperand(2).getValueType();
20359 SDValue Pred = N->getOperand(1);
20360 SDValue Imm;
20361 SDLoc DL(N);
20362
20363 switch (IID) {
20364 default:
20365 llvm_unreachable("Called with wrong intrinsic!");
20366 break;
20367
20368 // Signed comparisons
20369 case Intrinsic::aarch64_sve_cmpeq_wide:
20370 case Intrinsic::aarch64_sve_cmpne_wide:
20371 case Intrinsic::aarch64_sve_cmpge_wide:
20372 case Intrinsic::aarch64_sve_cmpgt_wide:
20373 case Intrinsic::aarch64_sve_cmplt_wide:
20374 case Intrinsic::aarch64_sve_cmple_wide: {
20375 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20376 int64_t ImmVal = CN->getSExtValue();
20377 if (ImmVal >= -16 && ImmVal <= 15)
20378 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20379 else
20380 return SDValue();
20381 }
20382 break;
20383 }
20384 // Unsigned comparisons
20385 case Intrinsic::aarch64_sve_cmphs_wide:
20386 case Intrinsic::aarch64_sve_cmphi_wide:
20387 case Intrinsic::aarch64_sve_cmplo_wide:
20388 case Intrinsic::aarch64_sve_cmpls_wide: {
20389 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20390 uint64_t ImmVal = CN->getZExtValue();
20391 if (ImmVal <= 127)
20392 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20393 else
20394 return SDValue();
20395 }
20396 break;
20397 }
20398 }
20399
20400 if (!Imm)
20401 return SDValue();
20402
20403 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
20404 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
20405 N->getOperand(2), Splat, DAG.getCondCode(CC));
20406 }
20407
20408 return SDValue();
20409}
20410
20411 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20412                         AArch64CC::CondCode Cond) {
20413   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20414
20415 SDLoc DL(Op);
20416 assert(Op.getValueType().isScalableVector() &&
20417 TLI.isTypeLegal(Op.getValueType()) &&
20418 "Expected legal scalable vector type!");
20419 assert(Op.getValueType() == Pg.getValueType() &&
20420 "Expected same type for PTEST operands");
20421
20422 // Ensure target specific opcodes are using legal type.
20423 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
20424 SDValue TVal = DAG.getConstant(1, DL, OutVT);
20425 SDValue FVal = DAG.getConstant(0, DL, OutVT);
20426
20427 // Ensure operands have type nxv16i1.
20428 if (Op.getValueType() != MVT::nxv16i1) {
20431 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
20432 else
20433 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
20434 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
20435 }
20436
20437 // Set condition code (CC) flags.
20438 SDValue Test = DAG.getNode(
20439       Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
20440       DL, MVT::Other, Pg, Op);
20441
20442 // Convert CC to integer based on requested condition.
20443 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
20444 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
20445 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
20446 return DAG.getZExtOrTrunc(Res, DL, VT);
20447}
20448
20449 static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
20450                                       SelectionDAG &DAG) {
20451 SDLoc DL(N);
20452
20453 SDValue Pred = N->getOperand(1);
20454 SDValue VecToReduce = N->getOperand(2);
20455
20456 // NOTE: The integer reduction's result type is not always linked to the
20457 // operand's element type so we construct it from the intrinsic's result type.
20458 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
20459 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20460
20461 // SVE reductions set the whole vector register with the first element
20462 // containing the reduction result, which we'll now extract.
20463 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20464 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20465 Zero);
20466}
20467
20468 static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
20469                                      SelectionDAG &DAG) {
20470 SDLoc DL(N);
20471
20472 SDValue Pred = N->getOperand(1);
20473 SDValue VecToReduce = N->getOperand(2);
20474
20475 EVT ReduceVT = VecToReduce.getValueType();
20476 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20477
20478 // SVE reductions set the whole vector register with the first element
20479 // containing the reduction result, which we'll now extract.
20480 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20481 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20482 Zero);
20483}
20484
20485 static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
20486                                             SelectionDAG &DAG) {
20487 SDLoc DL(N);
20488
20489 SDValue Pred = N->getOperand(1);
20490 SDValue InitVal = N->getOperand(2);
20491 SDValue VecToReduce = N->getOperand(3);
20492 EVT ReduceVT = VecToReduce.getValueType();
20493
20494 // Ordered reductions use the first lane of the result vector as the
20495 // reduction's initial value.
20496 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20497 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
20498 DAG.getUNDEF(ReduceVT), InitVal, Zero);
20499
20500 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
20501
20502 // SVE reductions set the whole vector register with the first element
20503 // containing the reduction result, which we'll now extract.
20504 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20505 Zero);
20506}
20507
20508// If a merged operation has no inactive lanes we can relax it to a predicated
20509// or unpredicated operation, which potentially allows better isel (perhaps
20510// using immediate forms) or relaxing register reuse requirements.
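// Hedged sketch (intrinsics and operands chosen for illustration): with an
// all-active governing predicate, e.g.
//   aarch64_sve_sqadd(ptrue, x, y)  -->  saddsat(x, y)
//   aarch64_sve_subr(ptrue, x, y)   -->  sub(y, x)
// since no lane can keep its original (inactive-lane) value.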
20511 static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
20512                                        SelectionDAG &DAG, bool UnpredOp = false,
20513 bool SwapOperands = false) {
20514 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
20515 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
20516 SDValue Pg = N->getOperand(1);
20517 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
20518 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
20519
20520 // ISD way to specify an all active predicate.
20521 if (isAllActivePredicate(DAG, Pg)) {
20522 if (UnpredOp)
20523 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
20524
20525 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
20526 }
20527
20528 // FUTURE: SplatVector(true)
20529 return SDValue();
20530}
20531
20532 static SDValue tryCombineWhileLo(SDNode *N,
20533                                  TargetLowering::DAGCombinerInfo &DCI,
20534                                  const AArch64Subtarget *Subtarget) {
20535 if (DCI.isBeforeLegalize())
20536 return SDValue();
20537
20538 if (!Subtarget->hasSVE2p1())
20539 return SDValue();
20540
20541 if (!N->hasNUsesOfValue(2, 0))
20542 return SDValue();
20543
20544 const uint64_t HalfSize = N->getValueType(0).getVectorMinNumElements() / 2;
20545 if (HalfSize < 2)
20546 return SDValue();
20547
20548 auto It = N->use_begin();
20549 SDNode *Lo = *It++;
20550 SDNode *Hi = *It;
20551
20552 if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
20553 Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR)
20554 return SDValue();
20555
20556 uint64_t OffLo = Lo->getConstantOperandVal(1);
20557 uint64_t OffHi = Hi->getConstantOperandVal(1);
20558
20559 if (OffLo > OffHi) {
20560 std::swap(Lo, Hi);
20561 std::swap(OffLo, OffHi);
20562 }
20563
20564 if (OffLo != 0 || OffHi != HalfSize)
20565 return SDValue();
20566
20567 EVT HalfVec = Lo->getValueType(0);
20568 if (HalfVec != Hi->getValueType(0) ||
20569 HalfVec.getVectorElementCount() != ElementCount::getScalable(HalfSize))
20570 return SDValue();
20571
20572 SelectionDAG &DAG = DCI.DAG;
20573 SDLoc DL(N);
20574 SDValue ID =
20575 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
20576 SDValue Idx = N->getOperand(1);
20577 SDValue TC = N->getOperand(2);
20578 if (Idx.getValueType() != MVT::i64) {
20579 Idx = DAG.getZExtOrTrunc(Idx, DL, MVT::i64);
20580 TC = DAG.getZExtOrTrunc(TC, DL, MVT::i64);
20581 }
20582 auto R =
20583       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
20584                   {Lo->getValueType(0), Hi->getValueType(0)}, {ID, Idx, TC});
20585
20586 DCI.CombineTo(Lo, R.getValue(0));
20587 DCI.CombineTo(Hi, R.getValue(1));
20588
20589 return SDValue(N, 0);
20590}
20591
20592 static SDValue performIntrinsicCombine(SDNode *N,
20593                                        TargetLowering::DAGCombinerInfo &DCI,
20594                                        const AArch64Subtarget *Subtarget) {
20595 SelectionDAG &DAG = DCI.DAG;
20596 unsigned IID = getIntrinsicID(N);
20597 switch (IID) {
20598 default:
20599 break;
20600 case Intrinsic::aarch64_neon_vcvtfxs2fp:
20601 case Intrinsic::aarch64_neon_vcvtfxu2fp:
20602 return tryCombineFixedPointConvert(N, DCI, DAG);
20603   case Intrinsic::aarch64_neon_saddv:
20604     return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
20605   case Intrinsic::aarch64_neon_uaddv:
20606     return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
20607   case Intrinsic::aarch64_neon_sminv:
20608     return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
20609   case Intrinsic::aarch64_neon_uminv:
20610     return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
20611   case Intrinsic::aarch64_neon_smaxv:
20612     return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
20613   case Intrinsic::aarch64_neon_umaxv:
20614     return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
20615 case Intrinsic::aarch64_neon_fmax:
20616 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
20617 N->getOperand(1), N->getOperand(2));
20618 case Intrinsic::aarch64_neon_fmin:
20619 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
20620 N->getOperand(1), N->getOperand(2));
20621 case Intrinsic::aarch64_neon_fmaxnm:
20622 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
20623 N->getOperand(1), N->getOperand(2));
20624 case Intrinsic::aarch64_neon_fminnm:
20625 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
20626 N->getOperand(1), N->getOperand(2));
20627 case Intrinsic::aarch64_neon_smull:
20628 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
20629 N->getOperand(1), N->getOperand(2));
20630 case Intrinsic::aarch64_neon_umull:
20631 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
20632 N->getOperand(1), N->getOperand(2));
20633 case Intrinsic::aarch64_neon_pmull:
20634 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
20635 N->getOperand(1), N->getOperand(2));
20636 case Intrinsic::aarch64_neon_sqdmull:
20637 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
20638 case Intrinsic::aarch64_neon_sqshl:
20639 case Intrinsic::aarch64_neon_uqshl:
20640 case Intrinsic::aarch64_neon_sqshlu:
20641 case Intrinsic::aarch64_neon_srshl:
20642 case Intrinsic::aarch64_neon_urshl:
20643 case Intrinsic::aarch64_neon_sshl:
20644 case Intrinsic::aarch64_neon_ushl:
20645 return tryCombineShiftImm(IID, N, DAG);
20646 case Intrinsic::aarch64_neon_sabd:
20647 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20648 N->getOperand(1), N->getOperand(2));
20649 case Intrinsic::aarch64_neon_uabd:
20650 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20651 N->getOperand(1), N->getOperand(2));
20652 case Intrinsic::aarch64_crc32b:
20653 case Intrinsic::aarch64_crc32cb:
20654 return tryCombineCRC32(0xff, N, DAG);
20655 case Intrinsic::aarch64_crc32h:
20656 case Intrinsic::aarch64_crc32ch:
20657 return tryCombineCRC32(0xffff, N, DAG);
20658 case Intrinsic::aarch64_sve_saddv:
20659 // There is no i64 version of SADDV because the sign is irrelevant.
20660 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
20661       return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
20662     else
20663       return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
20664   case Intrinsic::aarch64_sve_uaddv:
20665     return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
20666   case Intrinsic::aarch64_sve_smaxv:
20667     return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
20668   case Intrinsic::aarch64_sve_umaxv:
20669     return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
20670   case Intrinsic::aarch64_sve_sminv:
20671     return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
20672   case Intrinsic::aarch64_sve_uminv:
20673     return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
20674   case Intrinsic::aarch64_sve_orv:
20675     return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
20676   case Intrinsic::aarch64_sve_eorv:
20677     return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
20678   case Intrinsic::aarch64_sve_andv:
20679     return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
20680 case Intrinsic::aarch64_sve_index:
20681 return LowerSVEIntrinsicIndex(N, DAG);
20682 case Intrinsic::aarch64_sve_dup:
20683 return LowerSVEIntrinsicDUP(N, DAG);
20684 case Intrinsic::aarch64_sve_dup_x:
20685 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
20686 N->getOperand(1));
20687 case Intrinsic::aarch64_sve_ext:
20688 return LowerSVEIntrinsicEXT(N, DAG);
20689 case Intrinsic::aarch64_sve_mul_u:
20690 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
20691 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20692 case Intrinsic::aarch64_sve_smulh_u:
20693 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
20694 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20695 case Intrinsic::aarch64_sve_umulh_u:
20696 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
20697 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20698 case Intrinsic::aarch64_sve_smin_u:
20699 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
20700 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20701 case Intrinsic::aarch64_sve_umin_u:
20702 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
20703 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20704 case Intrinsic::aarch64_sve_smax_u:
20705 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
20706 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20707 case Intrinsic::aarch64_sve_umax_u:
20708 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
20709 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20710 case Intrinsic::aarch64_sve_lsl_u:
20711 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
20712 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20713 case Intrinsic::aarch64_sve_lsr_u:
20714 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
20715 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20716 case Intrinsic::aarch64_sve_asr_u:
20717 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
20718 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20719 case Intrinsic::aarch64_sve_fadd_u:
20720 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
20721 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20722 case Intrinsic::aarch64_sve_fdiv_u:
20723 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
20724 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20725 case Intrinsic::aarch64_sve_fmax_u:
20726 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
20727 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20728 case Intrinsic::aarch64_sve_fmaxnm_u:
20729 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
20730 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20731 case Intrinsic::aarch64_sve_fmla_u:
20732 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
20733 N->getOperand(1), N->getOperand(3), N->getOperand(4),
20734 N->getOperand(2));
20735 case Intrinsic::aarch64_sve_fmin_u:
20736 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
20737 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20738 case Intrinsic::aarch64_sve_fminnm_u:
20739 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
20740 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20741 case Intrinsic::aarch64_sve_fmul_u:
20742 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
20743 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20744 case Intrinsic::aarch64_sve_fsub_u:
20745 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
20746 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20747 case Intrinsic::aarch64_sve_add_u:
20748 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
20749 N->getOperand(3));
20750 case Intrinsic::aarch64_sve_sub_u:
20751 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
20752 N->getOperand(3));
20753 case Intrinsic::aarch64_sve_subr:
20754 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
20755 case Intrinsic::aarch64_sve_and_u:
20756 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
20757 N->getOperand(3));
20758 case Intrinsic::aarch64_sve_bic_u:
20759 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
20760 N->getOperand(2), N->getOperand(3));
20761 case Intrinsic::aarch64_sve_eor_u:
20762 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20763 N->getOperand(3));
20764 case Intrinsic::aarch64_sve_orr_u:
20765 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20766 N->getOperand(3));
20767 case Intrinsic::aarch64_sve_sabd_u:
20768 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20769 N->getOperand(2), N->getOperand(3));
20770 case Intrinsic::aarch64_sve_uabd_u:
20771 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20772 N->getOperand(2), N->getOperand(3));
20773 case Intrinsic::aarch64_sve_sdiv_u:
20774 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
20775 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20776 case Intrinsic::aarch64_sve_udiv_u:
20777 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
20778 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20779 case Intrinsic::aarch64_sve_sqadd:
20780 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
20781 case Intrinsic::aarch64_sve_sqsub_u:
20782 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20783 N->getOperand(2), N->getOperand(3));
20784 case Intrinsic::aarch64_sve_uqadd:
20785 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
20786 case Intrinsic::aarch64_sve_uqsub_u:
20787 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20788 N->getOperand(2), N->getOperand(3));
20789 case Intrinsic::aarch64_sve_sqadd_x:
20790 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
20791 N->getOperand(1), N->getOperand(2));
20792 case Intrinsic::aarch64_sve_sqsub_x:
20793 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20794 N->getOperand(1), N->getOperand(2));
20795 case Intrinsic::aarch64_sve_uqadd_x:
20796 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
20797 N->getOperand(1), N->getOperand(2));
20798 case Intrinsic::aarch64_sve_uqsub_x:
20799 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20800 N->getOperand(1), N->getOperand(2));
20801 case Intrinsic::aarch64_sve_asrd:
20802 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
20803 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20804 case Intrinsic::aarch64_sve_cmphs:
20805 if (!N->getOperand(2).getValueType().isFloatingPoint())
20806       return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20807                          N->getValueType(0), N->getOperand(1), N->getOperand(2),
20808 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
20809 break;
20810 case Intrinsic::aarch64_sve_cmphi:
20811 if (!N->getOperand(2).getValueType().isFloatingPoint())
20812       return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20813                          N->getValueType(0), N->getOperand(1), N->getOperand(2),
20814 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
20815 break;
20816 case Intrinsic::aarch64_sve_fcmpge:
20817 case Intrinsic::aarch64_sve_cmpge:
20818     return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20819                        N->getValueType(0), N->getOperand(1), N->getOperand(2),
20820 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
20821 break;
20822 case Intrinsic::aarch64_sve_fcmpgt:
20823 case Intrinsic::aarch64_sve_cmpgt:
20824     return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20825                        N->getValueType(0), N->getOperand(1), N->getOperand(2),
20826 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
20827 break;
20828 case Intrinsic::aarch64_sve_fcmpeq:
20829 case Intrinsic::aarch64_sve_cmpeq:
20830     return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20831                        N->getValueType(0), N->getOperand(1), N->getOperand(2),
20832 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
20833 break;
20834 case Intrinsic::aarch64_sve_fcmpne:
20835 case Intrinsic::aarch64_sve_cmpne:
20836     return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20837                        N->getValueType(0), N->getOperand(1), N->getOperand(2),
20838 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
20839 break;
20840 case Intrinsic::aarch64_sve_fcmpuo:
20841     return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20842                        N->getValueType(0), N->getOperand(1), N->getOperand(2),
20843 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
20844 break;
20845 case Intrinsic::aarch64_sve_fadda:
20846     return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
20847   case Intrinsic::aarch64_sve_faddv:
20848     return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
20849   case Intrinsic::aarch64_sve_fmaxnmv:
20850     return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
20851   case Intrinsic::aarch64_sve_fmaxv:
20852     return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
20853   case Intrinsic::aarch64_sve_fminnmv:
20854     return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
20855   case Intrinsic::aarch64_sve_fminv:
20856     return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
20857 case Intrinsic::aarch64_sve_sel:
20858 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
20859 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20860 case Intrinsic::aarch64_sve_cmpeq_wide:
20861 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
20862 case Intrinsic::aarch64_sve_cmpne_wide:
20863 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
20864 case Intrinsic::aarch64_sve_cmpge_wide:
20865 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
20866 case Intrinsic::aarch64_sve_cmpgt_wide:
20867 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
20868 case Intrinsic::aarch64_sve_cmplt_wide:
20869 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
20870 case Intrinsic::aarch64_sve_cmple_wide:
20871 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
20872 case Intrinsic::aarch64_sve_cmphs_wide:
20873 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
20874 case Intrinsic::aarch64_sve_cmphi_wide:
20875 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
20876 case Intrinsic::aarch64_sve_cmplo_wide:
20877 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
20878 case Intrinsic::aarch64_sve_cmpls_wide:
20879 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
20880 case Intrinsic::aarch64_sve_ptest_any:
20881 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20882                     AArch64CC::ANY_ACTIVE);
20883   case Intrinsic::aarch64_sve_ptest_first:
20884 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20885                     AArch64CC::FIRST_ACTIVE);
20886   case Intrinsic::aarch64_sve_ptest_last:
20887 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20888                     AArch64CC::LAST_ACTIVE);
20889   case Intrinsic::aarch64_sve_whilelo:
20890 return tryCombineWhileLo(N, DCI, Subtarget);
20891 }
20892 return SDValue();
20893}
20894
20895static bool isCheapToExtend(const SDValue &N) {
20896 unsigned OC = N->getOpcode();
20897 return OC == ISD::LOAD || OC == ISD::MLOAD ||
20898          ISD::isConstantSplatVectorAllZeros(N.getNode());
20899 }
20900
20901static SDValue
20902 performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20903                               SelectionDAG &DAG) {
20904 // If we have (sext (setcc A B)) and A and B are cheap to extend,
20905 // we can move the sext into the arguments and have the same result. For
20906 // example, if A and B are both loads, we can make those extending loads and
20907 // avoid an extra instruction. This pattern appears often in VLS code
20908 // generation where the inputs to the setcc have a different size to the
20909 // instruction that wants to use the result of the setcc.
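  // Rough example (vector types assumed for illustration): in VLS code,
  //   sext_v4i32(setcc(v4i8 load A, v4i8 load B, eq))
  // can become
  //   setcc(sextload_v4i32 A, sextload_v4i32 B, eq)
  // folding the extension into the loads instead of needing an extra extend
  // after the compare.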
20910 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
20911 N->getOperand(0)->getOpcode() == ISD::SETCC);
20912 const SDValue SetCC = N->getOperand(0);
20913
20914 const SDValue CCOp0 = SetCC.getOperand(0);
20915 const SDValue CCOp1 = SetCC.getOperand(1);
20916 if (!CCOp0->getValueType(0).isInteger() ||
20917 !CCOp1->getValueType(0).isInteger())
20918 return SDValue();
20919
20920 ISD::CondCode Code =
20921 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
20922
20923 ISD::NodeType ExtType =
20924 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
20925
20926 if (isCheapToExtend(SetCC.getOperand(0)) &&
20927 isCheapToExtend(SetCC.getOperand(1))) {
20928 const SDValue Ext1 =
20929 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
20930 const SDValue Ext2 =
20931 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
20932
20933 return DAG.getSetCC(
20934 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
20935 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
20936 }
20937
20938 return SDValue();
20939}
20940
20941 static SDValue performExtendCombine(SDNode *N,
20942                                     TargetLowering::DAGCombinerInfo &DCI,
20943                                     SelectionDAG &DAG) {
20944 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
20945 // we can convert that DUP into another extract_high (of a bigger DUP), which
20946 // helps the backend to decide that an sabdl2 would be useful, saving a real
20947 // extract_high operation.
20948 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
20949 (N->getOperand(0).getOpcode() == ISD::ABDU ||
20950 N->getOperand(0).getOpcode() == ISD::ABDS)) {
20951 SDNode *ABDNode = N->getOperand(0).getNode();
20952 SDValue NewABD =
20953         tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
20954     if (!NewABD.getNode())
20955 return SDValue();
20956
20957 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
20958 }
20959
20960 if (N->getValueType(0).isFixedLengthVector() &&
20961 N->getOpcode() == ISD::SIGN_EXTEND &&
20962 N->getOperand(0)->getOpcode() == ISD::SETCC)
20963 return performSignExtendSetCCCombine(N, DCI, DAG);
20964
20965 return SDValue();
20966}
20967
20968 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
20969                                SDValue SplatVal, unsigned NumVecElts) {
20970 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
20971 Align OrigAlignment = St.getAlign();
20972 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
20973
20974 // Create scalar stores. This is at least as good as the code sequence for a
20975 // split unaligned store which is a dup.s, ext.b, and two stores.
20976 // Most of the time the three stores should be replaced by store pair
20977 // instructions (stp).
20978 SDLoc DL(&St);
20979 SDValue BasePtr = St.getBasePtr();
20980 uint64_t BaseOffset = 0;
20981
20982 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
20983 SDValue NewST1 =
20984 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
20985 OrigAlignment, St.getMemOperand()->getFlags());
20986
20987   // As this is in ISel, we will not merge this add, which may degrade results.
20988 if (BasePtr->getOpcode() == ISD::ADD &&
20989 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
20990 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
20991 BasePtr = BasePtr->getOperand(0);
20992 }
20993
20994 unsigned Offset = EltOffset;
20995 while (--NumVecElts) {
20996 Align Alignment = commonAlignment(OrigAlignment, Offset);
20997 SDValue OffsetPtr =
20998 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
20999 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
21000 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
21001 PtrInfo.getWithOffset(Offset), Alignment,
21002 St.getMemOperand()->getFlags());
21003 Offset += EltOffset;
21004 }
21005 return NewST1;
21006}
21007
21008// Returns an SVE type that ContentTy can be trivially sign or zero extended
21009// into.
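// For example (illustrative): nxv2i16 and nxv2f32 both map to nxv2i64, and
// nxv8i8 maps to nxv8i16; the container keeps the element count and widens
// each element so the value fills a full SVE vector.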
21010static MVT getSVEContainerType(EVT ContentTy) {
21011 assert(ContentTy.isSimple() && "No SVE containers for extended types");
21012
21013 switch (ContentTy.getSimpleVT().SimpleTy) {
21014 default:
21015 llvm_unreachable("No known SVE container for this MVT type");
21016 case MVT::nxv2i8:
21017 case MVT::nxv2i16:
21018 case MVT::nxv2i32:
21019 case MVT::nxv2i64:
21020 case MVT::nxv2f32:
21021 case MVT::nxv2f64:
21022 return MVT::nxv2i64;
21023 case MVT::nxv4i8:
21024 case MVT::nxv4i16:
21025 case MVT::nxv4i32:
21026 case MVT::nxv4f32:
21027 return MVT::nxv4i32;
21028 case MVT::nxv8i8:
21029 case MVT::nxv8i16:
21030 case MVT::nxv8f16:
21031 case MVT::nxv8bf16:
21032 return MVT::nxv8i16;
21033 case MVT::nxv16i8:
21034 return MVT::nxv16i8;
21035 }
21036}
21037
21038static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
21039 SDLoc DL(N);
21040 EVT VT = N->getValueType(0);
21041
21042   if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
21043     return SDValue();
21044
21045 EVT ContainerVT = VT;
21046 if (ContainerVT.isInteger())
21047 ContainerVT = getSVEContainerType(ContainerVT);
21048
21049 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
21050 SDValue Ops[] = { N->getOperand(0), // Chain
21051 N->getOperand(2), // Pg
21052 N->getOperand(3), // Base
21053 DAG.getValueType(VT) };
21054
21055 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
21056 SDValue LoadChain = SDValue(Load.getNode(), 1);
21057
21058 if (ContainerVT.isInteger() && (VT != ContainerVT))
21059 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
21060
21061 return DAG.getMergeValues({ Load, LoadChain }, DL);
21062}
21063
21064 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
21065   SDLoc DL(N);
21066 EVT VT = N->getValueType(0);
21067 EVT PtrTy = N->getOperand(3).getValueType();
21068
21069 EVT LoadVT = VT;
21070 if (VT.isFloatingPoint())
21071 LoadVT = VT.changeTypeToInteger();
21072
21073 auto *MINode = cast<MemIntrinsicSDNode>(N);
21074 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
21075 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
21076 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
21077 MINode->getOperand(2), PassThru,
21078 MINode->getMemoryVT(), MINode->getMemOperand(),
21079                                 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
21080 
21081 if (VT.isFloatingPoint()) {
21082 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
21083 return DAG.getMergeValues(Ops, DL);
21084 }
21085
21086 return L;
21087}
21088
21089template <unsigned Opcode>
21090 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
21091   static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
21092                     Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
21093                 "Unsupported opcode.");
21094 SDLoc DL(N);
21095 EVT VT = N->getValueType(0);
21096
21097 EVT LoadVT = VT;
21098 if (VT.isFloatingPoint())
21099 LoadVT = VT.changeTypeToInteger();
21100
21101 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
21102 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
21103 SDValue LoadChain = SDValue(Load.getNode(), 1);
21104
21105 if (VT.isFloatingPoint())
21106 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
21107
21108 return DAG.getMergeValues({Load, LoadChain}, DL);
21109}
21110
21111 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
21112   SDLoc DL(N);
21113 SDValue Data = N->getOperand(2);
21114 EVT DataVT = Data.getValueType();
21115 EVT HwSrcVt = getSVEContainerType(DataVT);
21116 SDValue InputVT = DAG.getValueType(DataVT);
21117
21118 if (DataVT.isFloatingPoint())
21119 InputVT = DAG.getValueType(HwSrcVt);
21120
21121 SDValue SrcNew;
21122 if (Data.getValueType().isFloatingPoint())
21123 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
21124 else
21125 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
21126
21127 SDValue Ops[] = { N->getOperand(0), // Chain
21128 SrcNew,
21129 N->getOperand(4), // Base
21130 N->getOperand(3), // Pg
21131 InputVT
21132 };
21133
21134 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
21135}
21136
21137 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
21138   SDLoc DL(N);
21139
21140 SDValue Data = N->getOperand(2);
21141 EVT DataVT = Data.getValueType();
21142 EVT PtrTy = N->getOperand(4).getValueType();
21143
21144 if (DataVT.isFloatingPoint())
21145     Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
21146 
21147 auto *MINode = cast<MemIntrinsicSDNode>(N);
21148 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
21149 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
21150 MINode->getMemoryVT(), MINode->getMemOperand(),
21151 ISD::UNINDEXED, false, false);
21152}
21153
21154 /// Replace a splat of zeros stored to a vector by scalar stores of WZR/XZR. The
21155/// load store optimizer pass will merge them to store pair stores. This should
21156/// be better than a movi to create the vector zero followed by a vector store
21157 /// if the zero constant is not re-used, since one instruction and one register
21158/// live range will be removed.
21159///
21160/// For example, the final generated code should be:
21161///
21162/// stp xzr, xzr, [x0]
21163///
21164/// instead of:
21165///
21166/// movi v0.2d, #0
21167/// str q0, [x0]
21168///
21169 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21170   SDValue StVal = St.getValue();
21171 EVT VT = StVal.getValueType();
21172
21173 // Avoid scalarizing zero splat stores for scalable vectors.
21174 if (VT.isScalableVector())
21175 return SDValue();
21176
21177 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
21178 // 2, 3 or 4 i32 elements.
21179 int NumVecElts = VT.getVectorNumElements();
21180 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
21181 VT.getVectorElementType().getSizeInBits() == 64) ||
21182 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
21183 VT.getVectorElementType().getSizeInBits() == 32)))
21184 return SDValue();
21185
21186 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
21187 return SDValue();
21188
21189 // If the zero constant has more than one use then the vector store could be
21190 // better since the constant mov will be amortized and stp q instructions
21191 // should be able to be formed.
21192 if (!StVal.hasOneUse())
21193 return SDValue();
21194
21195 // If the store is truncating then it's going down to i16 or smaller, which
21196 // means it can be implemented in a single store anyway.
21197 if (St.isTruncatingStore())
21198 return SDValue();
21199
21200 // If the immediate offset of the address operand is too large for the stp
21201 // instruction, then bail out.
21202 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
21203 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
21204 if (Offset < -512 || Offset > 504)
21205 return SDValue();
21206 }
21207
21208 for (int I = 0; I < NumVecElts; ++I) {
21209 SDValue EltVal = StVal.getOperand(I);
21210 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
21211 return SDValue();
21212 }
21213
21214 // Use a CopyFromReg WZR/XZR here to prevent
21215 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
21216 SDLoc DL(&St);
21217 unsigned ZeroReg;
21218 EVT ZeroVT;
21219 if (VT.getVectorElementType().getSizeInBits() == 32) {
21220 ZeroReg = AArch64::WZR;
21221 ZeroVT = MVT::i32;
21222 } else {
21223 ZeroReg = AArch64::XZR;
21224 ZeroVT = MVT::i64;
21225 }
21226 SDValue SplatVal =
21227 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
21228 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21229}
21230
21231 /// Replace a splat of a scalar stored to a vector by scalar stores of the scalar
21232/// value. The load store optimizer pass will merge them to store pair stores.
21233/// This has better performance than a splat of the scalar followed by a split
21234/// vector store. Even if the stores are not merged it is four stores vs a dup,
21235/// followed by an ext.b and two stores.
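/// Hedged example (registers illustrative): a store of a v4i32 splat of w1 to
/// [x0] ideally becomes
///   stp w1, w1, [x0]
///   stp w1, w1, [x0, #8]
/// rather than "dup v0.4s, w1" followed by "str q0, [x0]".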
21236 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21237   SDValue StVal = St.getValue();
21238 EVT VT = StVal.getValueType();
21239
21240 // Don't replace floating point stores, they possibly won't be transformed to
21241 // stp because of the store pair suppress pass.
21242 if (VT.isFloatingPoint())
21243 return SDValue();
21244
21245 // We can express a splat as store pair(s) for 2 or 4 elements.
21246 unsigned NumVecElts = VT.getVectorNumElements();
21247 if (NumVecElts != 4 && NumVecElts != 2)
21248 return SDValue();
21249
21250 // If the store is truncating then it's going down to i16 or smaller, which
21251 // means it can be implemented in a single store anyway.
21252 if (St.isTruncatingStore())
21253 return SDValue();
21254
21255 // Check that this is a splat.
21256 // Make sure that each of the relevant vector element locations are inserted
21257 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
21258 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
21259 SDValue SplatVal;
21260 for (unsigned I = 0; I < NumVecElts; ++I) {
21261 // Check for insert vector elements.
21262 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
21263 return SDValue();
21264
21265 // Check that same value is inserted at each vector element.
21266 if (I == 0)
21267 SplatVal = StVal.getOperand(1);
21268 else if (StVal.getOperand(1) != SplatVal)
21269 return SDValue();
21270
21271 // Check insert element index.
21272 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
21273 if (!CIndex)
21274 return SDValue();
21275 uint64_t IndexVal = CIndex->getZExtValue();
21276 if (IndexVal >= NumVecElts)
21277 return SDValue();
21278 IndexNotInserted.reset(IndexVal);
21279
21280 StVal = StVal.getOperand(0);
21281 }
21282 // Check that all vector element locations were inserted to.
21283 if (IndexNotInserted.any())
21284 return SDValue();
21285
21286 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21287}
21288
21289 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
21290                            SelectionDAG &DAG,
21291 const AArch64Subtarget *Subtarget) {
21292
21293 StoreSDNode *S = cast<StoreSDNode>(N);
21294 if (S->isVolatile() || S->isIndexed())
21295 return SDValue();
21296
21297 SDValue StVal = S->getValue();
21298 EVT VT = StVal.getValueType();
21299
21300 if (!VT.isFixedLengthVector())
21301 return SDValue();
21302
21303 // If we get a splat of zeros, convert this vector store to a store of
21304 // scalars. They will be merged into store pairs of xzr thereby removing one
21305 // instruction and one register.
21306 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
21307 return ReplacedZeroSplat;
21308
21309 // FIXME: The logic for deciding if an unaligned store should be split should
21310 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
21311 // a call to that function here.
21312
21313 if (!Subtarget->isMisaligned128StoreSlow())
21314 return SDValue();
21315
21316 // Don't split at -Oz.
21318 return SDValue();
21319
21320 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
21321 // those up regresses performance on micro-benchmarks and olden/bh.
21322 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
21323 return SDValue();
21324
21325 // Split unaligned 16B stores. They are terrible for performance.
21326 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
21327 // extensions can use this to mark that it does not want splitting to happen
21328 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
21329 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
21330 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
21331 S->getAlign() <= Align(2))
21332 return SDValue();
21333
21334  // If we get a splat of a scalar, convert this vector store to a store of
21335  // scalars. They will be merged into store pairs, thereby removing two
21336  // instructions.
21337 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
21338 return ReplacedSplat;
21339
21340 SDLoc DL(S);
21341
21342 // Split VT into two.
21343 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
21344 unsigned NumElts = HalfVT.getVectorNumElements();
21345 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21346 DAG.getConstant(0, DL, MVT::i64));
21347 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21348 DAG.getConstant(NumElts, DL, MVT::i64));
21349 SDValue BasePtr = S->getBasePtr();
21350 SDValue NewST1 =
21351 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
21352 S->getAlign(), S->getMemOperand()->getFlags());
21353 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
21354 DAG.getConstant(8, DL, MVT::i64));
21355 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
21356 S->getPointerInfo(), S->getAlign(),
21357 S->getMemOperand()->getFlags());
21358}
21359
21361  assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
21362
21363 // splice(pg, op1, undef) -> op1
21364 if (N->getOperand(2).isUndef())
21365 return N->getOperand(1);
21366
21367 return SDValue();
21368}
21369
21371 const AArch64Subtarget *Subtarget) {
21372 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
21373 N->getOpcode() == AArch64ISD::UUNPKLO) &&
21374 "Unexpected Opcode!");
21375
21376 // uunpklo/hi undef -> undef
21377 if (N->getOperand(0).isUndef())
21378 return DAG.getUNDEF(N->getValueType(0));
21379
21380 // If this is a masked load followed by an UUNPKLO, fold this into a masked
21381 // extending load. We can do this even if this is already a masked
21382 // {z,}extload.
21383 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
21384 N->getOpcode() == AArch64ISD::UUNPKLO) {
21385 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
21386 SDValue Mask = MLD->getMask();
21387 SDLoc DL(N);
21388
21389 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
21390 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
21391 (MLD->getPassThru()->isUndef() ||
21392 isZerosVector(MLD->getPassThru().getNode()))) {
21393 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
21394 unsigned PgPattern = Mask->getConstantOperandVal(0);
21395 EVT VT = N->getValueType(0);
21396
21397 // Ensure we can double the size of the predicate pattern
21398 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
21399 if (NumElts &&
21400 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
21401 Mask =
21402 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
21403 SDValue PassThru = DAG.getConstant(0, DL, VT);
21404 SDValue NewLoad = DAG.getMaskedLoad(
21405 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
21406 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
21408
21409 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
21410
21411 return NewLoad;
21412 }
21413 }
21414 }
21415
21416 return SDValue();
21417}
21418
21420 if (N->getOpcode() != AArch64ISD::UZP1)
21421 return false;
21422 SDValue Op0 = N->getOperand(0);
21423 EVT SrcVT = Op0->getValueType(0);
21424 EVT DstVT = N->getValueType(0);
21425 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
21426 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
21427 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
21428}
21429
21430// Try to combine rounding shifts where the operands come from an extend, and
21431// the result is truncated and combined into one vector.
21432// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
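// A sketch with assumed types: for X : nxv8i16 and shift amount C,
//   lo = uunpklo(X) : nxv4i32,  hi = uunpkhi(X) : nxv4i32
//   uzp1(rshrnb(lo, C), rshrnb(hi, C)) : nxv8i16
// applies the same rounding shift to every 16-bit lane as the single
//   urshr(ptrue, X, C) : nxv8i16
// so the unpack/narrow round trip can be removed.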
21434 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
21435 SDValue Op0 = N->getOperand(0);
21436 SDValue Op1 = N->getOperand(1);
21437 EVT ResVT = N->getValueType(0);
21438
21439 unsigned RshOpc = Op0.getOpcode();
21440 if (RshOpc != AArch64ISD::RSHRNB_I)
21441 return SDValue();
21442
21443 // Same op code and imm value?
21444 SDValue ShiftValue = Op0.getOperand(1);
21445 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
21446 return SDValue();
21447
21448 // Same unextended operand value?
21449 SDValue Lo = Op0.getOperand(0);
21450 SDValue Hi = Op1.getOperand(0);
21451 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
21452 Hi.getOpcode() != AArch64ISD::UUNPKHI)
21453 return SDValue();
21454 SDValue OrigArg = Lo.getOperand(0);
21455 if (OrigArg != Hi.getOperand(0))
21456 return SDValue();
21457
21458 SDLoc DL(N);
21459 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
21460 getPredicateForVector(DAG, DL, ResVT), OrigArg,
21461 ShiftValue);
21462}
21463
21464// Try to simplify:
21465// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
21466// t2 = nxv8i16 srl(t1, ShiftValue)
21467// to
21468// t1 = nxv8i16 rshrnb(X, ShiftValue).
21469// rshrnb zeroes the top half of each element's bits. Therefore, this combine
21470// should only be performed when the instruction that consumes the rshrnb
21471// result does not care about the top half of each element, for example a uzp1
21472// or a truncating store.
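// Worked example (values are illustrative only): with ShiftValue == 4 on
// 16-bit lanes, (x + 8) >> 4 is a rounding shift right by 4; e.g. x == 39
// gives (39 + 8) >> 4 == 2, i.e. 39/16 rounded to nearest. rshrnb(x, 4)
// produces the same value in the bottom half of each lane, which is all that a
// following uzp1 or truncating store reads.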
21474 const AArch64Subtarget *Subtarget) {
21475 EVT VT = Srl->getValueType(0);
21476 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
21477 return SDValue();
21478
21479 EVT ResVT;
21480 if (VT == MVT::nxv8i16)
21481 ResVT = MVT::nxv16i8;
21482 else if (VT == MVT::nxv4i32)
21483 ResVT = MVT::nxv8i16;
21484 else if (VT == MVT::nxv2i64)
21485 ResVT = MVT::nxv4i32;
21486 else
21487 return SDValue();
21488
21489 SDLoc DL(Srl);
21490 unsigned ShiftValue;
21491 SDValue RShOperand;
21492 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
21493 return SDValue();
21494 SDValue Rshrnb = DAG.getNode(
21495 AArch64ISD::RSHRNB_I, DL, ResVT,
21496 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
21497 return DAG.getNode(ISD::BITCAST, DL, VT, Rshrnb);
21498}
21499
21501 const AArch64Subtarget *Subtarget) {
21502 SDLoc DL(N);
21503 SDValue Op0 = N->getOperand(0);
21504 SDValue Op1 = N->getOperand(1);
21505 EVT ResVT = N->getValueType(0);
21506
21507 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
21508 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
21510 Op0.getOperand(0) == Op1.getOperand(0)) {
21511
21512 SDValue SourceVec = Op0.getOperand(0);
21513 uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
21514 uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
21515 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
21516 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
21517 EVT OpVT = Op0.getOperand(1).getValueType();
21518 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
21519 SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
21520 DAG.getUNDEF(WidenedResVT));
21521 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
21522 DAG.getConstant(0, DL, OpVT));
21523 }
21524 }
21525
21526 // Following optimizations only work with uzp1.
21527 if (N->getOpcode() == AArch64ISD::UZP2)
21528 return SDValue();
21529
21530 // uzp1(x, undef) -> concat(truncate(x), undef)
21531 if (Op1.getOpcode() == ISD::UNDEF) {
21532 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
21533 switch (ResVT.getSimpleVT().SimpleTy) {
21534 default:
21535 break;
21536 case MVT::v16i8:
21537 BCVT = MVT::v8i16;
21538 HalfVT = MVT::v8i8;
21539 break;
21540 case MVT::v8i16:
21541 BCVT = MVT::v4i32;
21542 HalfVT = MVT::v4i16;
21543 break;
21544 case MVT::v4i32:
21545 BCVT = MVT::v2i64;
21546 HalfVT = MVT::v2i32;
21547 break;
21548 }
21549 if (BCVT != MVT::Other) {
21550 SDValue BC = DAG.getBitcast(BCVT, Op0);
21551 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
21552 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
21553 DAG.getUNDEF(HalfVT));
21554 }
21555 }
21556
21557 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
21558 return Urshr;
21559
21560 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op0, DAG, Subtarget))
21561 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
21562
21563 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op1, DAG, Subtarget))
21564 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
21565
21566 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
21567 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
21568 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21569 SDValue X = Op0.getOperand(0).getOperand(0);
21570 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
21571 }
21572 }
21573
21574 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
21575 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
21576 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21577 SDValue Z = Op1.getOperand(0).getOperand(1);
21578 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
21579 }
21580 }
21581
21582 // These optimizations only work on little endian.
21583 if (!DAG.getDataLayout().isLittleEndian())
21584 return SDValue();
21585
21586 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
21587 // Example:
21588 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
21589 // to
21590 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
21592 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
21593 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
21594 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
21595 Op1.getOperand(0));
21596 }
21597 }
21598
21599 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
21600 return SDValue();
21601
21602 SDValue SourceOp0 = peekThroughBitcasts(Op0);
21603 SDValue SourceOp1 = peekThroughBitcasts(Op1);
21604
21605 // truncating uzp1(x, y) -> xtn(concat (x, y))
21606 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
21607 EVT Op0Ty = SourceOp0.getValueType();
21608 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
21609 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
21610 SDValue Concat =
21613 SourceOp0, SourceOp1);
21614 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
21615 }
21616 }
21617
21618 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
21619 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
21620 SourceOp1.getOpcode() != ISD::TRUNCATE)
21621 return SDValue();
21622 SourceOp0 = SourceOp0.getOperand(0);
21623 SourceOp1 = SourceOp1.getOperand(0);
21624
21625 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
21626 !SourceOp0.getValueType().isSimple())
21627 return SDValue();
21628
21629 EVT ResultTy;
21630
21631 switch (SourceOp0.getSimpleValueType().SimpleTy) {
21632 case MVT::v2i64:
21633 ResultTy = MVT::v4i32;
21634 break;
21635 case MVT::v4i32:
21636 ResultTy = MVT::v8i16;
21637 break;
21638 case MVT::v8i16:
21639 ResultTy = MVT::v16i8;
21640 break;
21641 default:
21642 return SDValue();
21643 }
21644
21645 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
21646 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
21647 SDValue UzpResult =
21648 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
21649
21650 EVT BitcastResultTy;
21651
21652 switch (ResVT.getSimpleVT().SimpleTy) {
21653 case MVT::v2i32:
21654 BitcastResultTy = MVT::v2i64;
21655 break;
21656 case MVT::v4i16:
21657 BitcastResultTy = MVT::v4i32;
21658 break;
21659 case MVT::v8i8:
21660 BitcastResultTy = MVT::v8i16;
21661 break;
21662 default:
21663 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
21664 }
21665
21666 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
21667 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
21668}
21669
21671 unsigned Opc = N->getOpcode();
21672
21673 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
21675 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
21677 "Invalid opcode.");
21678
21679 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
21681 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
21683 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
21687
21688 SDLoc DL(N);
21689 SDValue Chain = N->getOperand(0);
21690 SDValue Pg = N->getOperand(1);
21691 SDValue Base = N->getOperand(2);
21692 SDValue Offset = N->getOperand(3);
21693 SDValue Ty = N->getOperand(4);
21694
21695 EVT ResVT = N->getValueType(0);
21696
21697 const auto OffsetOpc = Offset.getOpcode();
21698 const bool OffsetIsZExt =
21700 const bool OffsetIsSExt =
21702
21703 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
21704 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
21705 SDValue ExtPg = Offset.getOperand(0);
21706 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
21707 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
21708
21709    // If the predicate for the sign- or zero-extended offset is the
21710    // same as the predicate used for this load, and the sign-/zero-extension
21711    // was from 32 bits, fold the extension into the gather node itself.
21712 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
21713 SDValue UnextendedOffset = Offset.getOperand(1);
21714
21715 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
21716 if (Signed)
21717 NewOpc = getSignExtendedGatherOpcode(NewOpc);
21718
21719 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
21720 {Chain, Pg, Base, UnextendedOffset, Ty});
21721 }
21722 }
21723
21724 return SDValue();
21725}
21726
21727/// Optimize a vector shift instruction and its operand if shifted out
21728/// bits are not used.
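// For instance (an illustration, not an exhaustive list): for a VLSHR by 8 on
// 32-bit lanes only the top 24 bits of the operand can reach the result, so
// SimplifyDemandedBits may remove work that only affects the low 8 bits.
// Likewise VASHR(VSHL(x, n), n) acts as a lane-wise sign_extend_inreg and
// folds to x when each lane of x already has more than n sign bits.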
21730 const AArch64TargetLowering &TLI,
21732 assert(N->getOpcode() == AArch64ISD::VASHR ||
21733 N->getOpcode() == AArch64ISD::VLSHR);
21734
21735 SDValue Op = N->getOperand(0);
21736 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
21737
21738 unsigned ShiftImm = N->getConstantOperandVal(1);
21739 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
21740
21741  // Remove sign_extend_inreg (ashr(shl(x))) based on the number of sign bits.
21742 if (N->getOpcode() == AArch64ISD::VASHR &&
21743 Op.getOpcode() == AArch64ISD::VSHL &&
21744 N->getOperand(1) == Op.getOperand(1))
21745 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
21746 return Op.getOperand(0);
21747
21748 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
21749 APInt DemandedMask = ~ShiftedOutBits;
21750
21751 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
21752 return SDValue(N, 0);
21753
21754 return SDValue();
21755}
21756
21758 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
21759 // This transform works in partnership with performSetCCPunpkCombine to
21760 // remove unnecessary transfer of predicates into standard registers and back
21761 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
21762 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
21763 MVT::i1) {
21764 SDValue CC = N->getOperand(0)->getOperand(0);
21765 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
21766 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
21767 DAG.getVectorIdxConstant(0, SDLoc(N)));
21768 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
21769 }
21770
21771 return SDValue();
21772}
21773
21774/// Target-specific DAG combine function for post-increment LD1 (lane) and
21775/// post-increment LD1R.
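// Illustrative target sequence (register choice is arbitrary):
//   ld1r { v0.4s }, [x0]
//   add  x0, x0, #4
// can be combined into the post-incremented form
//   ld1r { v0.4s }, [x0], #4
// and similarly for the lane variant, e.g. ld1 { v0.s }[1], [x0], #4.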
21778 bool IsLaneOp) {
21779 if (DCI.isBeforeLegalizeOps())
21780 return SDValue();
21781
21782 SelectionDAG &DAG = DCI.DAG;
21783 EVT VT = N->getValueType(0);
21784
21785 if (!VT.is128BitVector() && !VT.is64BitVector())
21786 return SDValue();
21787
21788 unsigned LoadIdx = IsLaneOp ? 1 : 0;
21789 SDNode *LD = N->getOperand(LoadIdx).getNode();
21790  // If it is not a LOAD, we cannot do this combine.
21791 if (LD->getOpcode() != ISD::LOAD)
21792 return SDValue();
21793
21794 // The vector lane must be a constant in the LD1LANE opcode.
21795 SDValue Lane;
21796 if (IsLaneOp) {
21797 Lane = N->getOperand(2);
21798 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
21799 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
21800 return SDValue();
21801 }
21802
21803 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
21804 EVT MemVT = LoadSDN->getMemoryVT();
21805 // Check if memory operand is the same type as the vector element.
21806 if (MemVT != VT.getVectorElementType())
21807 return SDValue();
21808
21809 // Check if there are other uses. If so, do not combine as it will introduce
21810 // an extra load.
21811 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
21812 ++UI) {
21813 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
21814 continue;
21815 if (*UI != N)
21816 return SDValue();
21817 }
21818
21819 // If there is one use and it can splat the value, prefer that operation.
21820 // TODO: This could be expanded to more operations if they reliably use the
21821 // index variants.
21822 if (N->hasOneUse()) {
21823 unsigned UseOpc = N->use_begin()->getOpcode();
21824 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
21825 return SDValue();
21826 }
21827
21828 SDValue Addr = LD->getOperand(1);
21829 SDValue Vector = N->getOperand(0);
21830 // Search for a use of the address operand that is an increment.
21831 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
21832 Addr.getNode()->use_end(); UI != UE; ++UI) {
21833 SDNode *User = *UI;
21834 if (User->getOpcode() != ISD::ADD
21835 || UI.getUse().getResNo() != Addr.getResNo())
21836 continue;
21837
21838 // If the increment is a constant, it must match the memory ref size.
21839 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
21840 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
21841 uint32_t IncVal = CInc->getZExtValue();
21842 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
21843 if (IncVal != NumBytes)
21844 continue;
21845 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
21846 }
21847
21848    // To avoid cycle construction, make sure that neither the load nor the add
21849    // is a predecessor of the other or of the Vector.
21852 Visited.insert(Addr.getNode());
21853 Worklist.push_back(User);
21854 Worklist.push_back(LD);
21855 Worklist.push_back(Vector.getNode());
21856 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
21857 SDNode::hasPredecessorHelper(User, Visited, Worklist))
21858 continue;
21859
21861 Ops.push_back(LD->getOperand(0)); // Chain
21862 if (IsLaneOp) {
21863 Ops.push_back(Vector); // The vector to be inserted
21864 Ops.push_back(Lane); // The lane to be inserted in the vector
21865 }
21866 Ops.push_back(Addr);
21867 Ops.push_back(Inc);
21868
21869 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
21870 SDVTList SDTys = DAG.getVTList(Tys);
21871 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
21872 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
21873 MemVT,
21874 LoadSDN->getMemOperand());
21875
21876 // Update the uses.
21877 SDValue NewResults[] = {
21878 SDValue(LD, 0), // The result of load
21879 SDValue(UpdN.getNode(), 2) // Chain
21880 };
21881 DCI.CombineTo(LD, NewResults);
21882 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
21883 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
21884
21885 break;
21886 }
21887 return SDValue();
21888}
21889
21890/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
21891/// address translation.
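// For example (a sketch; the exact IR shape varies): with TBI only bits
// [55:0] of the address are demanded, so an explicit tag-clearing mask such as
//   and x1, x1, #0x00ffffffffffffff
// feeding the memory access can be removed by SimplifyDemandedBits.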
21894 SelectionDAG &DAG) {
21895 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
21896 KnownBits Known;
21898 !DCI.isBeforeLegalizeOps());
21899 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21900 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
21901 DCI.CommitTargetLoweringOpt(TLO);
21902 return true;
21903 }
21904 return false;
21905}
21906
21908 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
21909 "Expected STORE dag node in input!");
21910
21911 if (auto Store = dyn_cast<StoreSDNode>(N)) {
21912 if (!Store->isTruncatingStore() || Store->isIndexed())
21913 return SDValue();
21914 SDValue Ext = Store->getValue();
21915 auto ExtOpCode = Ext.getOpcode();
21916 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
21917 ExtOpCode != ISD::ANY_EXTEND)
21918 return SDValue();
21919 SDValue Orig = Ext->getOperand(0);
21920 if (Store->getMemoryVT() != Orig.getValueType())
21921 return SDValue();
21922 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
21923 Store->getBasePtr(), Store->getMemOperand());
21924 }
21925
21926 return SDValue();
21927}
21928
21929// A custom combine to lower load <3 x i8> as the more efficient sequence
21930// below:
21931// ldrb wX, [x0, #2]
21932// ldrh wY, [x0]
21933// orr wX, wY, wX, lsl #16
21934// fmov s0, wX
21935//
21936// Note that an alternative sequence with even fewer (although usually more
21937// complex/expensive) instructions would be:
21938// ld1r.4h { v0 }, [x0], #2
21939// ld1.b { v0 }[2], [x0]
21940//
21941// Generating this sequence unfortunately results in noticeably worse codegen
21942// for code that extends the loaded v3i8, due to legalization breaking vector
21943// shuffle detection in a way that is very difficult to work around.
21944// TODO: Revisit once v3i8 legalization has been improved in general.
21946 EVT MemVT = LD->getMemoryVT();
21947 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
21948 LD->getOriginalAlign() >= 4)
21949 return SDValue();
21950
21951 SDLoc DL(LD);
21953 SDValue Chain = LD->getChain();
21954 SDValue BasePtr = LD->getBasePtr();
21955 MachineMemOperand *MMO = LD->getMemOperand();
21956 assert(LD->getOffset().isUndef() && "undef offset expected");
21957
21958 // Load 2 x i8, then 1 x i8.
21959 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
21960 TypeSize Offset2 = TypeSize::getFixed(2);
21961 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
21962 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
21963 MF.getMachineMemOperand(MMO, 2, 1));
21964
21965 // Extend to i32.
21966 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
21967 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
21968
21969 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
21970 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
21971 DAG.getConstant(16, DL, MVT::i32));
21972 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
21973 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
21974
21975 // Extract v3i8 again.
21976 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
21977 DAG.getConstant(0, DL, MVT::i64));
21978 SDValue TokenFactor = DAG.getNode(
21979 ISD::TokenFactor, DL, MVT::Other,
21980 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
21981 return DAG.getMergeValues({Extract, TokenFactor}, DL);
21982}
21983
21984// Perform TBI simplification if supported by the target, and try to break up
21985// nontemporal loads larger than 256 bits (for odd-sized types) so that
21986// 256-bit LDNP Q-register load instructions can be selected.
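// A sketch of the split (the type is an assumed example): a non-temporal load
// of v12i32 (384 bits) becomes one v8i32 load at offset 0, selectable as a
// 256-bit LDNP of two q-registers, plus a v4i32 load of the remaining 128 bits
// at byte offset 32; the pieces are concatenated and the original v12i32 is
// extracted from the result.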
21989 SelectionDAG &DAG,
21990 const AArch64Subtarget *Subtarget) {
21991 if (Subtarget->supportsAddressTopByteIgnored())
21992 performTBISimplification(N->getOperand(1), DCI, DAG);
21993
21994 LoadSDNode *LD = cast<LoadSDNode>(N);
21995 if (LD->isVolatile() || !Subtarget->isLittleEndian())
21996 return SDValue(N, 0);
21997
21998 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
21999 return Res;
22000
22001 if (!LD->isNonTemporal())
22002 return SDValue(N, 0);
22003
22004 EVT MemVT = LD->getMemoryVT();
22005 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
22006 MemVT.getSizeInBits() % 256 == 0 ||
22007 256 % MemVT.getScalarSizeInBits() != 0)
22008 return SDValue(N, 0);
22009
22010 SDLoc DL(LD);
22011 SDValue Chain = LD->getChain();
22012 SDValue BasePtr = LD->getBasePtr();
22013 SDNodeFlags Flags = LD->getFlags();
22015 SmallVector<SDValue, 4> LoadOpsChain;
22016  // Replace any nontemporal load over 256 bits with a series of 256-bit loads
22017  // and a final vector load smaller than 256 bits. This way we can utilize
22018  // 256-bit loads and reduce the number of load instructions generated.
22019 MVT NewVT =
22021 256 / MemVT.getVectorElementType().getSizeInBits());
22022 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
22023  // Create the 256-bit loads at byte offsets 0, 32, ..., (Num256Loads - 1) * 32.
22024 for (unsigned I = 0; I < Num256Loads; I++) {
22025 unsigned PtrOffset = I * 32;
22026 SDValue NewPtr = DAG.getMemBasePlusOffset(
22027 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
22028 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
22029 SDValue NewLoad = DAG.getLoad(
22030 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
22031 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
22032 LoadOps.push_back(NewLoad);
22033 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
22034 }
22035
22036 // Process remaining bits of the load operation.
22037  // This is done by creating an UNDEF vector to match the size of the
22038  // 256-bit loads and inserting the remaining load into it. We extract the
22039  // original load type at the end using an EXTRACT_SUBVECTOR node.
22040 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
22041 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
22042 MVT RemainingVT = MVT::getVectorVT(
22044 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
22045 SDValue NewPtr = DAG.getMemBasePlusOffset(
22046 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
22047 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
22048 SDValue RemainingLoad =
22049 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
22050 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
22051 LD->getMemOperand()->getFlags(), LD->getAAInfo());
22052 SDValue UndefVector = DAG.getUNDEF(NewVT);
22053 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
22054  SDValue ExtendedRemainingLoad =
22055 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
22056 {UndefVector, RemainingLoad, InsertIdx});
22057  LoadOps.push_back(ExtendedRemainingLoad);
22058 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
22059 EVT ConcatVT =
22061 LoadOps.size() * NewVT.getVectorNumElements());
22062 SDValue ConcatVectors =
22063 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
22064 // Extract the original vector type size.
22065 SDValue ExtractSubVector =
22066 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
22067 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
22068 SDValue TokenFactor =
22069 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
22070 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
22071}
22072
22074 EVT VecVT = Op.getValueType();
22075 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
22076 "Need boolean vector type.");
22077
22078 if (Depth > 3)
22080
22081 // We can get the base type from a vector compare or truncate.
22082 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
22083 return Op.getOperand(0).getValueType();
22084
22085 // If an operand is a bool vector, continue looking.
22087 for (SDValue Operand : Op->op_values()) {
22088 if (Operand.getValueType() != VecVT)
22089 continue;
22090
22091 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
22092 if (!BaseVT.isSimple())
22093 BaseVT = OperandVT;
22094 else if (OperandVT != BaseVT)
22096 }
22097
22098 return BaseVT;
22099}
22100
22101// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
22102// iN, we can use a trick that extracts the i^th bit from the i^th element and
22103// then performs a vector add to get a scalar bitmask. This requires that each
22104// element's bits are either all 1 or all 0.
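// Worked example (lane values chosen arbitrarily): for a v4i32 comparison
// result with lanes {-1, 0, -1, 0}, AND with the mask {1, 2, 4, 8} gives
// {1, 0, 4, 0}, and the vector add-reduction yields 5 == 0b0101, a scalar
// bitmask whose bit i is set exactly when lane i was true.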
22106 SDLoc DL(N);
22107 SDValue ComparisonResult(N, 0);
22108 EVT VecVT = ComparisonResult.getValueType();
22109 assert(VecVT.isVector() && "Must be a vector type");
22110
22111 unsigned NumElts = VecVT.getVectorNumElements();
22112 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
22113 return SDValue();
22114
22115 if (VecVT.getVectorElementType() != MVT::i1 &&
22116 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
22117 return SDValue();
22118
22119 // If we can find the original types to work on instead of a vector of i1,
22120 // we can avoid extend/extract conversion instructions.
22121 if (VecVT.getVectorElementType() == MVT::i1) {
22122 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
22123 if (!VecVT.isSimple()) {
22124 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
22125 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
22126 }
22127 }
22128 VecVT = VecVT.changeVectorElementTypeToInteger();
22129
22130 // Large vectors don't map directly to this conversion, so to avoid too many
22131 // edge cases, we don't apply it here. The conversion will likely still be
22132 // applied later via multiple smaller vectors, whose results are concatenated.
22133 if (VecVT.getSizeInBits() > 128)
22134 return SDValue();
22135
22136 // Ensure that all elements' bits are either 0s or 1s.
22137 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
22138
22139 SmallVector<SDValue, 16> MaskConstants;
22140 if (VecVT == MVT::v16i8) {
22141 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
22142 // per entry. We split it into two halves, apply the mask, zip the halves to
22143    // create 8x 16-bit values, and then perform the vector reduce.
22144 for (unsigned Half = 0; Half < 2; ++Half) {
22145 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
22146 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
22147 }
22148 }
22149 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22150 SDValue RepresentativeBits =
22151 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22152
22153 SDValue UpperRepresentativeBits =
22154 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
22155 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
22156 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
22157 RepresentativeBits, UpperRepresentativeBits);
22158 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
22159 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
22160 }
22161
22162 // All other vector sizes.
22163 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
22164 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
22165 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
22166 }
22167
22168 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22169 SDValue RepresentativeBits =
22170 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22171 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
22172 NumElts, VecVT.getVectorElementType().getSizeInBits()));
22173 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
22174}
22175
22177 StoreSDNode *Store) {
22178 if (!Store->isTruncatingStore())
22179 return SDValue();
22180
22181 SDLoc DL(Store);
22182 SDValue VecOp = Store->getValue();
22183 EVT VT = VecOp.getValueType();
22184 EVT MemVT = Store->getMemoryVT();
22185
22186 if (!MemVT.isVector() || !VT.isVector() ||
22187 MemVT.getVectorElementType() != MVT::i1)
22188 return SDValue();
22189
22190 // If we are storing a vector that we are currently building, let
22191 // `scalarizeVectorStore()` handle this more efficiently.
22192 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
22193 return SDValue();
22194
22195 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
22196 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
22197 if (!VectorBits)
22198 return SDValue();
22199
22200 EVT StoreVT =
22202 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
22203 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
22204 Store->getMemOperand());
22205}
22206
22208 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
22209 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
22210 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
22211}
22212
22213// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
22215 const AArch64Subtarget *Subtarget) {
22216 SDValue Value = ST->getValue();
22217 EVT ValueVT = Value.getValueType();
22218
22219 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
22220 Value.getOpcode() != ISD::TRUNCATE ||
22221 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
22222 return SDValue();
22223
22224 assert(ST->getOffset().isUndef() && "undef offset expected");
22225 SDLoc DL(ST);
22226 auto WideVT = EVT::getVectorVT(
22227 *DAG.getContext(),
22228 Value->getOperand(0).getValueType().getVectorElementType(), 4);
22229 SDValue UndefVector = DAG.getUNDEF(WideVT);
22230 SDValue WideTrunc = DAG.getNode(
22231 ISD::INSERT_SUBVECTOR, DL, WideVT,
22232 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
22233 SDValue Cast = DAG.getNode(
22234 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
22235 WideTrunc);
22236
22238 SDValue Chain = ST->getChain();
22239 MachineMemOperand *MMO = ST->getMemOperand();
22240 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
22241 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22242 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
22243 TypeSize Offset2 = TypeSize::getFixed(2);
22244 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
22245 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
22246
22247 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22248 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
22249 TypeSize Offset1 = TypeSize::getFixed(1);
22250 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
22251 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
22252
22253 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22254 DAG.getConstant(0, DL, MVT::i64));
22255 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
22256 MF.getMachineMemOperand(MMO, 0, 1));
22257 return Chain;
22258}
22259
22262 SelectionDAG &DAG,
22263 const AArch64Subtarget *Subtarget) {
22264 StoreSDNode *ST = cast<StoreSDNode>(N);
22265 SDValue Chain = ST->getChain();
22266 SDValue Value = ST->getValue();
22267 SDValue Ptr = ST->getBasePtr();
22268 EVT ValueVT = Value.getValueType();
22269
22270 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
22271 EVT EltVT = VT.getVectorElementType();
22272 return EltVT == MVT::f32 || EltVT == MVT::f64;
22273 };
22274
22275 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
22276 return Res;
22277
22278 // If this is an FP_ROUND followed by a store, fold this into a truncating
22279 // store. We can do this even if this is already a truncstore.
22280 // We purposefully don't care about legality of the nodes here as we know
22281 // they can be split down into something legal.
22282 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
22283 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
22284 Subtarget->useSVEForFixedLengthVectors() &&
22285 ValueVT.isFixedLengthVector() &&
22286 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
22287 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
22288 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
22289 ST->getMemoryVT(), ST->getMemOperand());
22290
22291 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
22292 return Split;
22293
22294 if (Subtarget->supportsAddressTopByteIgnored() &&
22295 performTBISimplification(N->getOperand(2), DCI, DAG))
22296 return SDValue(N, 0);
22297
22298 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
22299 return Store;
22300
22301 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
22302 return Store;
22303
22304 if (ST->isTruncatingStore()) {
22305 EVT StoreVT = ST->getMemoryVT();
22306 if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
22307 return SDValue();
22308 if (SDValue Rshrnb =
22309 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
22310 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
22311 StoreVT, ST->getMemOperand());
22312 }
22313 }
22314
22315 return SDValue();
22316}
22317
22320 SelectionDAG &DAG,
22321 const AArch64Subtarget *Subtarget) {
22322 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
22323 SDValue Value = MST->getValue();
22324 SDValue Mask = MST->getMask();
22325 SDLoc DL(N);
22326
22327 // If this is a UZP1 followed by a masked store, fold this into a masked
22328 // truncating store. We can do this even if this is already a masked
22329 // truncstore.
22330 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
22331 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22332 Value.getValueType().isInteger()) {
22333 Value = Value.getOperand(0);
22334 if (Value.getOpcode() == ISD::BITCAST) {
22335 EVT HalfVT =
22336 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
22337 EVT InVT = Value.getOperand(0).getValueType();
22338
22339 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
22340 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22341 unsigned PgPattern = Mask->getConstantOperandVal(0);
22342
22343 // Ensure we can double the size of the predicate pattern
22344 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
22345 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
22346 MinSVESize) {
22347 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
22348 PgPattern);
22349 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
22350 MST->getBasePtr(), MST->getOffset(), Mask,
22351 MST->getMemoryVT(), MST->getMemOperand(),
22352 MST->getAddressingMode(),
22353 /*IsTruncating=*/true);
22354 }
22355 }
22356 }
22357 }
22358
22359 if (MST->isTruncatingStore()) {
22360 EVT ValueVT = Value->getValueType(0);
22361 EVT MemVT = MST->getMemoryVT();
22362 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
22363 return SDValue();
22364 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
22365 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
22366 MST->getOffset(), MST->getMask(),
22367 MST->getMemoryVT(), MST->getMemOperand(),
22368 MST->getAddressingMode(), true);
22369 }
22370 }
22371
22372 return SDValue();
22373}
22374
22375/// \return true if part of the index was folded into the Base.
22376static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
22377 SDLoc DL, SelectionDAG &DAG) {
22378 // This function assumes a vector of i64 indices.
22379 EVT IndexVT = Index.getValueType();
22380 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
22381 return false;
22382
22383 // Simplify:
22384 // BasePtr = Ptr
22385 // Index = X + splat(Offset)
22386 // ->
22387 // BasePtr = Ptr + Offset * scale.
22388 // Index = X
22389 if (Index.getOpcode() == ISD::ADD) {
22390 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
22391 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22392 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22393 Index = Index.getOperand(0);
22394 return true;
22395 }
22396 }
22397
22398 // Simplify:
22399 // BasePtr = Ptr
22400 // Index = (X + splat(Offset)) << splat(Shift)
22401 // ->
22402  // BasePtr = Ptr + (Offset << Shift) * scale
22403  // Index = X << splat(Shift)
22404 if (Index.getOpcode() == ISD::SHL &&
22405 Index.getOperand(0).getOpcode() == ISD::ADD) {
22406 SDValue Add = Index.getOperand(0);
22407 SDValue ShiftOp = Index.getOperand(1);
22408 SDValue OffsetOp = Add.getOperand(1);
22409 if (auto Shift = DAG.getSplatValue(ShiftOp))
22410 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
22411 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
22412 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22413 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22414 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
22415 Add.getOperand(0), ShiftOp);
22416 return true;
22417 }
22418 }
22419
22420 return false;
22421}
22422
22423// Analyse the specified address returning true if a more optimal addressing
22424// mode is available. When returning true all parameters are updated to reflect
22425// their recommended values.
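// Simple illustration (constants are only an example): for a gather with
//   BasePtr = p, Scale = 4, Index = X + splat(16)
// the splatted offset can be folded into the base to give
//   BasePtr = p + 64, Index = X
// which shortens the index computation and may then allow the index to be
// shrunk to 32 bits.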
22427 SDValue &BasePtr, SDValue &Index,
22428 SelectionDAG &DAG) {
22429 // Try to iteratively fold parts of the index into the base pointer to
22430 // simplify the index as much as possible.
22431 bool Changed = false;
22432 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
22433 Changed = true;
22434
22435 // Only consider element types that are pointer sized as smaller types can
22436 // be easily promoted.
22437 EVT IndexVT = Index.getValueType();
22438 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
22439 return Changed;
22440
22441 // Can indices be trivially shrunk?
22442 EVT DataVT = N->getOperand(1).getValueType();
22443 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it
22444  // will later be re-extended to 64 bits in legalization.
22445 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
22446 return Changed;
22447 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
22448 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22449 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
22450 return true;
22451 }
22452
22453 // Match:
22454 // Index = step(const)
22455 int64_t Stride = 0;
22456 if (Index.getOpcode() == ISD::STEP_VECTOR) {
22457 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
22458 }
22459 // Match:
22460 // Index = step(const) << shift(const)
22461 else if (Index.getOpcode() == ISD::SHL &&
22462 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
22463 SDValue RHS = Index.getOperand(1);
22464 if (auto *Shift =
22465 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
22466 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
22467 Stride = Step << Shift->getZExtValue();
22468 }
22469 }
22470
22471 // Return early because no supported pattern is found.
22472 if (Stride == 0)
22473 return Changed;
22474
22475 if (Stride < std::numeric_limits<int32_t>::min() ||
22476 Stride > std::numeric_limits<int32_t>::max())
22477 return Changed;
22478
22479 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
22480 unsigned MaxVScale =
22482 int64_t LastElementOffset =
22483 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
22484
22485 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
22486 LastElementOffset > std::numeric_limits<int32_t>::max())
22487 return Changed;
22488
22489 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22490  // Stride is not explicitly scaled by 'Scale' here, because that scaling
22491  // happens as part of the gather/scatter addressing mode.
22492 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride));
22493 return true;
22494}
22495
22498 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
22499 assert(MGS && "Can only combine gather load or scatter store nodes");
22500
22501 if (!DCI.isBeforeLegalize())
22502 return SDValue();
22503
22504 SDLoc DL(MGS);
22505 SDValue Chain = MGS->getChain();
22506 SDValue Scale = MGS->getScale();
22507 SDValue Index = MGS->getIndex();
22508 SDValue Mask = MGS->getMask();
22509 SDValue BasePtr = MGS->getBasePtr();
22510 ISD::MemIndexType IndexType = MGS->getIndexType();
22511
22512 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
22513 return SDValue();
22514
22515 // Here we catch such cases early and change MGATHER's IndexType to allow
22516 // the use of an Index that's more legalisation friendly.
22517 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
22518 SDValue PassThru = MGT->getPassThru();
22519 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
22520 return DAG.getMaskedGather(
22521 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
22522 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
22523 }
22524 auto *MSC = cast<MaskedScatterSDNode>(MGS);
22525 SDValue Data = MSC->getValue();
22526 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
22527 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
22528 Ops, MSC->getMemOperand(), IndexType,
22529 MSC->isTruncatingStore());
22530}
22531
22532/// Target-specific DAG combine function for NEON load/store intrinsics
22533/// to merge base address updates.
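// Illustrative example (registers and types are arbitrary): the pair
//   ld2 { v0.4s, v1.4s }, [x0]
//   add x0, x0, #32
// can be merged into the post-incremented form
//   ld2 { v0.4s, v1.4s }, [x0], #32
// provided the add is independent of the load and the increment matches the
// total size of the accessed vectors.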
22536 SelectionDAG &DAG) {
22537 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
22538 return SDValue();
22539
22540 unsigned AddrOpIdx = N->getNumOperands() - 1;
22541 SDValue Addr = N->getOperand(AddrOpIdx);
22542
22543 // Search for a use of the address operand that is an increment.
22544 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
22545 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
22546 SDNode *User = *UI;
22547 if (User->getOpcode() != ISD::ADD ||
22548 UI.getUse().getResNo() != Addr.getResNo())
22549 continue;
22550
22551 // Check that the add is independent of the load/store. Otherwise, folding
22552 // it would create a cycle.
22555 Visited.insert(Addr.getNode());
22556 Worklist.push_back(N);
22557 Worklist.push_back(User);
22558 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
22559 SDNode::hasPredecessorHelper(User, Visited, Worklist))
22560 continue;
22561
22562 // Find the new opcode for the updating load/store.
22563 bool IsStore = false;
22564 bool IsLaneOp = false;
22565 bool IsDupOp = false;
22566 unsigned NewOpc = 0;
22567 unsigned NumVecs = 0;
22568 unsigned IntNo = N->getConstantOperandVal(1);
22569 switch (IntNo) {
22570 default: llvm_unreachable("unexpected intrinsic for Neon base update");
22571 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
22572 NumVecs = 2; break;
22573 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
22574 NumVecs = 3; break;
22575 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
22576 NumVecs = 4; break;
22577 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
22578 NumVecs = 2; IsStore = true; break;
22579 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
22580 NumVecs = 3; IsStore = true; break;
22581 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
22582 NumVecs = 4; IsStore = true; break;
22583 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
22584 NumVecs = 2; break;
22585 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
22586 NumVecs = 3; break;
22587 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
22588 NumVecs = 4; break;
22589 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
22590 NumVecs = 2; IsStore = true; break;
22591 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
22592 NumVecs = 3; IsStore = true; break;
22593 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
22594 NumVecs = 4; IsStore = true; break;
22595 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
22596 NumVecs = 2; IsDupOp = true; break;
22597 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
22598 NumVecs = 3; IsDupOp = true; break;
22599 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
22600 NumVecs = 4; IsDupOp = true; break;
22601 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
22602 NumVecs = 2; IsLaneOp = true; break;
22603 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
22604 NumVecs = 3; IsLaneOp = true; break;
22605 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
22606 NumVecs = 4; IsLaneOp = true; break;
22607 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
22608 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
22609 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
22610 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
22611 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
22612 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
22613 }
22614
22615 EVT VecTy;
22616 if (IsStore)
22617 VecTy = N->getOperand(2).getValueType();
22618 else
22619 VecTy = N->getValueType(0);
22620
22621 // If the increment is a constant, it must match the memory ref size.
22622 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
22623 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
22624 uint32_t IncVal = CInc->getZExtValue();
22625 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
22626 if (IsLaneOp || IsDupOp)
22627 NumBytes /= VecTy.getVectorNumElements();
22628 if (IncVal != NumBytes)
22629 continue;
22630 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
22631 }
22633 Ops.push_back(N->getOperand(0)); // Incoming chain
22634    // Lane operations and stores take a vector list as input.
22635 if (IsLaneOp || IsStore)
22636 for (unsigned i = 2; i < AddrOpIdx; ++i)
22637 Ops.push_back(N->getOperand(i));
22638 Ops.push_back(Addr); // Base register
22639 Ops.push_back(Inc);
22640
22641 // Return Types.
22642 EVT Tys[6];
22643 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
22644 unsigned n;
22645 for (n = 0; n < NumResultVecs; ++n)
22646 Tys[n] = VecTy;
22647 Tys[n++] = MVT::i64; // Type of write back register
22648 Tys[n] = MVT::Other; // Type of the chain
22649 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
22650
22651 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
22652 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
22653 MemInt->getMemoryVT(),
22654 MemInt->getMemOperand());
22655
22656 // Update the uses.
22657 std::vector<SDValue> NewResults;
22658 for (unsigned i = 0; i < NumResultVecs; ++i) {
22659 NewResults.push_back(SDValue(UpdN.getNode(), i));
22660 }
22661 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
22662 DCI.CombineTo(N, NewResults);
22663 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
22664
22665 break;
22666 }
22667 return SDValue();
22668}
22669
22670// Checks to see if the value is the prescribed width and returns information
22671// about its extension mode.
22672static
22673bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
22674 ExtType = ISD::NON_EXTLOAD;
22675 switch(V.getNode()->getOpcode()) {
22676 default:
22677 return false;
22678 case ISD::LOAD: {
22679 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
22680 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
22681 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
22682 ExtType = LoadNode->getExtensionType();
22683 return true;
22684 }
22685 return false;
22686 }
22687 case ISD::AssertSext: {
22688 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
22689 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22690 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22691 ExtType = ISD::SEXTLOAD;
22692 return true;
22693 }
22694 return false;
22695 }
22696 case ISD::AssertZext: {
22697 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
22698 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22699 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22700 ExtType = ISD::ZEXTLOAD;
22701 return true;
22702 }
22703 return false;
22704 }
22705 case ISD::Constant:
22706 case ISD::TargetConstant: {
22707 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
22708 1LL << (width - 1);
22709 }
22710 }
22711
22712 return true;
22713}
22714
22715// This function does a whole lot of voodoo to determine if the tests are
22716// equivalent without and with a mask. Essentially what happens is that given a
22717// DAG resembling:
22718//
22719// +-------------+ +-------------+ +-------------+ +-------------+
22720// | Input | | AddConstant | | CompConstant| | CC |
22721// +-------------+ +-------------+ +-------------+ +-------------+
22722// | | | |
22723// V V | +----------+
22724// +-------------+ +----+ | |
22725// | ADD | |0xff| | |
22726// +-------------+ +----+ | |
22727// | | | |
22728// V V | |
22729// +-------------+ | |
22730// | AND | | |
22731// +-------------+ | |
22732// | | |
22733// +-----+ | |
22734// | | |
22735// V V V
22736// +-------------+
22737// | CMP |
22738// +-------------+
22739//
22740// The AND node may be safely removed for some combinations of inputs. In
22741// particular we need to take into account the extension type of the Input,
22742// the exact values of AddConstant, CompConstant, and CC, along with the nominal
22743// width of the input (this can work for inputs of any width; the above graph
22744// is specific to 8 bits).
22745//
22746// The specific equations were worked out by generating output tables for each
22747// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
22748// problem was simplified by working with 4 bit inputs, which means we only
22749// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
22750// extension (8..15), 8 patterns unique to sign extension (-8..-1), and 8
22751// patterns present in both extensions (0..7). For every distinct pair of
22752// AddConstant and CompConstant bit patterns we can consider the masked and
22753// unmasked versions to be equivalent if the result of this function is true for
22754// all 16 distinct bit patterns of the current extension type of Input (w0).
22755//
22756// sub w8, w0, w1
22757// and w10, w8, #0x0f
22758// cmp w8, w2
22759// cset w9, AArch64CC
22760// cmp w10, w2
22761// cset w11, AArch64CC
22762// cmp w9, w11
22763// cset w0, eq
22764// ret
22765//
22766// Since the above function shows when the outputs are equivalent it defines
22767// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
22768// would be expensive to run during compiles. The equations below were written
22769// in a test harness that confirmed they gave outputs equivalent to the above
22770// function for all inputs, so they can instead be used to determine if the
22771// removal is legal.
22772//
22773// isEquivalentMaskless() is the code for testing if the AND can be removed
22774// factored out of the DAG recognition as the DAG can take several forms.
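// One illustrative case (not a proof): if Input is an 8-bit zero-extended
// load, its value already lies in [0, 255], so with AddConstant == 0 the mask
// with 0xff cannot change the value being compared, and the masked and
// unmasked sequences agree for every condition code.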
22775
22776static bool isEquivalentMaskless(unsigned CC, unsigned width,
22777 ISD::LoadExtType ExtType, int AddConstant,
22778 int CompConstant) {
22779  // By being careful about our equations and only writing them in terms of
22780  // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
22781 // make them generally applicable to all bit widths.
22782 int MaxUInt = (1 << width);
22783
22784 // For the purposes of these comparisons sign extending the type is
22785 // equivalent to zero extending the add and displacing it by half the integer
22786 // width. Provided we are careful and make sure our equations are valid over
22787 // the whole range we can just adjust the input and avoid writing equations
22788 // for sign extended inputs.
22789 if (ExtType == ISD::SEXTLOAD)
22790 AddConstant -= (1 << (width-1));
22791
22792 switch(CC) {
22793 case AArch64CC::LE:
22794 case AArch64CC::GT:
22795 if ((AddConstant == 0) ||
22796 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
22797 (AddConstant >= 0 && CompConstant < 0) ||
22798 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
22799 return true;
22800 break;
22801 case AArch64CC::LT:
22802 case AArch64CC::GE:
22803 if ((AddConstant == 0) ||
22804 (AddConstant >= 0 && CompConstant <= 0) ||
22805 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
22806 return true;
22807 break;
22808 case AArch64CC::HI:
22809 case AArch64CC::LS:
22810 if ((AddConstant >= 0 && CompConstant < 0) ||
22811 (AddConstant <= 0 && CompConstant >= -1 &&
22812 CompConstant < AddConstant + MaxUInt))
22813 return true;
22814 break;
22815 case AArch64CC::PL:
22816 case AArch64CC::MI:
22817 if ((AddConstant == 0) ||
22818 (AddConstant > 0 && CompConstant <= 0) ||
22819 (AddConstant < 0 && CompConstant <= AddConstant))
22820 return true;
22821 break;
22822 case AArch64CC::LO:
22823 case AArch64CC::HS:
22824 if ((AddConstant >= 0 && CompConstant <= 0) ||
22825 (AddConstant <= 0 && CompConstant >= 0 &&
22826 CompConstant <= AddConstant + MaxUInt))
22827 return true;
22828 break;
22829 case AArch64CC::EQ:
22830 case AArch64CC::NE:
22831 if ((AddConstant > 0 && CompConstant < 0) ||
22832 (AddConstant < 0 && CompConstant >= 0 &&
22833 CompConstant < AddConstant + MaxUInt) ||
22834 (AddConstant >= 0 && CompConstant >= 0 &&
22835 CompConstant >= AddConstant) ||
22836 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
22837 return true;
22838 break;
22839 case AArch64CC::VS:
22840 case AArch64CC::VC:
22841 case AArch64CC::AL:
22842 case AArch64CC::NV:
22843 return true;
22844 case AArch64CC::Invalid:
22845 break;
22846 }
22847
22848 return false;
22849}
22850
22851// (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
22852// (X & C) <u Pow2 --> ((X & (C & ~(Pow2-1))) == 0)
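// For example, with a 32-bit X:
//   (X & 0xf0) >u 0x0f  -->  ((X & 0xf0) != 0)   since 0xf0 & ~0x0f == 0xf0
//   (X & 0xff) <u 0x10  -->  ((X & 0xf0) == 0)   since 0xff & ~0x0f == 0xf0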
22854 SDNode *AndNode, SelectionDAG &DAG,
22855 unsigned CCIndex, unsigned CmpIndex,
22856 unsigned CC) {
22857 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
22858 if (!SubsC)
22859 return SDValue();
22860
22861 APInt SubsAP = SubsC->getAPIntValue();
22862 if (CC == AArch64CC::HI) {
22863 if (!SubsAP.isMask())
22864 return SDValue();
22865 } else if (CC == AArch64CC::LO) {
22866 if (!SubsAP.isPowerOf2())
22867 return SDValue();
22868 } else
22869 return SDValue();
22870
22871 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
22872 if (!AndC)
22873 return SDValue();
22874
22875 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
22876
22877 SDLoc DL(N);
22878 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
22879 SDValue ANDS = DAG.getNode(
22880 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
22881 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
22882 SDValue AArch64_CC =
22884 N->getOperand(CCIndex)->getValueType(0));
22885
22886 // For now, only performCSELCombine and performBRCONDCombine call this
22887 // function, and both of them pass 2 for CCIndex and 3 for CmpIndex on nodes
22888 // with 4 operands. So just initialize the operands directly to keep the code
22889 // simple. If a caller with different CCIndex/CmpIndex values is ever added,
22890 // this will need to build the operand list in a loop instead.
22891 // TODO: Do we need to assert that the number of operands is 4 here?
22892 assert((CCIndex == 2 && CmpIndex == 3) &&
22893 "Expected CCIndex to be 2 and CmpIndex to be 3.");
22894 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
22895 ANDS.getValue(1)};
22896 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
22897}
22898
22899static
22902 SelectionDAG &DAG, unsigned CCIndex,
22903 unsigned CmpIndex) {
22904 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
22905 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
22906 unsigned CondOpcode = SubsNode->getOpcode();
22907
22908 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
22909 !SubsNode->hasOneUse())
22910 return SDValue();
22911
22912 // There is a SUBS feeding this condition. Is it fed by a mask we can
22913 // use?
22914
22915 SDNode *AndNode = SubsNode->getOperand(0).getNode();
22916 unsigned MaskBits = 0;
22917
22918 if (AndNode->getOpcode() != ISD::AND)
22919 return SDValue();
22920
22921 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
22922 CmpIndex, CC))
22923 return Val;
22924
22925 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
22926 uint32_t CNV = CN->getZExtValue();
22927 if (CNV == 255)
22928 MaskBits = 8;
22929 else if (CNV == 65535)
22930 MaskBits = 16;
22931 }
22932
22933 if (!MaskBits)
22934 return SDValue();
22935
22936 SDValue AddValue = AndNode->getOperand(0);
22937
22938 if (AddValue.getOpcode() != ISD::ADD)
22939 return SDValue();
22940
22941 // The basic DAG structure is correct; grab the inputs and validate them.
22942
22943 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
22944 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
22945 SDValue SubsInputValue = SubsNode->getOperand(1);
22946
22947 // The mask is present and the provenance of all the values is a smaller type,
22948 // so let's see if the mask is superfluous.
22949
22950 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
22951 !isa<ConstantSDNode>(SubsInputValue.getNode()))
22952 return SDValue();
22953
22954 ISD::LoadExtType ExtType;
22955
22956 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
22957 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
22958 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
22959 return SDValue();
22960
22961 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
22962 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
22963 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
22964 return SDValue();
22965
22966 // The AND is not necessary, remove it.
22967
22968 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
22969 SubsNode->getValueType(1));
22970 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
22971
22972 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
22973 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
22974
22975 return SDValue(N, 0);
22976}
22977
22978// Optimize compare with zero and branch.
22981 SelectionDAG &DAG) {
22983 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
22984 // will not be produced, as they are conditional branch instructions that do
22985 // not set flags.
22986 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
22987 return SDValue();
22988
22989 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
22990 N = NV.getNode();
22991 SDValue Chain = N->getOperand(0);
22992 SDValue Dest = N->getOperand(1);
22993 SDValue CCVal = N->getOperand(2);
22994 SDValue Cmp = N->getOperand(3);
22995
22996 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
22997 unsigned CC = CCVal->getAsZExtVal();
22998 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
22999 return SDValue();
23000
23001 unsigned CmpOpc = Cmp.getOpcode();
23002 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
23003 return SDValue();
23004
23005 // Only attempt folding if there is only one use of the flag and no use of the
23006 // value.
23007 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
23008 return SDValue();
23009
23010 SDValue LHS = Cmp.getOperand(0);
23011 SDValue RHS = Cmp.getOperand(1);
23012
23013 assert(LHS.getValueType() == RHS.getValueType() &&
23014 "Expected the value type to be the same for both operands!");
23015 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
23016 return SDValue();
23017
23018 if (isNullConstant(LHS))
23019 std::swap(LHS, RHS);
23020
23021 if (!isNullConstant(RHS))
23022 return SDValue();
23023
23024 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
23025 LHS.getOpcode() == ISD::SRL)
23026 return SDValue();
23027
23028 // Fold the compare into the branch instruction.
23029 SDValue BR;
23030 if (CC == AArch64CC::EQ)
23031 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
23032 else
23033 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
23034
23035 // Do not add new nodes to DAG combiner worklist.
23036 DCI.CombineTo(N, BR, false);
23037
23038 return SDValue();
23039}
23040
23042 unsigned CC = N->getConstantOperandVal(2);
23043 SDValue SUBS = N->getOperand(3);
23044 SDValue Zero, CTTZ;
23045
23046 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
23047 Zero = N->getOperand(0);
23048 CTTZ = N->getOperand(1);
23049 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
23050 Zero = N->getOperand(1);
23051 CTTZ = N->getOperand(0);
23052 } else
23053 return SDValue();
23054
23055 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
23056 (CTTZ.getOpcode() == ISD::TRUNCATE &&
23057 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
23058 return SDValue();
23059
23060 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
23061 "Illegal type in CTTZ folding");
23062
23063 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
23064 return SDValue();
23065
23066 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
23067 ? CTTZ.getOperand(0).getOperand(0)
23068 : CTTZ.getOperand(0);
23069
23070 if (X != SUBS.getOperand(0))
23071 return SDValue();
23072
23073 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
23074 ? CTTZ.getOperand(0).getValueSizeInBits()
23075 : CTTZ.getValueSizeInBits();
23076 SDValue BitWidthMinusOne =
23077 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
23078 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
23079 BitWidthMinusOne);
23080}
23081
23082// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
23083// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
23084// Where x and y are constants and x != y
23085
23086// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
23087// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
23088// Where x and y are constants and x != y
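//
// For example, with x == 1, y == 0 and cc2 == lt:
//   (CSEL l r EQ (CMP (CSEL 1 0 lt cond) 1)) => (CSEL l r lt cond)
//   (CSEL l r EQ (CMP (CSEL 1 0 lt cond) 0)) => (CSEL l r ge cond)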
23090 SDValue L = Op->getOperand(0);
23091 SDValue R = Op->getOperand(1);
23092 AArch64CC::CondCode OpCC =
23093 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
23094
23095 SDValue OpCmp = Op->getOperand(3);
23096 if (!isCMP(OpCmp))
23097 return SDValue();
23098
23099 SDValue CmpLHS = OpCmp.getOperand(0);
23100 SDValue CmpRHS = OpCmp.getOperand(1);
23101
23102 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
23103 std::swap(CmpLHS, CmpRHS);
23104 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
23105 return SDValue();
23106
23107 SDValue X = CmpLHS->getOperand(0);
23108 SDValue Y = CmpLHS->getOperand(1);
23109 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
23110 return SDValue();
23111 }
23112
23113 // If one of the constants is an opaque constant, the x and y SDNodes are
23114 // still different but the real values may be the same. So check the APInt
23115 // values here to make sure the code is correct.
23116 ConstantSDNode *CX = cast<ConstantSDNode>(X);
23117 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
23118 if (CX->getAPIntValue() == CY->getAPIntValue())
23119 return SDValue();
23120
23122 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
23123 SDValue Cond = CmpLHS->getOperand(3);
23124
23125 if (CmpRHS == Y)
23127 else if (CmpRHS != X)
23128 return SDValue();
23129
23130 if (OpCC == AArch64CC::NE)
23132 else if (OpCC != AArch64CC::EQ)
23133 return SDValue();
23134
23135 SDLoc DL(Op);
23136 EVT VT = Op->getValueType(0);
23137
23138 SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
23139 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
23140}
23141
23142// Optimize CSEL instructions
23145 SelectionDAG &DAG) {
23146 // CSEL x, x, cc -> x
23147 if (N->getOperand(0) == N->getOperand(1))
23148 return N->getOperand(0);
23149
23150 if (SDValue R = foldCSELOfCSEL(N, DAG))
23151 return R;
23152
23153 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
23154 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
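  // This is sound because the AArch64 lowering of cttz (rbit + clz) produces
  // the full bitwidth for a zero input, and bitwidth & (bitwidth - 1) == 0,
  // so the masked cttz already yields 0 when X == 0 and is unchanged otherwise.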
23155 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
23156 return Folded;
23157
23158 return performCONDCombine(N, DCI, DAG, 2, 3);
23159}
23160
23161// Try to re-use an already extended operand of a vector SetCC feeding an
23162// extended select. Doing so avoids requiring another full extension of the
23163// SET_CC result when lowering the select.
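//
// For example (types shown loosely), given
//   t1: v8i16 = sign_extend t0            ; t0: v8i8
//   t2: setcc t0, splat(C), setlt
//   t3: v8i16 = vselect t2, ta, tb
// the setcc is rewritten to compare the already-extended operands:
//   t2': setcc t1, sign_extend(splat(C)), setlt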
23165 EVT Op0MVT = Op->getOperand(0).getValueType();
23166 if (!Op0MVT.isVector() || Op->use_empty())
23167 return SDValue();
23168
23169 // Make sure that all uses of Op are VSELECTs with result matching types where
23170 // the result type has a larger element type than the SetCC operand.
23171 SDNode *FirstUse = *Op->use_begin();
23172 if (FirstUse->getOpcode() != ISD::VSELECT)
23173 return SDValue();
23174 EVT UseMVT = FirstUse->getValueType(0);
23175 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
23176 return SDValue();
23177 if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
23178 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
23179 }))
23180 return SDValue();
23181
23182 APInt V;
23183 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
23184 return SDValue();
23185
23186 SDLoc DL(Op);
23187 SDValue Op0ExtV;
23188 SDValue Op1ExtV;
23189 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
23190 // Check if the first operand of the SET_CC is already extended. If it is,
23191 // split the SET_CC and re-use the extended version of the operand.
23192 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
23193 Op->getOperand(0));
23194 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
23195 Op->getOperand(0));
23196 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23197 Op0ExtV = SDValue(Op0SExt, 0);
23198 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
23199 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23200 Op0ExtV = SDValue(Op0ZExt, 0);
23201 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
23202 } else
23203 return SDValue();
23204
23205 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
23206 Op0ExtV, Op1ExtV, Op->getOperand(2));
23207}
23208
23209static SDValue
23211 SelectionDAG &DAG) {
23212 SDValue Vec = N->getOperand(0);
23213 if (DCI.isBeforeLegalize() &&
23214 Vec.getValueType().getVectorElementType() == MVT::i1 &&
23217 SDLoc DL(N);
23218 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
23219 DAG);
23220 }
23221
23222 return SDValue();
23223}
23224
23227 SelectionDAG &DAG) {
23228 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
23229 SDValue LHS = N->getOperand(0);
23230 SDValue RHS = N->getOperand(1);
23231 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
23232 SDLoc DL(N);
23233 EVT VT = N->getValueType(0);
23234
23235 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
23236 return V;
23237
23238 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
23239 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
23240 LHS->getOpcode() == AArch64ISD::CSEL &&
23241 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
23242 LHS->hasOneUse()) {
23243 // Invert CSEL's condition.
23244 auto OldCond =
23245 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
23246 auto NewCond = getInvertedCondCode(OldCond);
23247
23248 // csel 0, 1, !cond, X
23249 SDValue CSEL =
23250 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
23251 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
23252 LHS.getOperand(3));
23253 return DAG.getZExtOrTrunc(CSEL, DL, VT);
23254 }
23255
23256 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
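  // For example, for an i32 x:
  //   setcc (srl x, 4), 0, ne ==> setcc (and x, 0xfffffff0), 0, ne
  // which emitComparison can then select as a single TST (ANDS) instruction.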
23257 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
23258 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
23259 LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() &&
23260 LHS->hasOneUse()) {
23261 EVT TstVT = LHS->getValueType(0);
23262 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
23263 // This pattern will be optimized better in emitComparison.
23264 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
23265 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
23266 DAG.getConstant(TstImm, DL, TstVT));
23267 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
23268 }
23269 }
23270
23271 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
23272 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
23273 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
23274 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
23275 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
23276 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
23278 LHS->getOpcode() == ISD::BITCAST) {
23279 EVT ToVT = LHS->getValueType(0);
23280 EVT FromVT = LHS->getOperand(0).getValueType();
23281 if (FromVT.isFixedLengthVector() &&
23282 FromVT.getVectorElementType() == MVT::i1) {
23283 bool IsNull = isNullConstant(RHS);
23285 DL, MVT::i1, LHS->getOperand(0));
23286 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
23287 LHS);
23288 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
23289 }
23290 }
23291
23292 // Try to perform the memcmp when the result is tested for [in]equality with 0
23293 if (SDValue V = performOrXorChainCombine(N, DAG))
23294 return V;
23295
23296 return SDValue();
23297}
23298
23299// Replace a flag-setting operator (eg ANDS) with the generic version
23300// (eg AND) if the flag is unused.
23303 unsigned GenericOpcode) {
23304 SDLoc DL(N);
23305 SDValue LHS = N->getOperand(0);
23306 SDValue RHS = N->getOperand(1);
23307 EVT VT = N->getValueType(0);
23308
23309 // If the flag result isn't used, convert back to a generic opcode.
23310 if (!N->hasAnyUseOfValue(1)) {
23311 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
23312 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
23313 DL);
23314 }
23315
23316 // Combine identical generic nodes into this node, re-using the result.
23317 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
23318 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
23319 DCI.CombineTo(Generic, SDValue(N, 0));
23320
23321 return SDValue();
23322}
23323
23325 // setcc_merge_zero pred
23326 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
23327 // => extract_subvector (inner setcc_merge_zero)
23328 SDValue Pred = N->getOperand(0);
23329 SDValue LHS = N->getOperand(1);
23330 SDValue RHS = N->getOperand(2);
23331 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23332
23333 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
23334 LHS->getOpcode() != ISD::SIGN_EXTEND)
23335 return SDValue();
23336
23337 SDValue Extract = LHS->getOperand(0);
23338 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
23339 Extract->getValueType(0) != N->getValueType(0) ||
23340 Extract->getConstantOperandVal(1) != 0)
23341 return SDValue();
23342
23343 SDValue InnerSetCC = Extract->getOperand(0);
23344 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
23345 return SDValue();
23346
23347 // By this point we've effectively got
23348 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
23349 // lanes are already zero then the trunc(sext()) sequence is redundant and we
23350 // can operate on A directly.
23351 SDValue InnerPred = InnerSetCC.getOperand(0);
23352 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
23353 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
23354 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
23355 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
23356 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
23357 return Extract;
23358
23359 return SDValue();
23360}
23361
23362static SDValue
23364 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23365 "Unexpected opcode!");
23366
23367 SelectionDAG &DAG = DCI.DAG;
23368 SDValue Pred = N->getOperand(0);
23369 SDValue LHS = N->getOperand(1);
23370 SDValue RHS = N->getOperand(2);
23371 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23372
23373 if (SDValue V = performSetCCPunpkCombine(N, DAG))
23374 return V;
23375
23376 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
23377 LHS->getOpcode() == ISD::SIGN_EXTEND &&
23378 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
23379 // setcc_merge_zero(
23380 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
23381 // => setcc_merge_zero(pred, ...)
23382 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23383 LHS->getOperand(0)->getOperand(0) == Pred)
23384 return LHS->getOperand(0);
23385
23386 // setcc_merge_zero(
23387 // all_active, extend(nxvNi1 ...), != splat(0))
23388 // -> nxvNi1 ...
23389 if (isAllActivePredicate(DAG, Pred))
23390 return LHS->getOperand(0);
23391
23392 // setcc_merge_zero(
23393 // pred, extend(nxvNi1 ...), != splat(0))
23394 // -> nxvNi1 and(pred, ...)
23395 if (DCI.isAfterLegalizeDAG())
23396 // Do this after legalization to allow more folds on setcc_merge_zero
23397 // to be recognized.
23398 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
23399 LHS->getOperand(0), Pred);
23400 }
23401
23402 return SDValue();
23403}
23404
23405// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
23406// as well as whether the test should be inverted. This code is required to
23407// catch these cases (as opposed to standard dag combines) because
23408// AArch64ISD::TBZ is matched during legalization.
23409static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
23410 SelectionDAG &DAG) {
23411
23412 if (!Op->hasOneUse())
23413 return Op;
23414
23415 // We don't handle undef/constant-fold cases below, as they should have
23416 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
23417 // etc.)
23418
23419 // (tbz (trunc x), b) -> (tbz x, b)
23420 // This case is just here to enable more of the below cases to be caught.
23421 if (Op->getOpcode() == ISD::TRUNCATE &&
23422 Bit < Op->getValueType(0).getSizeInBits()) {
23423 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23424 }
23425
23426 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
23427 if (Op->getOpcode() == ISD::ANY_EXTEND &&
23428 Bit < Op->getOperand(0).getValueSizeInBits()) {
23429 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23430 }
23431
23432 if (Op->getNumOperands() != 2)
23433 return Op;
23434
23435 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
23436 if (!C)
23437 return Op;
23438
23439 switch (Op->getOpcode()) {
23440 default:
23441 return Op;
23442
23443 // (tbz (and x, m), b) -> (tbz x, b)
23444 case ISD::AND:
23445 if ((C->getZExtValue() >> Bit) & 1)
23446 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23447 return Op;
23448
23449 // (tbz (shl x, c), b) -> (tbz x, b-c)
23450 case ISD::SHL:
23451 if (C->getZExtValue() <= Bit &&
23452 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23453 Bit = Bit - C->getZExtValue();
23454 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23455 }
23456 return Op;
23457
23458 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
23459 case ISD::SRA:
23460 Bit = Bit + C->getZExtValue();
23461 if (Bit >= Op->getValueType(0).getSizeInBits())
23462 Bit = Op->getValueType(0).getSizeInBits() - 1;
23463 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23464
23465 // (tbz (srl x, c), b) -> (tbz x, b+c)
23466 case ISD::SRL:
23467 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23468 Bit = Bit + C->getZExtValue();
23469 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23470 }
23471 return Op;
23472
23473 // (tbz (xor x, -1), b) -> (tbnz x, b)
23474 case ISD::XOR:
23475 if ((C->getZExtValue() >> Bit) & 1)
23476 Invert = !Invert;
23477 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23478 }
23479}
23480
23481// Optimize test single bit zero/non-zero and branch.
23484 SelectionDAG &DAG) {
23485 unsigned Bit = N->getConstantOperandVal(2);
23486 bool Invert = false;
23487 SDValue TestSrc = N->getOperand(1);
23488 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
23489
23490 if (TestSrc == NewTestSrc)
23491 return SDValue();
23492
23493 unsigned NewOpc = N->getOpcode();
23494 if (Invert) {
23495 if (NewOpc == AArch64ISD::TBZ)
23496 NewOpc = AArch64ISD::TBNZ;
23497 else {
23498 assert(NewOpc == AArch64ISD::TBNZ);
23499 NewOpc = AArch64ISD::TBZ;
23500 }
23501 }
23502
23503 SDLoc DL(N);
23504 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
23505 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
23506}
23507
23508// Swap vselect operands where it may allow a predicated operation to achieve
23509// the `sel`.
23510//
23511// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
23512// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
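//
// For example:
//   (vselect (setcc setoge x y) a (fadd a b))
//   => (vselect (setcc setult x y) (fadd a b) a)
// so the fadd can potentially be selected as a predicated FADD that keeps a
// in the inactive lanes.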
23514 auto SelectA = N->getOperand(1);
23515 auto SelectB = N->getOperand(2);
23516 auto NTy = N->getValueType(0);
23517
23518 if (!NTy.isScalableVector())
23519 return SDValue();
23520 SDValue SetCC = N->getOperand(0);
23521 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
23522 return SDValue();
23523
23524 switch (SelectB.getOpcode()) {
23525 default:
23526 return SDValue();
23527 case ISD::FMUL:
23528 case ISD::FSUB:
23529 case ISD::FADD:
23530 break;
23531 }
23532 if (SelectA != SelectB.getOperand(0))
23533 return SDValue();
23534
23535 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
23536 ISD::CondCode InverseCC =
23538 auto InverseSetCC =
23539 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
23540 SetCC.getOperand(1), InverseCC);
23541
23542 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
23543 {InverseSetCC, SelectB, SelectA});
23544}
23545
23546// vselect (v1i1 setcc) ->
23547// vselect (v1iXX setcc) (XX is the size of the compared operand type)
23548// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
23549// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
23550// such VSELECT.
23552 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
23553 return SwapResult;
23554
23555 SDValue N0 = N->getOperand(0);
23556 EVT CCVT = N0.getValueType();
23557
23558 if (isAllActivePredicate(DAG, N0))
23559 return N->getOperand(1);
23560
23561 if (isAllInactivePredicate(N0))
23562 return N->getOperand(2);
23563
23564 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
23565 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
23566 // supported types.
23567 SDValue SetCC = N->getOperand(0);
23568 if (SetCC.getOpcode() == ISD::SETCC &&
23569 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
23570 SDValue CmpLHS = SetCC.getOperand(0);
23571 EVT VT = CmpLHS.getValueType();
23572 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
23573 SDNode *SplatLHS = N->getOperand(1).getNode();
23574 SDNode *SplatRHS = N->getOperand(2).getNode();
23575 APInt SplatLHSVal;
23576 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
23577 VT.isSimple() &&
23578 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
23579 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
23580 VT.getSimpleVT().SimpleTy) &&
23581 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
23582 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
23584 unsigned NumElts = VT.getVectorNumElements();
23586 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
23587 VT.getScalarType()));
23588 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
23589
23590 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
23591 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
23592 return Or;
23593 }
23594 }
23595
23596 EVT CmpVT = N0.getOperand(0).getValueType();
23597 if (N0.getOpcode() != ISD::SETCC ||
23599 CCVT.getVectorElementType() != MVT::i1 ||
23601 return SDValue();
23602
23603 EVT ResVT = N->getValueType(0);
23604 // Only combine when the result type is of the same size as the compared
23605 // operands.
23606 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
23607 return SDValue();
23608
23609 SDValue IfTrue = N->getOperand(1);
23610 SDValue IfFalse = N->getOperand(2);
23611 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
23612 N0.getOperand(0), N0.getOperand(1),
23613 cast<CondCodeSDNode>(N0.getOperand(2))->get());
23614 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
23615 IfTrue, IfFalse);
23616}
23617
23618/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
23619/// the compare-mask instructions rather than going via NZCV, even if LHS and
23620/// RHS are really scalar. This replaces any scalar setcc in the above pattern
23621/// with a vector one followed by a DUP shuffle on the result.
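///
/// For example (types shown loosely), an f32 compare feeding a v4i32 select
/// becomes:
///   t1: v4f32 = scalar_to_vector a
///   t2: v4f32 = scalar_to_vector b
///   t3: v4i32 = setcc t1, t2, setolt             ; compare-mask in lane 0
///   mask      = vector_shuffle t3, t3, <0,0,0,0> ; DUP lane 0
///   result    = select mask, vL, vR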
23624 SelectionDAG &DAG = DCI.DAG;
23625 SDValue N0 = N->getOperand(0);
23626 EVT ResVT = N->getValueType(0);
23627
23628 if (N0.getOpcode() != ISD::SETCC)
23629 return SDValue();
23630
23631 if (ResVT.isScalableVT())
23632 return SDValue();
23633
23634 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
23635 // scalar SetCCResultType. We also don't expect vectors, because we assume
23636 // that selects fed by vector SETCCs are canonicalized to VSELECT.
23637 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
23638 "Scalar-SETCC feeding SELECT has unexpected result type!");
23639
23640 // If NumMaskElts == 0, the comparison is larger than the select result. The
23641 // largest real NEON comparison is 64 bits per lane, which means the result is
23642 // at most 32 bits and an illegal vector. Just bail out for now.
23643 EVT SrcVT = N0.getOperand(0).getValueType();
23644
23645 // Don't try to do this optimization when the setcc itself has i1 operands.
23646 // There are no legal vectors of i1, so this would be pointless. v1f16 is
23647 // ruled out to prevent the creation of setccs that need to be scalarized.
23648 if (SrcVT == MVT::i1 ||
23649 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
23650 return SDValue();
23651
23652 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
23653 if (!ResVT.isVector() || NumMaskElts == 0)
23654 return SDValue();
23655
23656 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
23658
23659 // Also bail out if the vector CCVT isn't the same size as ResVT.
23660 // This can happen if the SETCC operand size doesn't divide the ResVT size
23661 // (e.g., f64 vs v3f32).
23662 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
23663 return SDValue();
23664
23665 // Make sure we didn't create illegal types, if we're not supposed to.
23666 assert(DCI.isBeforeLegalize() ||
23667 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
23668
23669 // First perform a vector comparison, where lane 0 is the one we're interested
23670 // in.
23671 SDLoc DL(N0);
23672 SDValue LHS =
23673 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
23674 SDValue RHS =
23675 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
23676 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
23677
23678 // Now duplicate the comparison mask we want across all other lanes.
23679 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
23680 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
23681 Mask = DAG.getNode(ISD::BITCAST, DL,
23682 ResVT.changeVectorElementTypeToInteger(), Mask);
23683
23684 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
23685}
23686
23689 EVT VT = N->getValueType(0);
23690 SDLoc DL(N);
23691 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
23692 // 128-bit vector version.
23693 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
23695 SmallVector<SDValue> Ops(N->ops());
23696 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
23697 DCI.DAG.getVTList(LVT), Ops)) {
23698 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
23699 DCI.DAG.getConstant(0, DL, MVT::i64));
23700 }
23701 }
23702
23703 if (N->getOpcode() == AArch64ISD::DUP) {
23704 if (DCI.isAfterLegalizeDAG()) {
23705 // If scalar dup's operand is extract_vector_elt, try to combine them into
23706 // duplane. For example,
23707 //
23708 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
23709 // t18: v4i32 = AArch64ISD::DUP t21
23710 // ==>
23711 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
23712 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
23713 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23714 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
23715 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
23716 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
23717 EXTRACT_VEC_ELT.getOperand(1));
23718 }
23719 }
23720 }
23721
23722 return performPostLD1Combine(N, DCI, false);
23723 }
23724
23725 return SDValue();
23726}
23727
23728/// Get rid of unnecessary NVCASTs (that don't change the type).
23730 if (N->getValueType(0) == N->getOperand(0).getValueType())
23731 return N->getOperand(0);
23732 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
23733 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
23734 N->getOperand(0).getOperand(0));
23735
23736 return SDValue();
23737}
23738
23739// If all users of the globaladdr are of the form (globaladdr + constant), find
23740// the smallest constant, fold it into the globaladdr's offset and rewrite the
23741// globaladdr as (globaladdr + constant) - constant.
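// For example, if the only uses are (globaladdr + 4) and (globaladdr + 8), the
// node becomes (globaladdr+4) - 4, after which the uses can fold to
// (globaladdr+4) and (globaladdr+4) + 4, letting the smallest offset be
// materialized as part of the address itself.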
23743 const AArch64Subtarget *Subtarget,
23744 const TargetMachine &TM) {
23745 auto *GN = cast<GlobalAddressSDNode>(N);
23746 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
23748 return SDValue();
23749
23750 uint64_t MinOffset = -1ull;
23751 for (SDNode *N : GN->uses()) {
23752 if (N->getOpcode() != ISD::ADD)
23753 return SDValue();
23754 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
23755 if (!C)
23756 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
23757 if (!C)
23758 return SDValue();
23759 MinOffset = std::min(MinOffset, C->getZExtValue());
23760 }
23761 uint64_t Offset = MinOffset + GN->getOffset();
23762
23763 // Require that the new offset is larger than the existing one. Otherwise, we
23764 // can end up oscillating between two possible DAGs, for example,
23765 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
23766 if (Offset <= uint64_t(GN->getOffset()))
23767 return SDValue();
23768
23769 // Check whether folding this offset is legal. It must not go out of bounds of
23770 // the referenced object to avoid violating the code model, and must be
23771 // smaller than 2^20 because this is the largest offset expressible in all
23772 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
23773 // stores an immediate signed 21 bit offset.)
23774 //
23775 // This check also prevents us from folding negative offsets, which will end
23776 // up being treated in the same way as large positive ones. They could also
23777 // cause code model violations, and aren't really common enough to matter.
23778 if (Offset >= (1 << 20))
23779 return SDValue();
23780
23781 const GlobalValue *GV = GN->getGlobal();
23782 Type *T = GV->getValueType();
23783 if (!T->isSized() ||
23785 return SDValue();
23786
23787 SDLoc DL(GN);
23788 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
23789 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
23790 DAG.getConstant(MinOffset, DL, MVT::i64));
23791}
23792
23794 const AArch64Subtarget *Subtarget) {
23795 SDValue BR = N->getOperand(0);
23796 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
23797 !BR.getValueType().isScalarInteger())
23798 return SDValue();
23799
23800 SDLoc DL(N);
23801 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
23802}
23803
23804// Turns the vector of indices into a vector of byte offsets by scaling Offset
23805// by (BitWidth / 8).
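// For example, with 32-bit elements (BitWidth == 32) each index is shifted
// left by 2, i.e. multiplied by 4.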
23807 SDLoc DL, unsigned BitWidth) {
23808 assert(Offset.getValueType().isScalableVector() &&
23809 "This method is only for scalable vectors of offsets");
23810
23811 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
23812 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
23813
23814 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
23815}
23816
23817/// Check if the value of \p OffsetInBytes can be used as an immediate for
23818/// the gather load/prefetch and scatter store instructions with vector base and
23819/// immediate offset addressing mode:
23820///
23821/// [<Zn>.[S|D]{, #<imm>}]
23822///
23823/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
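/// For example, for 32-bit elements (sizeof(<T>) == 4) the valid immediates
/// are 0, 4, 8, ..., 124.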
23824inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
23825 unsigned ScalarSizeInBytes) {
23826 // The immediate is not a multiple of the scalar size.
23827 if (OffsetInBytes % ScalarSizeInBytes)
23828 return false;
23829
23830 // The immediate is out of range.
23831 if (OffsetInBytes / ScalarSizeInBytes > 31)
23832 return false;
23833
23834 return true;
23835}
23836
23837/// Check if the value of \p Offset represents a valid immediate for the SVE
23838/// gather load/prefetch and scatter store instructions with vector base and
23839/// immediate offset addressing mode:
23840///
23841/// [<Zn>.[S|D]{, #<imm>}]
23842///
23843/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
23845 unsigned ScalarSizeInBytes) {
23846 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
23847 return OffsetConst && isValidImmForSVEVecImmAddrMode(
23848 OffsetConst->getZExtValue(), ScalarSizeInBytes);
23849}
23850
23852 unsigned Opcode,
23853 bool OnlyPackedOffsets = true) {
23854 const SDValue Src = N->getOperand(2);
23855 const EVT SrcVT = Src->getValueType(0);
23856 assert(SrcVT.isScalableVector() &&
23857 "Scatter stores are only possible for SVE vectors");
23858
23859 SDLoc DL(N);
23860 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
23861
23862 // Make sure that source data will fit into an SVE register
23864 return SDValue();
23865
23866 // For FPs, ACLE only supports _packed_ single and double precision types.
23867 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
23868 if (SrcElVT.isFloatingPoint())
23869 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
23870 ((Opcode != AArch64ISD::SST1Q_PRED &&
23871 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
23872 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
23873 return SDValue();
23874
23875 // Depending on the addressing mode, this is either a pointer or a vector of
23876 // pointers (that fits into one register)
23877 SDValue Base = N->getOperand(4);
23878 // Depending on the addressing mode, this is either a single offset or a
23879 // vector of offsets (that fits into one register)
23880 SDValue Offset = N->getOperand(5);
23881
23882 // For "scalar + vector of indices", just scale the indices. This only
23883 // applies to non-temporal scatters because there's no instruction that takes
23884 // indices.
23885 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
23886 Offset =
23888 Opcode = AArch64ISD::SSTNT1_PRED;
23889 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
23890 Offset =
23892 Opcode = AArch64ISD::SST1Q_PRED;
23893 }
23894
23895 // In the case of non-temporal scatter stores there's only one SVE instruction
23896 // per data size: "scalar + vector", i.e.
23897 // * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
23898 // Since we do have intrinsics that allow the arguments to be in a different
23899 // order, we may need to swap them to match the spec.
23900 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
23901 Offset.getValueType().isVector())
23903
23904 // SST1_IMM requires that the offset is an immediate that is:
23905 // * a multiple of #SizeInBytes,
23906 // * in the range [0, 31 x #SizeInBytes],
23907 // where #SizeInBytes is the size in bytes of the stored items. For
23908 // immediates outside that range and non-immediate scalar offsets use SST1 or
23909 // SST1_UXTW instead.
23910 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
23912 SrcVT.getScalarSizeInBits() / 8)) {
23913 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23915 else
23916 Opcode = AArch64ISD::SST1_PRED;
23917
23919 }
23920 }
23921
23922 auto &TLI = DAG.getTargetLoweringInfo();
23923 if (!TLI.isTypeLegal(Base.getValueType()))
23924 return SDValue();
23925
23926 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
23927 // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
23928 // nxv2i64. Legalize accordingly.
23929 if (!OnlyPackedOffsets &&
23930 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23931 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23932
23933 if (!TLI.isTypeLegal(Offset.getValueType()))
23934 return SDValue();
23935
23936 // Source value type that is representable in hardware
23937 EVT HwSrcVt = getSVEContainerType(SrcVT);
23938
23939 // Keep the original type of the input data to store - this is needed to be
23940 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
23941 // FP values we want the integer equivalent, so just use HwSrcVt.
23942 SDValue InputVT = DAG.getValueType(SrcVT);
23943 if (SrcVT.isFloatingPoint())
23944 InputVT = DAG.getValueType(HwSrcVt);
23945
23946 SDVTList VTs = DAG.getVTList(MVT::Other);
23947 SDValue SrcNew;
23948
23949 if (Src.getValueType().isFloatingPoint())
23950 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
23951 else
23952 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
23953
23954 SDValue Ops[] = {N->getOperand(0), // Chain
23955 SrcNew,
23956 N->getOperand(3), // Pg
23957 Base,
23958 Offset,
23959 InputVT};
23960
23961 return DAG.getNode(Opcode, DL, VTs, Ops);
23962}
23963
23965 unsigned Opcode,
23966 bool OnlyPackedOffsets = true) {
23967 const EVT RetVT = N->getValueType(0);
23968 assert(RetVT.isScalableVector() &&
23969 "Gather loads are only possible for SVE vectors");
23970
23971 SDLoc DL(N);
23972
23973 // Make sure that the loaded data will fit into an SVE register
23975 return SDValue();
23976
23977 // Depending on the addressing mode, this is either a pointer or a vector of
23978 // pointers (that fits into one register)
23979 SDValue Base = N->getOperand(3);
23980 // Depending on the addressing mode, this is either a single offset or a
23981 // vector of offsets (that fits into one register)
23982 SDValue Offset = N->getOperand(4);
23983
23984 // For "scalar + vector of indices", scale the indices to obtain unscaled
23985 // offsets. This applies to non-temporal and quadword gathers, which do not
23986 // have an addressing mode with scaled offset.
23989 RetVT.getScalarSizeInBits());
23991 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
23993 RetVT.getScalarSizeInBits());
23995 }
23996
23997 // In the case of non-temporal gather loads and quadword gather loads there's
23998 // only one addressing mode: "vector + scalar", e.g.
23999 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
24000 // Since we do have intrinsics that allow the arguments to be in a different
24001 // order, we may need to swap them to match the spec.
24002 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
24003 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
24004 Offset.getValueType().isVector())
24006
24007 // GLD{FF}1_IMM requires that the offset is an immediate that is:
24008 // * a multiple of #SizeInBytes,
24009 // * in the range [0, 31 x #SizeInBytes],
24010 // where #SizeInBytes is the size in bytes of the loaded items. For
24011 // immediates outside that range and non-immediate scalar offsets use
24012 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
24013 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
24016 RetVT.getScalarSizeInBits() / 8)) {
24017 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
24018 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
24021 else
24022 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
24025
24027 }
24028 }
24029
24030 auto &TLI = DAG.getTargetLoweringInfo();
24031 if (!TLI.isTypeLegal(Base.getValueType()))
24032 return SDValue();
24033
24034 // Some gather load variants allow unpacked offsets, but only as nxv2i32
24035 // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
24036 // nxv2i64. Legalize accordingly.
24037 if (!OnlyPackedOffsets &&
24038 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
24039 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
24040
24041 // Return value type that is representable in hardware
24042 EVT HwRetVt = getSVEContainerType(RetVT);
24043
24044 // Keep the original output value type around - this is needed to be able to
24045 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
24046 // values we want the integer equivalent, so just use HwRetVT.
24047 SDValue OutVT = DAG.getValueType(RetVT);
24048 if (RetVT.isFloatingPoint())
24049 OutVT = DAG.getValueType(HwRetVt);
24050
24051 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
24052 SDValue Ops[] = {N->getOperand(0), // Chain
24053 N->getOperand(2), // Pg
24054 Base, Offset, OutVT};
24055
24056 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
24057 SDValue LoadChain = SDValue(Load.getNode(), 1);
24058
24059 if (RetVT.isInteger() && (RetVT != HwRetVt))
24060 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
24061
24062 // If the original return value was FP, bitcast accordingly. Doing it here
24063 // means that we can avoid adding TableGen patterns for FPs.
24064 if (RetVT.isFloatingPoint())
24065 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
24066
24067 return DAG.getMergeValues({Load, LoadChain}, DL);
24068}
24069
24070static SDValue
24072 SelectionDAG &DAG) {
24073 SDLoc DL(N);
24074 SDValue Src = N->getOperand(0);
24075 unsigned Opc = Src->getOpcode();
24076
24077 // Sign extend of an unsigned unpack -> signed unpack
24078 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
24079
24080 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
24082
24083 // Push the sign extend to the operand of the unpack
24084 // This is necessary where, for example, the operand of the unpack
24085 // is another unpack:
24086 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
24087 // ->
24088 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8))
24089 // ->
24090 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
24091 SDValue ExtOp = Src->getOperand(0);
24092 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
24093 EVT EltTy = VT.getVectorElementType();
24094 (void)EltTy;
24095
24096 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
24097 "Sign extending from an invalid type");
24098
24099 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
24100
24102 ExtOp, DAG.getValueType(ExtVT));
24103
24104 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
24105 }
24106
24107 if (DCI.isBeforeLegalizeOps())
24108 return SDValue();
24109
24111 return SDValue();
24112
24113 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
24114 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
24115 unsigned NewOpc;
24116 unsigned MemVTOpNum = 4;
24117 switch (Opc) {
24120 MemVTOpNum = 3;
24121 break;
24124 MemVTOpNum = 3;
24125 break;
24128 MemVTOpNum = 3;
24129 break;
24132 break;
24135 break;
24138 break;
24141 break;
24144 break;
24147 break;
24150 break;
24153 break;
24156 break;
24159 break;
24162 break;
24165 break;
24168 break;
24171 break;
24174 break;
24175 default:
24176 return SDValue();
24177 }
24178
24179 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
24180 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
24181
24182 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
24183 return SDValue();
24184
24185 EVT DstVT = N->getValueType(0);
24186 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
24187
24189 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
24190 Ops.push_back(Src->getOperand(I));
24191
24192 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
24193 DCI.CombineTo(N, ExtLoad);
24194 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
24195
24196 // Return N so it doesn't get rechecked
24197 return SDValue(N, 0);
24198}
24199
24200/// Legalize the gather prefetch (scalar + vector addressing mode) when the
24201/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
24202/// != nxv2i32) do not need legalization.
24204 const unsigned OffsetPos = 4;
24205 SDValue Offset = N->getOperand(OffsetPos);
24206
24207 // Not an unpacked vector, bail out.
24208 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
24209 return SDValue();
24210
24211 // Extend the unpacked offset vector to 64-bit lanes.
24212 SDLoc DL(N);
24213 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
24214 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24215 // Replace the offset operand with the 64-bit one.
24216 Ops[OffsetPos] = Offset;
24217
24218 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24219}
24220
24221/// Combines a node carrying the intrinsic
24222/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
24223/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
24224/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
24225/// sve gather prefetch instruction with vector plus immediate addressing mode.
24227 unsigned ScalarSizeInBytes) {
24228 const unsigned ImmPos = 4, OffsetPos = 3;
24229 // No need to combine the node if the immediate is valid...
24230 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
24231 return SDValue();
24232
24233 // ...otherwise swap the offset base with the offset...
24234 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24235 std::swap(Ops[ImmPos], Ops[OffsetPos]);
24236 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
24237 // `aarch64_sve_prfb_gather_uxtw_index`.
24238 SDLoc DL(N);
24239 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
24240 MVT::i64);
24241
24242 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24243}
24244
24245// Return true if the vector operation can guarantee only the first lane of its
24246// result contains data, with all bits in other lanes set to zero.
24248 switch (Op.getOpcode()) {
24249 default:
24250 return false;
24266 return true;
24267 }
24268}
24269
24271 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
24272 SDValue InsertVec = N->getOperand(0);
24273 SDValue InsertElt = N->getOperand(1);
24274 SDValue InsertIdx = N->getOperand(2);
24275
24276 // We only care about inserts into the first element...
24277 if (!isNullConstant(InsertIdx))
24278 return SDValue();
24279 // ...of a zero'd vector...
24281 return SDValue();
24282 // ...where the inserted data was previously extracted...
24283 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24284 return SDValue();
24285
24286 SDValue ExtractVec = InsertElt.getOperand(0);
24287 SDValue ExtractIdx = InsertElt.getOperand(1);
24288
24289 // ...from the first element of a vector.
24290 if (!isNullConstant(ExtractIdx))
24291 return SDValue();
24292
24293 // If we get here we are effectively trying to zero lanes 1-N of a vector.
24294
24295 // Ensure there's no type conversion going on.
24296 if (N->getValueType(0) != ExtractVec.getValueType())
24297 return SDValue();
24298
24299 if (!isLanes1toNKnownZero(ExtractVec))
24300 return SDValue();
24301
24302 // The explicit zeroing is redundant.
24303 return ExtractVec;
24304}
24305
24306static SDValue
24309 return Res;
24310
24311 return performPostLD1Combine(N, DCI, true);
24312}
24313
24316 const AArch64Subtarget *Subtarget) {
24317 SDValue N0 = N->getOperand(0);
24318 EVT VT = N->getValueType(0);
24319
24320 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
24321 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
24322 return SDValue();
24323
24324 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
24325 EVT EltVT = VT.getVectorElementType();
24326 return EltVT == MVT::f32 || EltVT == MVT::f64;
24327 };
24328
24329 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
24330 // We purposefully don't care about legality of the nodes here as we know
24331 // they can be split down into something legal.
24332 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
24333 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
24334 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
24335 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
24336 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
24337 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
24338 LN0->getChain(), LN0->getBasePtr(),
24339 N0.getValueType(), LN0->getMemOperand());
24340 DCI.CombineTo(N, ExtLoad);
24341 DCI.CombineTo(
24342 N0.getNode(),
24343 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
24344 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
24345 ExtLoad.getValue(1));
24346 return SDValue(N, 0); // Return N so it doesn't get rechecked!
24347 }
24348
24349 return SDValue();
24350}
24351
24353 const AArch64Subtarget *Subtarget) {
24354 EVT VT = N->getValueType(0);
24355
24356 // Don't expand for NEON, SVE2 or SME
24357 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
24358 return SDValue();
24359
24360 SDLoc DL(N);
24361
24362 SDValue Mask = N->getOperand(0);
24363 SDValue In1 = N->getOperand(1);
24364 SDValue In2 = N->getOperand(2);
24365
24366 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
24367 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
24368 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
24369 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
24370}
24371
24373 EVT VT = N->getValueType(0);
24374
24375 SDValue Insert = N->getOperand(0);
24376 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
24377 return SDValue();
24378
24379 if (!Insert.getOperand(0).isUndef())
24380 return SDValue();
24381
24382 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
24383 uint64_t IdxDupLane = N->getConstantOperandVal(1);
24384 if (IdxInsert != 0 || IdxDupLane != 0)
24385 return SDValue();
24386
24387 SDValue Bitcast = Insert.getOperand(1);
24388 if (Bitcast.getOpcode() != ISD::BITCAST)
24389 return SDValue();
24390
24391 SDValue Subvec = Bitcast.getOperand(0);
24392 EVT SubvecVT = Subvec.getValueType();
24393 if (!SubvecVT.is128BitVector())
24394 return SDValue();
24395 EVT NewSubvecVT =
24397
24398 SDLoc DL(N);
24399 SDValue NewInsert =
24400 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
24401 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
24402 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
24403 NewInsert, N->getOperand(1));
24404 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
24405}
24406
24407// Try to combine mull with uzp1.
24410 SelectionDAG &DAG) {
24411 if (DCI.isBeforeLegalizeOps())
24412 return SDValue();
24413
24414 SDValue LHS = N->getOperand(0);
24415 SDValue RHS = N->getOperand(1);
24416
24417 SDValue ExtractHigh;
24418 SDValue ExtractLow;
24419 SDValue TruncHigh;
24420 SDValue TruncLow;
24421 SDLoc DL(N);
24422
24423 // Check the operands are trunc and extract_high.
24425 RHS.getOpcode() == ISD::TRUNCATE) {
24426 TruncHigh = RHS;
24427 if (LHS.getOpcode() == ISD::BITCAST)
24428 ExtractHigh = LHS.getOperand(0);
24429 else
24430 ExtractHigh = LHS;
24432 LHS.getOpcode() == ISD::TRUNCATE) {
24433 TruncHigh = LHS;
24434 if (LHS.getOpcode() == ISD::BITCAST)
24435 ExtractHigh = RHS.getOperand(0);
24436 else
24437 ExtractHigh = RHS;
24438 } else
24439 return SDValue();
24440
24441 // If the truncate's operand is a DUP or a splat BUILD_VECTOR, do not combine
24442 // the op with uzp1.
24443 // You can see the regressions in test/CodeGen/AArch64/aarch64-smull.ll.
24444 SDValue TruncHighOp = TruncHigh.getOperand(0);
24445 EVT TruncHighOpVT = TruncHighOp.getValueType();
24446 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
24447 DAG.isSplatValue(TruncHighOp, false))
24448 return SDValue();
24449
24450 // Check that there is another extract_high with the same source vector.
24451 // For example,
24452 //
24453 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
24454 // t12: v4i16 = truncate t11
24455 // t31: v4i32 = AArch64ISD::SMULL t18, t12
24456 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
24457 // t16: v4i16 = truncate t15
24458 // t30: v4i32 = AArch64ISD::SMULL t23, t16
24459 //
24460 // This DAG combine assumes the two extract_high nodes use the same source
24461 // vector in order to detect the pair of mulls. If they have different
24462 // source vectors, this code will not work.
24463 bool HasFoundMULLow = true;
24464 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
24465 if (ExtractHighSrcVec->use_size() != 2)
24466 HasFoundMULLow = false;
24467
24468 // Find ExtractLow.
24469 for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) {
24470 if (User == ExtractHigh.getNode())
24471 continue;
24472
24473 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
24474 !isNullConstant(User->getOperand(1))) {
24475 HasFoundMULLow = false;
24476 break;
24477 }
24478
24479 ExtractLow.setNode(User);
24480 }
24481
24482 if (!ExtractLow || !ExtractLow->hasOneUse())
24483 HasFoundMULLow = false;
24484
24485 // Check ExtractLow's user.
24486 if (HasFoundMULLow) {
24487 SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin();
24488 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
24489 HasFoundMULLow = false;
24490 } else {
24491 if (ExtractLowUser->getOperand(0) == ExtractLow) {
24492 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
24493 TruncLow = ExtractLowUser->getOperand(1);
24494 else
24495 HasFoundMULLow = false;
24496 } else {
24497 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
24498 TruncLow = ExtractLowUser->getOperand(0);
24499 else
24500 HasFoundMULLow = false;
24501 }
24502 }
24503 }
24504
24505 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
24506 // with uzp1.
24507 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
24508 EVT TruncHighVT = TruncHigh.getValueType();
24509 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
24510 SDValue TruncLowOp =
24511 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
24512 EVT TruncLowOpVT = TruncLowOp.getValueType();
24513 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
24514 DAG.isSplatValue(TruncLowOp, false)))
24515 return SDValue();
24516
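 // Illustrative sketch (editorial, not from the upstream source): both
 // truncates are folded into a single UZP1 of the (bitcast) truncate operands;
 // the low half is then read back via extract_subvector 0 and the high half
 // via extract_subvector <NumElts>, so both mulls share one UZP1 result.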
24517 // Create uzp1, extract_high and extract_low.
24518 if (TruncHighOpVT != UZP1VT)
24519 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
24520 if (TruncLowOpVT != UZP1VT)
24521 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
24522
24523 SDValue UZP1 =
24524 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
24525 SDValue HighIdxCst =
24526 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
24527 SDValue NewTruncHigh =
24528 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
24529 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
24530
24531 if (HasFoundMULLow) {
24532 EVT TruncLowVT = TruncLow.getValueType();
24533 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
24534 UZP1, ExtractLow.getOperand(1));
24535 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
24536 }
24537
24538 return SDValue(N, 0);
24539}
24540
24541static SDValue performMULLCombine(SDNode *N,
24542 TargetLowering::DAGCombinerInfo &DCI,
24543 SelectionDAG &DAG) {
24544 if (SDValue Val =
24545 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
24546 return Val;
24547
24548 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
24549 return Val;
24550
24551 return SDValue();
24552}
24553
24554static SDValue
24555performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24556 SelectionDAG &DAG) {
24557 // Let's do the transform below.
24558 //
24559 // t34: v4i32 = AArch64ISD::UADDLV t2
24560 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
24561 // t7: i64 = zero_extend t35
24562 // t20: v1i64 = scalar_to_vector t7
24563 // ==>
24564 // t34: v4i32 = AArch64ISD::UADDLV t2
24565 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
24566 // t40: v1i64 = AArch64ISD::NVCAST t39
24567 if (DCI.isBeforeLegalizeOps())
24568 return SDValue();
24569
24570 EVT VT = N->getValueType(0);
24571 if (VT != MVT::v1i64)
24572 return SDValue();
24573
24574 SDValue ZEXT = N->getOperand(0);
24575 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
24576 return SDValue();
24577
24578 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
24579 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
24580 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
24581 return SDValue();
24582
24583 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
24584 return SDValue();
24585
24586 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
24587 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
24588 UADDLV.getValueType() != MVT::v4i32 ||
24589 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
24590 return SDValue();
24591
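 // Rationale (editorial note, inferred from the pattern above): UADDLV of a
 // v8i8 input leaves the upper lanes of the v4i32 result zero, so
 // reinterpreting its low 64 bits as v1i64 via NVCAST of the low v2i32
 // subvector is equivalent to the zero_extend + scalar_to_vector sequence.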
24592 // Let's generate a new sequence with AArch64ISD::NVCAST.
24593 SDLoc DL(N);
24594 SDValue EXTRACT_SUBVEC =
24595 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
24596 DAG.getConstant(0, DL, MVT::i64));
24597 SDValue NVCAST =
24598 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
24599
24600 return NVCAST;
24601}
24602
24603SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
24604 DAGCombinerInfo &DCI) const {
24605 SelectionDAG &DAG = DCI.DAG;
24606 switch (N->getOpcode()) {
24607 default:
24608 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
24609 break;
24610 case ISD::VECREDUCE_AND:
24611 case ISD::VECREDUCE_OR:
24612 case ISD::VECREDUCE_XOR:
24613 return performVecReduceBitwiseCombine(N, DCI, DAG);
24614 case ISD::ADD:
24615 case ISD::SUB:
24616 return performAddSubCombine(N, DCI);
24617 case ISD::BUILD_VECTOR:
24618 return performBuildVectorCombine(N, DCI, DAG);
24619 case ISD::TRUNCATE:
24620 return performTruncateCombine(N, DAG);
24621 case AArch64ISD::ANDS:
24622 return performFlagSettingCombine(N, DCI, ISD::AND);
24623 case AArch64ISD::ADC:
24624 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
24625 return R;
24626 return foldADCToCINC(N, DAG);
24627 case AArch64ISD::SBC:
24628 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
24629 case AArch64ISD::ADCS:
24630 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
24631 return R;
24632 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
24633 case AArch64ISD::SBCS:
24634 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
24635 return R;
24636 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
24637 case AArch64ISD::BICi: {
24638 APInt DemandedBits =
24639 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
24640 APInt DemandedElts =
24641 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
24642
24643 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(
24644 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
24645 return SDValue();
24646
24647 break;
24648 }
24649 case ISD::XOR:
24650 return performXorCombine(N, DAG, DCI, Subtarget);
24651 case ISD::MUL:
24652 return performMulCombine(N, DAG, DCI, Subtarget);
24653 case ISD::SINT_TO_FP:
24654 case ISD::UINT_TO_FP:
24655 return performIntToFpCombine(N, DAG, Subtarget);
24656 case ISD::FP_TO_SINT:
24657 case ISD::FP_TO_UINT:
24658 case ISD::FP_TO_SINT_SAT:
24659 case ISD::FP_TO_UINT_SAT:
24660 return performFpToIntCombine(N, DAG, DCI, Subtarget);
24661 case ISD::OR:
24662 return performORCombine(N, DCI, Subtarget, *this);
24663 case ISD::AND:
24664 return performANDCombine(N, DCI);
24665 case ISD::FADD:
24666 return performFADDCombine(N, DCI);
24667 case ISD::INTRINSIC_WO_CHAIN:
24668 return performIntrinsicCombine(N, DCI, Subtarget);
24669 case ISD::ANY_EXTEND:
24670 case ISD::ZERO_EXTEND:
24671 case ISD::SIGN_EXTEND:
24672 return performExtendCombine(N, DCI, DAG);
24673 case ISD::SIGN_EXTEND_INREG:
24674 return performSignExtendInRegCombine(N, DCI, DAG);
24675 case ISD::CONCAT_VECTORS:
24676 return performConcatVectorsCombine(N, DCI, DAG);
24677 case ISD::EXTRACT_SUBVECTOR:
24678 return performExtractSubvectorCombine(N, DCI, DAG);
24679 case ISD::INSERT_SUBVECTOR:
24680 return performInsertSubvectorCombine(N, DCI, DAG);
24681 case ISD::SELECT:
24682 return performSelectCombine(N, DCI);
24683 case ISD::VSELECT:
24684 return performVSelectCombine(N, DCI.DAG);
24685 case ISD::SETCC:
24686 return performSETCCCombine(N, DCI, DAG);
24687 case ISD::LOAD:
24688 return performLOADCombine(N, DCI, DAG, Subtarget);
24689 case ISD::STORE:
24690 return performSTORECombine(N, DCI, DAG, Subtarget);
24691 case ISD::MSTORE:
24692 return performMSTORECombine(N, DCI, DAG, Subtarget);
24693 case ISD::MGATHER:
24694 case ISD::MSCATTER:
24695 return performMaskedGatherScatterCombine(N, DCI, DAG);
24696 case ISD::FP_EXTEND:
24697 return performFPExtendCombine(N, DAG, DCI, Subtarget);
24698 case AArch64ISD::BRCOND:
24699 return performBRCONDCombine(N, DCI, DAG);
24700 case AArch64ISD::TBNZ:
24701 case AArch64ISD::TBZ:
24702 return performTBZCombine(N, DCI, DAG);
24703 case AArch64ISD::CSEL:
24704 return performCSELCombine(N, DCI, DAG);
24705 case AArch64ISD::DUP:
24706 case AArch64ISD::DUPLANE8:
24707 case AArch64ISD::DUPLANE16:
24708 case AArch64ISD::DUPLANE32:
24709 case AArch64ISD::DUPLANE64:
24710 return performDUPCombine(N, DCI);
24711 case AArch64ISD::DUPLANE128:
24712 return performDupLane128Combine(N, DAG);
24713 case AArch64ISD::NVCAST:
24714 return performNVCASTCombine(N, DAG);
24715 case AArch64ISD::SPLICE:
24716 return performSpliceCombine(N, DAG);
24717 case AArch64ISD::UUNPKLO:
24718 case AArch64ISD::UUNPKHI:
24719 return performUnpackCombine(N, DAG, Subtarget);
24720 case AArch64ISD::UZP1:
24721 case AArch64ISD::UZP2:
24722 return performUzpCombine(N, DAG, Subtarget);
24723 case AArch64ISD::SETCC_MERGE_ZERO:
24724 return performSetccMergeZeroCombine(N, DCI);
24741 return performGLD1Combine(N, DAG);
24742 case AArch64ISD::VASHR:
24743 case AArch64ISD::VLSHR:
24744 return performVectorShiftCombine(N, *this, DCI);
24745 case AArch64ISD::SUNPKLO:
24746 return performSunpkloCombine(N, DAG);
24747 case AArch64ISD::BSP:
24748 return performBSPExpandForSVE(N, DAG, Subtarget);
24749 case ISD::INSERT_VECTOR_ELT:
24750 return performInsertVectorEltCombine(N, DCI);
24751 case ISD::EXTRACT_VECTOR_ELT:
24752 return performExtractVectorEltCombine(N, DCI, Subtarget);
24753 case ISD::VECREDUCE_ADD:
24754 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
24755 case AArch64ISD::UADDV:
24756 return performUADDVCombine(N, DAG);
24757 case AArch64ISD::SMULL:
24758 case AArch64ISD::UMULL:
24759 case AArch64ISD::PMULL:
24760 return performMULLCombine(N, DCI, DAG);
24761 case ISD::INTRINSIC_VOID:
24762 case ISD::INTRINSIC_W_CHAIN:
24763 switch (N->getConstantOperandVal(1)) {
24764 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
24765 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
24766 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
24767 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
24768 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
24769 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
24770 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
24771 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
24772 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
24773 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
24774 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
24775 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
24776 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
24777 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
24778 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
24779 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
24780 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
24781 case Intrinsic::aarch64_neon_ld2:
24782 case Intrinsic::aarch64_neon_ld3:
24783 case Intrinsic::aarch64_neon_ld4:
24784 case Intrinsic::aarch64_neon_ld1x2:
24785 case Intrinsic::aarch64_neon_ld1x3:
24786 case Intrinsic::aarch64_neon_ld1x4:
24787 case Intrinsic::aarch64_neon_ld2lane:
24788 case Intrinsic::aarch64_neon_ld3lane:
24789 case Intrinsic::aarch64_neon_ld4lane:
24790 case Intrinsic::aarch64_neon_ld2r:
24791 case Intrinsic::aarch64_neon_ld3r:
24792 case Intrinsic::aarch64_neon_ld4r:
24793 case Intrinsic::aarch64_neon_st2:
24794 case Intrinsic::aarch64_neon_st3:
24795 case Intrinsic::aarch64_neon_st4:
24796 case Intrinsic::aarch64_neon_st1x2:
24797 case Intrinsic::aarch64_neon_st1x3:
24798 case Intrinsic::aarch64_neon_st1x4:
24799 case Intrinsic::aarch64_neon_st2lane:
24800 case Intrinsic::aarch64_neon_st3lane:
24801 case Intrinsic::aarch64_neon_st4lane:
24802 return performNEONPostLDSTCombine(N, DCI, DAG);
24803 case Intrinsic::aarch64_sve_ldnt1:
24804 return performLDNT1Combine(N, DAG);
24805 case Intrinsic::aarch64_sve_ld1rq:
24806 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
24807 case Intrinsic::aarch64_sve_ld1ro:
24808 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
24809 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
24811 case Intrinsic::aarch64_sve_ldnt1_gather:
24813 case Intrinsic::aarch64_sve_ldnt1_gather_index:
24814 return performGatherLoadCombine(N, DAG,
24816 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
24818 case Intrinsic::aarch64_sve_ld1:
24820 case Intrinsic::aarch64_sve_ldnf1:
24822 case Intrinsic::aarch64_sve_ldff1:
24824 case Intrinsic::aarch64_sve_st1:
24825 return performST1Combine(N, DAG);
24826 case Intrinsic::aarch64_sve_stnt1:
24827 return performSTNT1Combine(N, DAG);
24828 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
24830 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
24832 case Intrinsic::aarch64_sve_stnt1_scatter:
24834 case Intrinsic::aarch64_sve_stnt1_scatter_index:
24836 case Intrinsic::aarch64_sve_ld1_gather:
24838 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
24839 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
24841 case Intrinsic::aarch64_sve_ld1q_gather_index:
24842 return performGatherLoadCombine(N, DAG,
24844 case Intrinsic::aarch64_sve_ld1_gather_index:
24845 return performGatherLoadCombine(N, DAG,
24847 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
24849 /*OnlyPackedOffsets=*/false);
24850 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
24852 /*OnlyPackedOffsets=*/false);
24853 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
24854 return performGatherLoadCombine(N, DAG,
24856 /*OnlyPackedOffsets=*/false);
24857 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
24858 return performGatherLoadCombine(N, DAG,
24860 /*OnlyPackedOffsets=*/false);
24861 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
24863 case Intrinsic::aarch64_sve_ldff1_gather:
24865 case Intrinsic::aarch64_sve_ldff1_gather_index:
24866 return performGatherLoadCombine(N, DAG,
24868 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
24869 return performGatherLoadCombine(N, DAG,
24871 /*OnlyPackedOffsets=*/false);
24872 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
24873 return performGatherLoadCombine(N, DAG,
24875 /*OnlyPackedOffsets=*/false);
24876 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
24877 return performGatherLoadCombine(N, DAG,
24879 /*OnlyPackedOffsets=*/false);
24880 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
24881 return performGatherLoadCombine(N, DAG,
24883 /*OnlyPackedOffsets=*/false);
24884 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
24885 return performGatherLoadCombine(N, DAG,
24887 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
24888 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
24890 case Intrinsic::aarch64_sve_st1q_scatter_index:
24892 case Intrinsic::aarch64_sve_st1_scatter:
24894 case Intrinsic::aarch64_sve_st1_scatter_index:
24896 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
24898 /*OnlyPackedOffsets=*/false);
24899 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
24901 /*OnlyPackedOffsets=*/false);
24902 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
24903 return performScatterStoreCombine(N, DAG,
24905 /*OnlyPackedOffsets=*/false);
24906 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
24907 return performScatterStoreCombine(N, DAG,
24909 /*OnlyPackedOffsets=*/false);
24910 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
24912 case Intrinsic::aarch64_rndr:
24913 case Intrinsic::aarch64_rndrrs: {
24914 unsigned IntrinsicID = N->getConstantOperandVal(1);
24915 auto Register =
24916 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
24917 : AArch64SysReg::RNDRRS);
24918 SDLoc DL(N);
24919 SDValue A = DAG.getNode(
24920 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
24921 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
24922 SDValue B = DAG.getNode(
24923 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
24924 DAG.getConstant(0, DL, MVT::i32),
24925 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
24926 return DAG.getMergeValues(
24927 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
24928 }
24929 case Intrinsic::aarch64_sme_ldr_zt:
24930 return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
24931 DAG.getVTList(MVT::Other), N->getOperand(0),
24932 N->getOperand(2), N->getOperand(3));
24933 case Intrinsic::aarch64_sme_str_zt:
24934 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
24935 DAG.getVTList(MVT::Other), N->getOperand(0),
24936 N->getOperand(2), N->getOperand(3));
24937 default:
24938 break;
24939 }
24940 break;
24941 case ISD::GlobalAddress:
24942 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
24943 case ISD::CTLZ:
24944 return performCTLZCombine(N, DAG, Subtarget);
24945 case ISD::SCALAR_TO_VECTOR:
24946 return performScalarToVectorCombine(N, DCI, DAG);
24947 }
24948 return SDValue();
24949}
24950
24951// Check if the return value is used only as a return value, as otherwise
24952// we can't perform a tail-call. In particular, we need to check for
24953// target ISD nodes that are returns and any other "odd" constructs
24954// that the generic analysis code won't necessarily catch.
24955bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
24956 SDValue &Chain) const {
24957 if (N->getNumValues() != 1)
24958 return false;
24959 if (!N->hasNUsesOfValue(1, 0))
24960 return false;
24961
24962 SDValue TCChain = Chain;
24963 SDNode *Copy = *N->use_begin();
24964 if (Copy->getOpcode() == ISD::CopyToReg) {
24965 // If the copy has a glue operand, we conservatively assume it isn't safe to
24966 // perform a tail call.
24967 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
24968 MVT::Glue)
24969 return false;
24970 TCChain = Copy->getOperand(0);
24971 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
24972 return false;
24973
24974 bool HasRet = false;
24975 for (SDNode *Node : Copy->uses()) {
24976 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
24977 return false;
24978 HasRet = true;
24979 }
24980
24981 if (!HasRet)
24982 return false;
24983
24984 Chain = TCChain;
24985 return true;
24986}
24987
24988// Return whether an instruction can potentially be optimized to a tail
24989// call. This will cause the optimizers to attempt to move, or duplicate,
24990// return instructions to help enable tail call optimizations for this
24991// instruction.
24992bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
24993 return CI->isTailCall();
24994}
24995
24996bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
24997 Register Offset, bool IsPre,
24998 MachineRegisterInfo &MRI) const {
24999 auto CstOffset = getIConstantVRegVal(Offset, MRI);
25000 if (!CstOffset || CstOffset->isZero())
25001 return false;
25002
25003 // All of the indexed addressing mode instructions take a signed 9 bit
25004 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
25005 // encodes the sign/indexing direction.
25006 return isInt<9>(CstOffset->getSExtValue());
25007}
25008
25009bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
25010 SDValue &Base,
25011 SDValue &Offset,
25012 SelectionDAG &DAG) const {
25013 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
25014 return false;
25015
25016 // Non-null if there is exactly one user of the loaded value (ignoring chain).
25017 SDNode *ValOnlyUser = nullptr;
25018 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
25019 ++UI) {
25020 if (UI.getUse().getResNo() == 1)
25021 continue; // Ignore chain.
25022 if (ValOnlyUser == nullptr)
25023 ValOnlyUser = *UI;
25024 else {
25025 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
25026 break;
25027 }
25028 }
25029
25030 auto IsUndefOrZero = [](SDValue V) {
25031 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
25032 };
25033
25034 // If the only user of the value is a scalable vector splat, it is
25035 // preferable to do a replicating load (ld1r*).
25036 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
25037 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
25038 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
25039 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
25040 return false;
25041
25042 Base = Op->getOperand(0);
25043 // All of the indexed addressing mode instructions take a signed
25044 // 9 bit immediate offset.
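 // Illustrative forms (editorial example): pre-indexed "ldr x0, [x1, #16]!"
 // and post-indexed "ldr x0, [x1], #16"; the offset must fit the signed
 // 9-bit field in either case.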
25045 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
25046 int64_t RHSC = RHS->getSExtValue();
25047 if (Op->getOpcode() == ISD::SUB)
25048 RHSC = -(uint64_t)RHSC;
25049 if (!isInt<9>(RHSC))
25050 return false;
25051 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
25052 // when dealing with subtraction.
25053 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
25054 return true;
25055 }
25056 return false;
25057}
25058
25059bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
25060 SDValue &Offset,
25061 ISD::MemIndexedMode &AM,
25062 SelectionDAG &DAG) const {
25063 EVT VT;
25064 SDValue Ptr;
25065 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
25066 VT = LD->getMemoryVT();
25067 Ptr = LD->getBasePtr();
25068 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
25069 VT = ST->getMemoryVT();
25070 Ptr = ST->getBasePtr();
25071 } else
25072 return false;
25073
25074 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
25075 return false;
25076 AM = ISD::PRE_INC;
25077 return true;
25078}
25079
25080bool AArch64TargetLowering::getPostIndexedAddressParts(
25081 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
25082 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
25083 EVT VT;
25084 SDValue Ptr;
25085 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
25086 VT = LD->getMemoryVT();
25087 Ptr = LD->getBasePtr();
25088 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
25089 VT = ST->getMemoryVT();
25090 Ptr = ST->getBasePtr();
25091 } else
25092 return false;
25093
25094 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
25095 return false;
25096 // Post-indexing updates the base, so it's not a valid transform
25097 // if that's not the same as the load's pointer.
25098 if (Ptr != Base)
25099 return false;
25100 AM = ISD::POST_INC;
25101 return true;
25102}
25103
25104static void replaceBoolVectorBitcast(SDNode *N,
25105 SmallVectorImpl<SDValue> &Results,
25106 SelectionDAG &DAG) {
25107 SDLoc DL(N);
25108 SDValue Op = N->getOperand(0);
25109 EVT VT = N->getValueType(0);
25110 [[maybe_unused]] EVT SrcVT = Op.getValueType();
25111 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25112 "Must be bool vector.");
25113
25114 // Special handling for Clang's __builtin_convertvector. For vectors with <8
25115 // elements, it adds a vector concatenation with undef(s). If we encounter
25116 // this here, we can skip the concat.
25117 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
25118 bool AllUndef = true;
25119 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
25120 AllUndef &= Op.getOperand(I).isUndef();
25121
25122 if (AllUndef)
25123 Op = Op.getOperand(0);
25124 }
25125
25126 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
25127 if (VectorBits)
25128 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
25129}
25130
25131static void CustomNonLegalBITCASTResults(SDNode *N,
25132 SmallVectorImpl<SDValue> &Results,
25133 SelectionDAG &DAG, EVT ExtendVT,
25134 EVT CastVT) {
25135 SDLoc DL(N);
25136 SDValue Op = N->getOperand(0);
25137 EVT VT = N->getValueType(0);
25138
25139 // Use SCALAR_TO_VECTOR for lane zero
25140 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
25141 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
25142 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
25143 Results.push_back(
25144 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
25145}
25146
25147void AArch64TargetLowering::ReplaceBITCASTResults(
25148 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25149 SDLoc DL(N);
25150 SDValue Op = N->getOperand(0);
25151 EVT VT = N->getValueType(0);
25152 EVT SrcVT = Op.getValueType();
25153
25154 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
25155 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
25156 return;
25157 }
25158
25159 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
25160 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
25161 return;
25162 }
25163
25164 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
25165 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
25166 return;
25167 }
25168
25169 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
25170 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
25171 "Expected fp->int bitcast!");
25172
25173 // Bitcasting between unpacked vector types of different element counts is
25174 // not a NOP because the live elements are laid out differently.
25175 // 01234567
25176 // e.g. nxv2i32 = XX??XX??
25177 // nxv4f16 = X?X?X?X?
25178 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
25179 return;
25180
25181 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
25182 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
25183 return;
25184 }
25185
25186 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25187 !VT.isVector())
25188 return replaceBoolVectorBitcast(N, Results, DAG);
25189
25190 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
25191 return;
25192
25193 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
25194 DAG.getUNDEF(MVT::i32), Op);
25195 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
25196 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
25197}
25198
25199static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
25200 SelectionDAG &DAG,
25201 const AArch64Subtarget *Subtarget) {
25202 EVT VT = N->getValueType(0);
25203 if (!VT.is256BitVector() ||
25204 (VT.getScalarType().isFloatingPoint() &&
25205 !N->getFlags().hasAllowReassociation()) ||
25206 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
25207 VT.getScalarType() == MVT::bf16)
25208 return;
25209
25210 SDValue X = N->getOperand(0);
25211 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
25212 if (!Shuf) {
25213 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
25214 X = N->getOperand(1);
25215 if (!Shuf)
25216 return;
25217 }
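 // What is being matched here (illustrative sketch): add/fadd(X,
 // shuffle(X, undef, <1,0,3,2,...>)) sums adjacent lanes, so it can be
 // rewritten as ADDP/FADDP on the two halves of X followed by a shuffle that
 // duplicates each pairwise sum back into both lanes of its pair.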
25218
25219 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
25220 return;
25221
25222 // Check the mask is 1,0,3,2,5,4,...
25223 ArrayRef<int> Mask = Shuf->getMask();
25224 for (int I = 0, E = Mask.size(); I < E; I++)
25225 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
25226 return;
25227
25228 SDLoc DL(N);
25229 auto LoHi = DAG.SplitVector(X, DL);
25230 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
25231 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
25232 LoHi.first, LoHi.second);
25233
25234 // Shuffle the elements back into order.
25235 SmallVector<int> NMask;
25236 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
25237 NMask.push_back(I);
25238 NMask.push_back(I);
25239 }
25240 Results.push_back(
25241 DAG.getVectorShuffle(VT, DL,
25242 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
25243 DAG.getUNDEF(LoHi.first.getValueType())),
25244 DAG.getUNDEF(VT), NMask));
25245}
25246
25247static void ReplaceReductionResults(SDNode *N,
25248 SmallVectorImpl<SDValue> &Results,
25249 SelectionDAG &DAG, unsigned InterOp,
25250 unsigned AcrossOp) {
25251 EVT LoVT, HiVT;
25252 SDValue Lo, Hi;
25253 SDLoc dl(N);
25254 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
25255 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25256 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
25257 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
25258 Results.push_back(SplitVal);
25259}
25260
25261void AArch64TargetLowering::ReplaceExtractSubVectorResults(
25262 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25263 SDValue In = N->getOperand(0);
25264 EVT InVT = In.getValueType();
25265
25266 // Common code will handle these just fine.
25267 if (!InVT.isScalableVector() || !InVT.isInteger())
25268 return;
25269
25270 SDLoc DL(N);
25271 EVT VT = N->getValueType(0);
25272
25273 // The following checks bail if this is not a halving operation.
25274
25275 ElementCount ResEC = VT.getVectorElementCount();
25276
25277 if (InVT.getVectorElementCount() != (ResEC * 2))
25278 return;
25279
25280 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
25281 if (!CIndex)
25282 return;
25283
25284 unsigned Index = CIndex->getZExtValue();
25285 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
25286 return;
25287
25288 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
25289 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
25290
25291 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
25292 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
25293}
25294
25295// Create an even/odd pair of X registers holding integer value V.
25296static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
25297 SDLoc dl(V.getNode());
25298 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
25299 if (DAG.getDataLayout().isBigEndian())
25300 std::swap (VLo, VHi);
25301 SDValue RegClass =
25302 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
25303 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
25304 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
25305 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
25306 return SDValue(
25307 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
25308}
25309
25310static void ReplaceCMP_SWAP_128Results(SDNode *N,
25311 SmallVectorImpl<SDValue> &Results,
25312 SelectionDAG &DAG,
25313 const AArch64Subtarget *Subtarget) {
25314 assert(N->getValueType(0) == MVT::i128 &&
25315 "AtomicCmpSwap on types less than 128 should be legal");
25316
25317 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25318 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
25319 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
25320 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
25321 SDValue Ops[] = {
25322 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
25323 createGPRPairNode(DAG, N->getOperand(3)), // Store value
25324 N->getOperand(1), // Ptr
25325 N->getOperand(0), // Chain in
25326 };
25327
25328 unsigned Opcode;
25329 switch (MemOp->getMergedOrdering()) {
25330 case AtomicOrdering::Monotonic:
25331 Opcode = AArch64::CASPX;
25332 break;
25333 case AtomicOrdering::Acquire:
25334 Opcode = AArch64::CASPAX;
25335 break;
25336 case AtomicOrdering::Release:
25337 Opcode = AArch64::CASPLX;
25338 break;
25339 case AtomicOrdering::AcquireRelease:
25340 case AtomicOrdering::SequentiallyConsistent:
25341 Opcode = AArch64::CASPALX;
25342 break;
25343 default:
25344 llvm_unreachable("Unexpected ordering!");
25345 }
25346
25347 MachineSDNode *CmpSwap = DAG.getMachineNode(
25348 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
25349 DAG.setNodeMemRefs(CmpSwap, {MemOp});
25350
25351 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
25352 if (DAG.getDataLayout().isBigEndian())
25353 std::swap(SubReg1, SubReg2);
25354 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
25355 SDValue(CmpSwap, 0));
25356 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
25357 SDValue(CmpSwap, 0));
25358 Results.push_back(
25359 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25360 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
25361 return;
25362 }
25363
25364 unsigned Opcode;
25365 switch (MemOp->getMergedOrdering()) {
25366 case AtomicOrdering::Monotonic:
25367 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
25368 break;
25369 case AtomicOrdering::Acquire:
25370 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
25371 break;
25372 case AtomicOrdering::Release:
25373 Opcode = AArch64::CMP_SWAP_128_RELEASE;
25374 break;
25375 case AtomicOrdering::AcquireRelease:
25376 case AtomicOrdering::SequentiallyConsistent:
25377 Opcode = AArch64::CMP_SWAP_128;
25378 break;
25379 default:
25380 llvm_unreachable("Unexpected ordering!");
25381 }
25382
25383 SDLoc DL(N);
25384 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
25385 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
25386 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
25387 New.first, New.second, N->getOperand(0)};
25388 SDNode *CmpSwap = DAG.getMachineNode(
25389 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
25390 Ops);
25391 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
25392
25393 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25394 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
25395 Results.push_back(SDValue(CmpSwap, 3));
25396}
25397
25398static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
25399 AtomicOrdering Ordering) {
25400 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
25401 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
25402 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
25403 // ATOMIC_LOAD_CLR at any point.
25404 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
25405 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
25406 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
25407 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
25408
25409 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25410 // The operand will need to be XORed in a separate step.
25411 switch (Ordering) {
25412 case AtomicOrdering::Monotonic:
25413 return AArch64::LDCLRP;
25414 break;
25415 case AtomicOrdering::Acquire:
25416 return AArch64::LDCLRPA;
25417 break;
25418 case AtomicOrdering::Release:
25419 return AArch64::LDCLRPL;
25420 break;
25421 case AtomicOrdering::AcquireRelease:
25422 case AtomicOrdering::SequentiallyConsistent:
25423 return AArch64::LDCLRPAL;
25424 break;
25425 default:
25426 llvm_unreachable("Unexpected ordering!");
25427 }
25428 }
25429
25430 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
25431 switch (Ordering) {
25432 case AtomicOrdering::Monotonic:
25433 return AArch64::LDSETP;
25434 break;
25435 case AtomicOrdering::Acquire:
25436 return AArch64::LDSETPA;
25437 break;
25438 case AtomicOrdering::Release:
25439 return AArch64::LDSETPL;
25440 break;
25441 case AtomicOrdering::AcquireRelease:
25442 case AtomicOrdering::SequentiallyConsistent:
25443 return AArch64::LDSETPAL;
25444 break;
25445 default:
25446 llvm_unreachable("Unexpected ordering!");
25447 }
25448 }
25449
25450 if (ISDOpcode == ISD::ATOMIC_SWAP) {
25451 switch (Ordering) {
25452 case AtomicOrdering::Monotonic:
25453 return AArch64::SWPP;
25454 break;
25455 case AtomicOrdering::Acquire:
25456 return AArch64::SWPPA;
25457 break;
25458 case AtomicOrdering::Release:
25459 return AArch64::SWPPL;
25460 break;
25461 case AtomicOrdering::AcquireRelease:
25462 case AtomicOrdering::SequentiallyConsistent:
25463 return AArch64::SWPPAL;
25464 break;
25465 default:
25466 llvm_unreachable("Unexpected ordering!");
25467 }
25468 }
25469
25470 llvm_unreachable("Unexpected ISDOpcode!");
25471}
25472
25473static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
25474 SmallVectorImpl<SDValue> &Results,
25475 SelectionDAG &DAG,
25476 const AArch64Subtarget *Subtarget) {
25477 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
25478 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
25479 // rather than the CASP instructions, because CASP has register classes for
25480 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
25481 // to present them as single operands. LSE128 instructions use the GPR64
25482 // register class (because the pair does not have to be sequential), like
25483 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
25484
25485 assert(N->getValueType(0) == MVT::i128 &&
25486 "AtomicLoadXXX on types less than 128 should be legal");
25487
25488 if (!Subtarget->hasLSE128())
25489 return;
25490
25491 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25492 const SDValue &Chain = N->getOperand(0);
25493 const SDValue &Ptr = N->getOperand(1);
25494 const SDValue &Val128 = N->getOperand(2);
25495 std::pair<SDValue, SDValue> Val2x64 =
25496 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
25497
25498 const unsigned ISDOpcode = N->getOpcode();
25499 const unsigned MachineOpcode =
25500 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
25501
25502 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25503 SDLoc dl(Val128);
25504 Val2x64.first =
25505 DAG.getNode(ISD::XOR, dl, MVT::i64,
25506 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
25507 Val2x64.second =
25508 DAG.getNode(ISD::XOR, dl, MVT::i64,
25509 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
25510 }
25511
25512 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
25513 if (DAG.getDataLayout().isBigEndian())
25514 std::swap(Ops[0], Ops[1]);
25515
25516 MachineSDNode *AtomicInst =
25517 DAG.getMachineNode(MachineOpcode, SDLoc(N),
25518 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
25519
25520 DAG.setNodeMemRefs(AtomicInst, {MemOp});
25521
25522 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
25523 if (DAG.getDataLayout().isBigEndian())
25524 std::swap(Lo, Hi);
25525
25526 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25527 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
25528}
25529
25530void AArch64TargetLowering::ReplaceNodeResults(
25531 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25532 switch (N->getOpcode()) {
25533 default:
25534 llvm_unreachable("Don't know how to custom expand this");
25535 case ISD::BITCAST:
25536 ReplaceBITCASTResults(N, Results, DAG);
25537 return;
25538 case ISD::VECREDUCE_ADD:
25539 case ISD::VECREDUCE_SMAX:
25540 case ISD::VECREDUCE_SMIN:
25541 case ISD::VECREDUCE_UMAX:
25542 case ISD::VECREDUCE_UMIN:
25543 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
25544 return;
25545 case ISD::ADD:
25546 case ISD::FADD:
25547 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
25548 return;
25549
25550 case ISD::CTPOP:
25551 case ISD::PARITY:
25552 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
25553 Results.push_back(Result);
25554 return;
25555 case AArch64ISD::SADDV:
25556 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
25557 return;
25558 case AArch64ISD::UADDV:
25559 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
25560 return;
25561 case AArch64ISD::SMINV:
25562 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
25563 return;
25564 case AArch64ISD::UMINV:
25565 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
25566 return;
25567 case AArch64ISD::SMAXV:
25568 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
25569 return;
25570 case AArch64ISD::UMAXV:
25571 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
25572 return;
25573 case ISD::MULHS:
25575 Results.push_back(
25576 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
25577 return;
25578 case ISD::MULHU:
25580 Results.push_back(
25581 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
25582 return;
25583 case ISD::FP_TO_UINT:
25584 case ISD::FP_TO_SINT:
25587 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
25588 // Let normal code take care of it by not adding anything to Results.
25589 return;
25590 case ISD::ATOMIC_CMP_SWAP:
25591 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
25592 return;
25593 case ISD::ATOMIC_LOAD_CLR:
25594 assert(N->getValueType(0) != MVT::i128 &&
25595 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
25596 break;
25597 case ISD::ATOMIC_LOAD_AND:
25598 case ISD::ATOMIC_LOAD_OR:
25599 case ISD::ATOMIC_SWAP: {
25600 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
25601 "Expected 128-bit atomicrmw.");
25602 // These need custom type legalisation so we go directly to instruction.
25603 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
25604 return;
25605 }
25606 case ISD::ATOMIC_LOAD:
25607 case ISD::LOAD: {
25608 MemSDNode *LoadNode = cast<MemSDNode>(N);
25609 EVT MemVT = LoadNode->getMemoryVT();
25610 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
25611 // targets.
25612 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
25613 MemVT.getSizeInBits() == 256u &&
25614 (MemVT.getScalarSizeInBits() == 8u ||
25615 MemVT.getScalarSizeInBits() == 16u ||
25616 MemVT.getScalarSizeInBits() == 32u ||
25617 MemVT.getScalarSizeInBits() == 64u)) {
25618
25619 SDValue Result = DAG.getMemIntrinsicNode(
25620 AArch64ISD::LDNP, SDLoc(N),
25621 DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25622 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25623 MVT::Other}),
25624 {LoadNode->getChain(), LoadNode->getBasePtr()},
25625 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25626
25627 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
25628 Result.getValue(0), Result.getValue(1));
25629 Results.append({Pair, Result.getValue(2) /* Chain */});
25630 return;
25631 }
25632
25633 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
25634 LoadNode->getMemoryVT() != MVT::i128) {
25635 // Loads that are neither volatile nor atomic, or that are not i128, are
25636 // optimized later in AArch64's load/store optimizer.
25637 return;
25638 }
25639
25640 if (SDValue(N, 0).getValueType() == MVT::i128) {
25641 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
25642 bool isLoadAcquire =
25644 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
25645
25646 if (isLoadAcquire)
25647 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
25648
25649 SDValue Result = DAG.getMemIntrinsicNode(
25650 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25651 {LoadNode->getChain(), LoadNode->getBasePtr()},
25652 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25653
25654 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
25655
25656 SDValue Pair =
25657 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
25658 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
25659 Results.append({Pair, Result.getValue(2) /* Chain */});
25660 }
25661 return;
25662 }
25663 case ISD::EXTRACT_SUBVECTOR:
25664 ReplaceExtractSubVectorResults(N, Results, DAG);
25665 return;
25666 case ISD::INSERT_SUBVECTOR:
25667 case ISD::CONCAT_VECTORS:
25668 // Custom lowering has been requested for INSERT_SUBVECTOR and
25669 // CONCAT_VECTORS -- but delegate to common code for result type
25670 // legalisation
25671 return;
25672 case ISD::INTRINSIC_WO_CHAIN: {
25673 EVT VT = N->getValueType(0);
25674
25675 Intrinsic::ID IntID =
25676 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
25677 switch (IntID) {
25678 default:
25679 return;
25680 case Intrinsic::aarch64_sve_clasta_n: {
25681 assert((VT == MVT::i8 || VT == MVT::i16) &&
25682 "custom lowering for unexpected type");
25683 SDLoc DL(N);
25684 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25685 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
25686 N->getOperand(1), Op2, N->getOperand(3));
25687 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25688 return;
25689 }
25690 case Intrinsic::aarch64_sve_clastb_n: {
25691 assert((VT == MVT::i8 || VT == MVT::i16) &&
25692 "custom lowering for unexpected type");
25693 SDLoc DL(N);
25694 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25695 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
25696 N->getOperand(1), Op2, N->getOperand(3));
25697 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25698 return;
25699 }
25700 case Intrinsic::aarch64_sve_lasta: {
25701 assert((VT == MVT::i8 || VT == MVT::i16) &&
25702 "custom lowering for unexpected type");
25703 SDLoc DL(N);
25704 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
25705 N->getOperand(1), N->getOperand(2));
25706 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25707 return;
25708 }
25709 case Intrinsic::aarch64_sve_lastb: {
25710 assert((VT == MVT::i8 || VT == MVT::i16) &&
25711 "custom lowering for unexpected type");
25712 SDLoc DL(N);
25713 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
25714 N->getOperand(1), N->getOperand(2));
25715 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25716 return;
25717 }
25718 case Intrinsic::get_active_lane_mask: {
25719 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
25720 return;
25721
25722 // NOTE: Only trivial type promotion is supported.
25723 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
25724 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
25725 return;
25726
25727 SDLoc DL(N);
25728 auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
25729 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25730 return;
25731 }
25732 }
25733 }
25734 case ISD::READ_REGISTER: {
25735 SDLoc DL(N);
25736 assert(N->getValueType(0) == MVT::i128 &&
25737 "READ_REGISTER custom lowering is only for 128-bit sysregs");
25738 SDValue Chain = N->getOperand(0);
25739 SDValue SysRegName = N->getOperand(1);
25740
25741 SDValue Result = DAG.getNode(
25742 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25743 Chain, SysRegName);
25744
25745 // Sysregs are not endian. Result.getValue(0) always contains the lower half
25746 // of the 128-bit System Register value.
25747 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25748 Result.getValue(0), Result.getValue(1));
25749 Results.push_back(Pair);
25750 Results.push_back(Result.getValue(2)); // Chain
25751 return;
25752 }
25753 }
25754}
25755
25756bool AArch64TargetLowering::useLoadStackGuardNode() const {
25757 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
25758 return false;
25759 return true;
25760}
25761
25762unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
25763 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
25764 // reciprocal if there are three or more FDIVs.
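 // e.g. (illustrative) a/d, b/d, c/d  =>  r = 1.0/d; a*r, b*r, c*r
 // (one fdiv plus three fmuls instead of three fdivs).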
25765 return 3;
25766}
25767
25768TargetLoweringBase::LegalizeTypeAction
25769AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
25770 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
25771 // v4i16, v2i32 instead of to promote.
25772 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
25773 VT == MVT::v1f32)
25774 return TypeWidenVector;
25775
25776 return TargetLoweringBase::getPreferredVectorAction(VT);
25777
25778
25779// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
25780// provided the address is 16-byte aligned.
25781bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
25782 if (!Subtarget->hasLSE2())
25783 return false;
25784
25785 if (auto LI = dyn_cast<LoadInst>(I))
25786 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25787 LI->getAlign() >= Align(16);
25788
25789 if (auto SI = dyn_cast<StoreInst>(I))
25790 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25791 SI->getAlign() >= Align(16);
25792
25793 return false;
25794}
25795
25796bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
25797 if (!Subtarget->hasLSE128())
25798 return false;
25799
25800 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
25801 // will clobber the two registers.
25802 if (const auto *SI = dyn_cast<StoreInst>(I))
25803 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25804 SI->getAlign() >= Align(16) &&
25805 (SI->getOrdering() == AtomicOrdering::Release ||
25806 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
25807
25808 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
25809 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25810 RMW->getAlign() >= Align(16) &&
25811 (RMW->getOperation() == AtomicRMWInst::Xchg ||
25812 RMW->getOperation() == AtomicRMWInst::And ||
25813 RMW->getOperation() == AtomicRMWInst::Or);
25814
25815 return false;
25816}
25817
25818bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
25819 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
25820 return false;
25821
25822 if (auto LI = dyn_cast<LoadInst>(I))
25823 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25824 LI->getAlign() >= Align(16) &&
25825 LI->getOrdering() == AtomicOrdering::Acquire;
25826
25827 if (auto SI = dyn_cast<StoreInst>(I))
25828 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25829 SI->getAlign() >= Align(16) &&
25830 SI->getOrdering() == AtomicOrdering::Release;
25831
25832 return false;
25833}
25834
25835bool AArch64TargetLowering::shouldInsertFencesForAtomic(
25836 const Instruction *I) const {
25837 if (isOpSuitableForRCPC3(I))
25838 return false;
25839 if (isOpSuitableForLSE128(I))
25840 return false;
25841 if (isOpSuitableForLDPSTP(I))
25842 return true;
25843 return false;
25844}
25845
25846bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
25847 const Instruction *I) const {
25848 // Store-Release instructions only provide seq_cst guarantees when paired with
25849 // Load-Acquire instructions. MSVC CRT does not use these instructions to
25850 // implement seq_cst loads and stores, so we need additional explicit fences
25851 // after memory writes.
25852 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
25853 return false;
25854
25855 switch (I->getOpcode()) {
25856 default:
25857 return false;
25858 case Instruction::AtomicCmpXchg:
25859 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
25860 AtomicOrdering::SequentiallyConsistent;
25861 case Instruction::AtomicRMW:
25862 return cast<AtomicRMWInst>(I)->getOrdering() ==
25863 AtomicOrdering::SequentiallyConsistent;
25864 case Instruction::Store:
25865 return cast<StoreInst>(I)->getOrdering() ==
25866 AtomicOrdering::SequentiallyConsistent;
25867 }
25868}
25869
25870// Loads and stores less than 128-bits are already atomic; ones above that
25871// are doomed anyway, so defer to the default libcall and blame the OS when
25872// things go wrong.
25873TargetLoweringBase::AtomicExpansionKind
25874AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
25875 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
25876 if (Size != 128)
25877 return AtomicExpansionKind::None;
25878 if (isOpSuitableForRCPC3(SI))
25879 return AtomicExpansionKind::None;
25880 if (isOpSuitableForLSE128(SI))
25881 return AtomicExpansionKind::Expand;
25882 if (isOpSuitableForLDPSTP(SI))
25883 return AtomicExpansionKind::None;
25884 return AtomicExpansionKind::Expand;
25885}
25886
25887// Loads and stores less than 128-bits are already atomic; ones above that
25888// are doomed anyway, so defer to the default libcall and blame the OS when
25889// things go wrong.
25890TargetLoweringBase::AtomicExpansionKind
25891AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
25892 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
25893
25894 if (Size != 128)
25895 return AtomicExpansionKind::None;
25896 if (isOpSuitableForRCPC3(LI))
25897 return AtomicExpansionKind::None;
25898 // No LSE128 loads
25899 if (isOpSuitableForLDPSTP(LI))
25900 return AtomicExpansionKind::None;
25901
25902 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25903 // implement atomicrmw without spilling. If the target address is also on the
25904 // stack and close enough to the spill slot, this can lead to a situation
25905 // where the monitor always gets cleared and the atomic operation can never
25906 // succeed. So at -O0 lower this operation to a CAS loop.
25907 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25908 return AtomicExpansionKind::CmpXChg;
25909
25910 // Using CAS for an atomic load has a better chance of succeeding under high
25911 // contention situations. So use it if available.
25912 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
25913 : AtomicExpansionKind::LLSC;
25914}
25915
25916// The "default" for integer RMW operations is to expand to an LL/SC loop.
25917// However, with the LSE instructions (or outline-atomics mode, which provides
25918// library routines in place of the LSE-instructions), we can directly emit many
25919// operations instead.
25920//
25921// Floating-point operations are always emitted to a cmpxchg loop, because they
25922// may trigger a trap which aborts an LLSC sequence.
25923TargetLowering::AtomicExpansionKind
25924AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
25925 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
25926 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
25927
25928 if (AI->isFloatingPointOperation())
25929 return AtomicExpansionKind::CmpXChg;
25930
25931 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
25932 (AI->getOperation() == AtomicRMWInst::Xchg ||
25933 AI->getOperation() == AtomicRMWInst::Or ||
25934 AI->getOperation() == AtomicRMWInst::And);
25935 if (CanUseLSE128)
25936 return AtomicExpansionKind::None;
25937
25938 // Nand is not supported in LSE.
25939 // Leave 128 bits to LLSC or CmpXChg.
25940 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
25941 if (Subtarget->hasLSE())
25942 return AtomicExpansionKind::None;
25943 if (Subtarget->outlineAtomics()) {
25944 // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
25945 // Don't outline them unless
25946 // (1) high level <atomic> support approved:
25947 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
25948 // (2) low level libgcc and compiler-rt support implemented by:
25949 // min/max outline atomics helpers
25950 if (AI->getOperation() != AtomicRMWInst::Min &&
25951 AI->getOperation() != AtomicRMWInst::Max &&
25952 AI->getOperation() != AtomicRMWInst::UMin &&
25953 AI->getOperation() != AtomicRMWInst::UMax) {
25954 return AtomicExpansionKind::None;
25955 }
25956 }
25957 }
25958
25959 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25960 // implement atomicrmw without spilling. If the target address is also on the
25961 // stack and close enough to the spill slot, this can lead to a situation
25962 // where the monitor always gets cleared and the atomic operation can never
25963 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
25964 // we have a single CAS instruction that can replace the loop.
25965 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
25966 Subtarget->hasLSE())
25967 return AtomicExpansionKind::CmpXChg;
25968
25969 return AtomicExpansionKind::LLSC;
25970}
25971
25972TargetLowering::AtomicExpansionKind
25973AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
25974 AtomicCmpXchgInst *AI) const {
25975 // If subtarget has LSE, leave cmpxchg intact for codegen.
25976 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
25977 return AtomicExpansionKind::None;
25978 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25979 // implement cmpxchg without spilling. If the address being exchanged is also
25980 // on the stack and close enough to the spill slot, this can lead to a
25981 // situation where the monitor always gets cleared and the atomic operation
25982 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
25983 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25984 return AtomicExpansionKind::None;
25985
25986 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
25987 // it.
25988 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
25989 if (Size > 64)
25990 return AtomicExpansionKind::None;
25991
25992 return AtomicExpansionKind::LLSC;
25993}
25994
25995Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
25996 Type *ValueTy, Value *Addr,
25997 AtomicOrdering Ord) const {
25998 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25999 bool IsAcquire = isAcquireOrStronger(Ord);
26000
26001 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
26002 // intrinsic must return {i64, i64} and we have to recombine them into a
26003 // single i128 here.
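 // Roughly (illustrative IR, editorial sketch):
 //   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
 //   ; the two halves are then zext'd, shifted and or'd back into an i128.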
26004 if (ValueTy->getPrimitiveSizeInBits() == 128) {
26005 Intrinsic::ID Int =
26006 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
26007 Function *Ldxr = Intrinsic::getDeclaration(M, Int);
26008
26009 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
26010
26011 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
26012 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
26013 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
26014 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
26015 return Builder.CreateOr(
26016 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
26017 }
26018
26019 Type *Tys[] = { Addr->getType() };
26020 Intrinsic::ID Int =
26021 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
26022 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
26023
26024 const DataLayout &DL = M->getDataLayout();
26025 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
26026 CallInst *CI = Builder.CreateCall(Ldxr, Addr);
26027 CI->addParamAttr(
26028 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
26029 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
26030
26031 return Builder.CreateBitCast(Trunc, ValueTy);
26032}
26033
26034void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
26035 IRBuilderBase &Builder) const {
26036 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
26037 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
26038}
26039
26040Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
26041 Value *Val, Value *Addr,
26042 AtomicOrdering Ord) const {
26043 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
26044 bool IsRelease = isReleaseOrStronger(Ord);
26045
26046 // Since the intrinsics must have legal type, the i128 intrinsics take two
26047 // parameters: "i64, i64". We must marshal Val into the appropriate form
26048 // before the call.
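 // Roughly (illustrative IR, editorial sketch):
 //   %lo = trunc i128 %val to i64
 //   %hi = trunc i128 (lshr i128 %val, 64) to i64
 //   %fail = call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, ptr %addr)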
26049 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
26050 Intrinsic::ID Int =
26051 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
26052 Function *Stxr = Intrinsic::getDeclaration(M, Int);
26053 Type *Int64Ty = Type::getInt64Ty(M->getContext());
26054
26055 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
26056 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
26057 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
26058 }
26059
26060 Intrinsic::ID Int =
26061 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
26062 Type *Tys[] = { Addr->getType() };
26063 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
26064
26065 const DataLayout &DL = M->getDataLayout();
26066 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
26067 Val = Builder.CreateBitCast(Val, IntValTy);
26068
26069 CallInst *CI = Builder.CreateCall(
26070 Stxr, {Builder.CreateZExtOrBitCast(
26071 Val, Stxr->getFunctionType()->getParamType(0)),
26072 Addr});
26073 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
26074 Attribute::ElementType, Val->getType()));
26075 return CI;
26076}
26077
26078bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
26079 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
26080 const DataLayout &DL) const {
26081 if (!Ty->isArrayTy()) {
26082 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
26083 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
26084 }
26085
26086 // All non aggregate members of the type must have the same type
26087 SmallVector<EVT> ValueVTs;
26088 ComputeValueVTs(*this, DL, Ty, ValueVTs);
26089 return all_equal(ValueVTs);
26090}
26091
26092bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
26093 EVT) const {
26094 return false;
26095}
26096
26097static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
26098 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
26099 Function *ThreadPointerFunc =
26100 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
26101 return IRB.CreatePointerCast(
26102 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
26103 Offset),
26104 IRB.getPtrTy(0));
26105}
26106
26107Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
26108 // Android provides a fixed TLS slot for the stack cookie. See the definition
26109 // of TLS_SLOT_STACK_GUARD in
26110 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
26111 if (Subtarget->isTargetAndroid())
26112 return UseTlsOffset(IRB, 0x28);
26113
26114 // Fuchsia is similar.
26115 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
26116 if (Subtarget->isTargetFuchsia())
26117 return UseTlsOffset(IRB, -0x10);
26118
26119 return TargetLowering::getIRStackGuard(IRB);
26120}
26121
26121void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
26122 // MSVC CRT provides functionalities for stack protection.
26124 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
26125 // MSVC CRT has a global variable holding security cookie.
26126 M.getOrInsertGlobal("__security_cookie",
26127 PointerType::getUnqual(M.getContext()));
26128
26129 // MSVC CRT has a function to validate security cookie.
26130 FunctionCallee SecurityCheckCookie =
26131 M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
26132 Type::getVoidTy(M.getContext()),
26133 PointerType::getUnqual(M.getContext()));
26134 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
26135 F->setCallingConv(CallingConv::Win64);
26136 F->addParamAttr(0, Attribute::AttrKind::InReg);
26137 }
26138 return;
26139 }
26140 TargetLowering::insertSSPDeclarations(M);
26141}
26142
26142Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
26143 // MSVC CRT has a global variable holding security cookie.
26145 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26146 return M.getGlobalVariable("__security_cookie");
26147 return TargetLowering::getSDagStackGuard(M);
26148}
26149
26149Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
26150 // MSVC CRT has a function to validate security cookie.
26152 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26153 return M.getFunction(Subtarget->getSecurityCheckCookieName());
26154 return TargetLowering::getSSPStackGuardCheck(M);
26155}
26156
26157Value *
26158AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
26159 // Android provides a fixed TLS slot for the SafeStack pointer. See the
26160 // definition of TLS_SLOT_SAFESTACK in
26161 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
26162 if (Subtarget->isTargetAndroid())
26163 return UseTlsOffset(IRB, 0x48);
26164
26165 // Fuchsia is similar.
26166 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
26167 if (Subtarget->isTargetFuchsia())
26168 return UseTlsOffset(IRB, -0x8);
26169
26170  return TargetLowering::getSafeStackPointerLocation(IRB);
26171}
26172
26173bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
26174    const Instruction &AndI) const {
26175  // Only sink 'and' mask to cmp use block if it is masking a single bit, since
26176  // this is likely to fold the and/cmp/br into a single tbz instruction. It
26177 // may be beneficial to sink in other cases, but we would have to check that
26178 // the cmp would not get folded into the br to form a cbz for these to be
26179 // beneficial.
26180 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
26181 if (!Mask)
26182 return false;
26183 return Mask->getValue().isPowerOf2();
26184}
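// For example (illustrative), a single-bit test such as
//   if ((x & 0x4) == 0) ...
// can be selected as one "tbz w0, #2, <label>" instead of separate and/cmp/br.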
26185
26186bool AArch64TargetLowering::
26187    shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
26188        SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
26189        unsigned OldShiftOpcode, unsigned NewShiftOpcode,
26190 SelectionDAG &DAG) const {
26191 // Does baseline recommend not to perform the fold by default?
26192  if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
26193          X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
26194 return false;
26195 // Else, if this is a vector shift, prefer 'shl'.
26196 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
26197}
26198
26199TargetLowering::ShiftLegalizationStrategy
26200AArch64TargetLowering::preferredShiftLegalizationStrategy(
26201    SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
26202  if (DAG.getMachineFunction().getFunction().hasMinSize() &&
26203      !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
26204    return ShiftLegalizationStrategy::LowerToLibcall;
26205  return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
26206                                                            ExpansionFactor);
26207}
26208
26209void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
26210  // Update IsSplitCSR in AArch64FunctionInfo.
26211  AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
26212 AFI->setIsSplitCSR(true);
26213}
26214
26215void AArch64TargetLowering::insertCopiesSplitCSR(
26216    MachineBasicBlock *Entry,
26217 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
26218 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
26219 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
26220 if (!IStart)
26221 return;
26222
26223 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
26224 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
26225 MachineBasicBlock::iterator MBBI = Entry->begin();
26226 for (const MCPhysReg *I = IStart; *I; ++I) {
26227 const TargetRegisterClass *RC = nullptr;
26228 if (AArch64::GPR64RegClass.contains(*I))
26229 RC = &AArch64::GPR64RegClass;
26230 else if (AArch64::FPR64RegClass.contains(*I))
26231 RC = &AArch64::FPR64RegClass;
26232 else
26233 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
26234
26235 Register NewVR = MRI->createVirtualRegister(RC);
26236 // Create copy from CSR to a virtual register.
26237 // FIXME: this currently does not emit CFI pseudo-instructions, it works
26238 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
26239 // nounwind. If we want to generalize this later, we may need to emit
26240 // CFI pseudo-instructions.
26241 assert(Entry->getParent()->getFunction().hasFnAttribute(
26242 Attribute::NoUnwind) &&
26243 "Function should be nounwind in insertCopiesSplitCSR!");
26244 Entry->addLiveIn(*I);
26245 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
26246 .addReg(*I);
26247
26248 // Insert the copy-back instructions right before the terminator.
26249 for (auto *Exit : Exits)
26250 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
26251 TII->get(TargetOpcode::COPY), *I)
26252 .addReg(NewVR);
26253 }
26254}
26255
26256bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
26257  // Integer division on AArch64 is expensive. However, when aggressively
26258 // optimizing for code size, we prefer to use a div instruction, as it is
26259 // usually smaller than the alternative sequence.
26260 // The exception to this is vector division. Since AArch64 doesn't have vector
26261 // integer division, leaving the division as-is is a loss even in terms of
26262 // size, because it will have to be scalarized, while the alternative code
26263 // sequence can be performed in vector form.
26264 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
26265 return OptSize && !VT.isVector();
26266}
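// Illustrative example (a sketch of the trade-off above): at minsize a scalar
// division by a constant is kept as a single "sdiv w0, w0, w1", whereas a
// v4i32 division is still reported as not cheap so the usual expansion (e.g.
// multiply-by-magic-constant) is used instead of scalarizing into four sdivs.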
26267
26268bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
26269  // We want inc-of-add for scalars and sub-of-not for vectors.
26270 return VT.isScalarInteger();
26271}
26272
26273bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
26274                                                 EVT VT) const {
26275 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
26276 // legalize.
26277 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
26278 return false;
26279 if (FPVT == MVT::v8bf16)
26280 return false;
26281 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
26282}
26283
26284MachineInstr *
26285AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
26286                                     MachineBasicBlock::iterator &MBBI,
26287                                     const TargetInstrInfo *TII) const {
26288 assert(MBBI->isCall() && MBBI->getCFIType() &&
26289 "Invalid call instruction for a KCFI check");
26290
26291 switch (MBBI->getOpcode()) {
26292 case AArch64::BLR:
26293 case AArch64::BLRNoIP:
26294 case AArch64::TCRETURNri:
26295 case AArch64::TCRETURNrix16x17:
26296 case AArch64::TCRETURNrix17:
26297 case AArch64::TCRETURNrinotx16:
26298 break;
26299 default:
26300 llvm_unreachable("Unexpected CFI call opcode");
26301 }
26302
26303 MachineOperand &Target = MBBI->getOperand(0);
26304 assert(Target.isReg() && "Invalid target operand for an indirect call");
26305 Target.setIsRenamable(false);
26306
26307 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
26308 .addReg(Target.getReg())
26309 .addImm(MBBI->getCFIType())
26310 .getInstr();
26311}
26312
26313bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
26314  return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
26315}
26316
26317unsigned
26318AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
26319  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
26320 return getPointerTy(DL).getSizeInBits();
26321
26322 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
26323}
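// Sketch of the AAPCS64 va_list layout behind the "3 * pointer + 2 * i32"
// figure above (field names are illustrative):
//   struct va_list { void *stack, *gr_top, *vr_top; int gr_offs, vr_offs; };
// i.e. 3 * 64 + 2 * 32 = 256 bits, whereas Darwin and Windows use a plain
// pointer-sized char* va_list.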
26324
26325void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
26326 MachineFrameInfo &MFI = MF.getFrameInfo();
26327 // If we have any vulnerable SVE stack objects then the stack protector
26328 // needs to be placed at the top of the SVE stack area, as the SVE locals
26329 // are placed above the other locals, so we allocate it as if it were a
26330 // scalable vector.
26331 // FIXME: It may be worthwhile having a specific interface for this rather
26332 // than doing it here in finalizeLowering.
26333 if (MFI.hasStackProtectorIndex()) {
26334 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
26335      if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
26336          MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
26337        MFI.setStackID(MFI.getStackProtectorIndex(),
26338                       TargetStackID::ScalableVector);
26339        MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
26340        break;
26341 }
26342 }
26343 }
26344
26345  TargetLoweringBase::finalizeLowering(MF);
26346}
26347
26348// Unlike X86, we let frame lowering assign offsets to all catch objects.
26349bool AArch64TargetLowering::needsFixedCatchObjects() const {
26350  return false;
26351}
26352
26353bool AArch64TargetLowering::shouldLocalize(
26354 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
26355 auto &MF = *MI.getMF();
26356 auto &MRI = MF.getRegInfo();
26357 auto maxUses = [](unsigned RematCost) {
26358 // A cost of 1 means remats are basically free.
26359 if (RematCost == 1)
26360 return std::numeric_limits<unsigned>::max();
26361 if (RematCost == 2)
26362 return 2U;
26363
26364 // Remat is too expensive, only sink if there's one user.
26365 if (RematCost > 2)
26366 return 1U;
26367 llvm_unreachable("Unexpected remat cost");
26368 };
26369
26370 unsigned Opc = MI.getOpcode();
26371 switch (Opc) {
26372 case TargetOpcode::G_GLOBAL_VALUE: {
26373 // On Darwin, TLS global vars get selected into function calls, which
26374    // we don't want localized, as they can get moved into the middle of
26375    // another call sequence.
26376 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
26377 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
26378 return false;
26379 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
26380 }
26381 case TargetOpcode::G_FCONSTANT:
26382 case TargetOpcode::G_CONSTANT: {
26383 const ConstantInt *CI;
26384 unsigned AdditionalCost = 0;
26385
26386 if (Opc == TargetOpcode::G_CONSTANT)
26387 CI = MI.getOperand(1).getCImm();
26388 else {
26389 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
26390 // We try to estimate cost of 32/64b fpimms, as they'll likely be
26391 // materialized as integers.
26392 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
26393 break;
26394 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
26395      bool OptForSize =
26396          MF.getFunction().hasOptSize() || MF.getFunction().hasMinSize();
26397      if (isFPImmLegal(APF, EVT::getFloatingPointVT(Ty.getScalarSizeInBits()),
26398                       OptForSize))
26399 return true; // Constant should be cheap.
26400 CI =
26401 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
26402 // FP materialization also costs an extra move, from gpr to fpr.
26403 AdditionalCost = 1;
26404 }
26405 APInt Imm = CI->getValue();
26406    InstructionCost Cost = TTI->getIntImmCost(
26407        Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
26408    assert(Cost.isValid() && "Expected a valid imm cost");
26409
26410 unsigned RematCost = *Cost.getValue();
26411 RematCost += AdditionalCost;
26412 Register Reg = MI.getOperand(0).getReg();
26413 unsigned MaxUses = maxUses(RematCost);
26414 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
26415 if (MaxUses == std::numeric_limits<unsigned>::max())
26416 --MaxUses;
26417 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
26418 }
26419 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
26420 // localizable.
26421 case AArch64::ADRP:
26422 case AArch64::G_ADD_LOW:
26423 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
26424 case TargetOpcode::G_PTR_ADD:
26425 return true;
26426 default:
26427 break;
26428 }
26429  return TargetLoweringBase::shouldLocalize(MI, TTI);
26430}
26431
26432bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
26433  if (Inst.getType()->isScalableTy())
26434 return true;
26435
26436 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
26437 if (Inst.getOperand(i)->getType()->isScalableTy())
26438 return true;
26439
26440 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
26441 if (AI->getAllocatedType()->isScalableTy())
26442 return true;
26443 }
26444
26445 // Checks to allow the use of SME instructions
26446 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
26447 auto CallerAttrs = SMEAttrs(*Inst.getFunction());
26448 auto CalleeAttrs = SMEAttrs(*Base);
26449 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
26450 CallerAttrs.requiresLazySave(CalleeAttrs) ||
26451 CallerAttrs.requiresPreservingZT0(CalleeAttrs))
26452 return true;
26453 }
26454 return false;
26455}
26456
26457// Return the largest legal scalable vector type that matches VT's element type.
26461 "Expected legal fixed length vector!");
26462 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26463 default:
26464 llvm_unreachable("unexpected element type for SVE container");
26465 case MVT::i8:
26466 return EVT(MVT::nxv16i8);
26467 case MVT::i16:
26468 return EVT(MVT::nxv8i16);
26469 case MVT::i32:
26470 return EVT(MVT::nxv4i32);
26471 case MVT::i64:
26472 return EVT(MVT::nxv2i64);
26473 case MVT::bf16:
26474 return EVT(MVT::nxv8bf16);
26475 case MVT::f16:
26476 return EVT(MVT::nxv8f16);
26477 case MVT::f32:
26478 return EVT(MVT::nxv4f32);
26479 case MVT::f64:
26480 return EVT(MVT::nxv2f64);
26481 }
26482}
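// For example, a legal fixed-length v16i8 or v8i16 operand is mapped to an
// nxv16i8 or nxv8i16 container respectively; only the element type matters
// here, never the fixed element count.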
26483
26484// Return a PTRUE with active lanes corresponding to the extent of VT.
26485static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
26486                                                EVT VT) {
26487  assert(VT.isFixedLengthVector() &&
26488         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
26489         "Expected legal fixed length vector!");
26490
26491 std::optional<unsigned> PgPattern =
26492      getSVEPredPatternFromNumElements(VT.getVectorNumElements());
26493  assert(PgPattern && "Unexpected element count for SVE predicate");
26494
26495 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
26496 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
26497 // variants of instructions when available.
26498 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
26499 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
26500 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
26501 if (MaxSVESize && MinSVESize == MaxSVESize &&
26502 MaxSVESize == VT.getSizeInBits())
26503 PgPattern = AArch64SVEPredPattern::all;
26504
26505 MVT MaskVT;
26506 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26507 default:
26508 llvm_unreachable("unexpected element type for SVE predicate");
26509 case MVT::i8:
26510 MaskVT = MVT::nxv16i1;
26511 break;
26512 case MVT::i16:
26513 case MVT::f16:
26514 case MVT::bf16:
26515 MaskVT = MVT::nxv8i1;
26516 break;
26517 case MVT::i32:
26518 case MVT::f32:
26519 MaskVT = MVT::nxv4i1;
26520 break;
26521 case MVT::i64:
26522 case MVT::f64:
26523 MaskVT = MVT::nxv2i1;
26524 break;
26525 }
26526
26527 return getPTrue(DAG, DL, MaskVT, *PgPattern);
26528}
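// Illustrative example: for a fixed-length v8i16 this yields roughly
// "ptrue p0.h, vl8", i.e. an nxv8i1 predicate with the VL8 pattern, unless the
// SVE register size is known to match the vector exactly, in which case the
// "all" pattern is chosen so unpredicated instruction forms can be used.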
26529
26530static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
26531                                             EVT VT) {
26532  assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
26533         "Expected legal scalable vector!");
26534 auto PredTy = VT.changeVectorElementType(MVT::i1);
26535 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
26536}
26537
26538static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
26539  if (VT.isFixedLengthVector())
26540 return getPredicateForFixedLengthVector(DAG, DL, VT);
26541
26542 return getPredicateForScalableVector(DAG, DL, VT);
26543}
26544
26545// Grow V to consume an entire SVE register.
26546static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
26547  assert(VT.isScalableVector() &&
26548 "Expected to convert into a scalable vector!");
26549 assert(V.getValueType().isFixedLengthVector() &&
26550 "Expected a fixed length vector operand!");
26551 SDLoc DL(V);
26552 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26553 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
26554}
26555
26556// Shrink V so it's just big enough to maintain a VT's worth of data.
26559 "Expected to convert into a fixed length vector!");
26560 assert(V.getValueType().isScalableVector() &&
26561 "Expected a scalable vector operand!");
26562 SDLoc DL(V);
26563 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26564 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
26565}
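// Taken together, the two helpers above are effectively
//   scalable = INSERT_SUBVECTOR(undef, fixed, 0)
//   fixed    = EXTRACT_SUBVECTOR(scalable, 0)
// so fixed-length data always occupies the low-numbered lanes of the SVE
// register while it is processed in scalable form.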
26566
26567// Convert all fixed length vector loads larger than NEON to masked_loads.
26568SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
26569 SDValue Op, SelectionDAG &DAG) const {
26570 auto Load = cast<LoadSDNode>(Op);
26571
26572 SDLoc DL(Op);
26573 EVT VT = Op.getValueType();
26574 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26575 EVT LoadVT = ContainerVT;
26576 EVT MemVT = Load->getMemoryVT();
26577
26578 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26579
26580 if (VT.isFloatingPoint()) {
26581 LoadVT = ContainerVT.changeTypeToInteger();
26582 MemVT = MemVT.changeTypeToInteger();
26583 }
26584
26585 SDValue NewLoad = DAG.getMaskedLoad(
26586 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
26587 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
26588 Load->getAddressingMode(), Load->getExtensionType());
26589
26590 SDValue Result = NewLoad;
26591 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
26592 EVT ExtendVT = ContainerVT.changeVectorElementType(
26593 Load->getMemoryVT().getVectorElementType());
26594
26595 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
26596    Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
26597                         Pg, Result, DAG.getUNDEF(ContainerVT));
26598 } else if (VT.isFloatingPoint()) {
26599 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
26600 }
26601
26602 Result = convertFromScalableVector(DAG, VT, Result);
26603 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
26604 return DAG.getMergeValues(MergedValues, DL);
26605}
26606
26607static SDValue convertFixedMaskToScalableVector(SDValue Mask,
26608                                                SelectionDAG &DAG) {
26609 SDLoc DL(Mask);
26610 EVT InVT = Mask.getValueType();
26611 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26612
26613 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
26614
26615 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26616 return Pg;
26617
26618 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
26619 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
26620
26621  return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
26622                     {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
26623}
26624
26625// Convert all fixed length vector loads larger than NEON to masked_loads.
26626SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
26627 SDValue Op, SelectionDAG &DAG) const {
26628 auto Load = cast<MaskedLoadSDNode>(Op);
26629
26630 SDLoc DL(Op);
26631 EVT VT = Op.getValueType();
26632 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26633
26634 SDValue Mask = Load->getMask();
26635 // If this is an extending load and the mask type is not the same as
26636 // load's type then we have to extend the mask type.
26637 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
26638 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
26639 "Incorrect mask type");
26640 Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask);
26641 }
26642  Mask = convertFixedMaskToScalableVector(Mask, DAG);
26643
26644 SDValue PassThru;
26645 bool IsPassThruZeroOrUndef = false;
26646
26647 if (Load->getPassThru()->isUndef()) {
26648 PassThru = DAG.getUNDEF(ContainerVT);
26649 IsPassThruZeroOrUndef = true;
26650 } else {
26651 if (ContainerVT.isInteger())
26652 PassThru = DAG.getConstant(0, DL, ContainerVT);
26653 else
26654 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
26655 if (isZerosVector(Load->getPassThru().getNode()))
26656 IsPassThruZeroOrUndef = true;
26657 }
26658
26659 SDValue NewLoad = DAG.getMaskedLoad(
26660 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
26661 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
26662 Load->getAddressingMode(), Load->getExtensionType());
26663
26664 SDValue Result = NewLoad;
26665 if (!IsPassThruZeroOrUndef) {
26666 SDValue OldPassThru =
26667 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
26668 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
26669 }
26670
26671 Result = convertFromScalableVector(DAG, VT, Result);
26672 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
26673 return DAG.getMergeValues(MergedValues, DL);
26674}
26675
26676// Convert all fixed length vector stores larger than NEON to masked_stores.
26677SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
26678 SDValue Op, SelectionDAG &DAG) const {
26679 auto Store = cast<StoreSDNode>(Op);
26680
26681 SDLoc DL(Op);
26682 EVT VT = Store->getValue().getValueType();
26683 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26684 EVT MemVT = Store->getMemoryVT();
26685
26686 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26687 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
26688
26689 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
26690 EVT TruncVT = ContainerVT.changeVectorElementType(
26691 Store->getMemoryVT().getVectorElementType());
26692 MemVT = MemVT.changeTypeToInteger();
26693 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
26694 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
26695 DAG.getUNDEF(TruncVT));
26696 NewValue =
26697 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
26698 } else if (VT.isFloatingPoint()) {
26699 MemVT = MemVT.changeTypeToInteger();
26700 NewValue =
26701 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
26702 }
26703
26704 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
26705 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
26706 Store->getMemOperand(), Store->getAddressingMode(),
26707 Store->isTruncatingStore());
26708}
26709
26710SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
26711 SDValue Op, SelectionDAG &DAG) const {
26712 auto *Store = cast<MaskedStoreSDNode>(Op);
26713
26714 SDLoc DL(Op);
26715 EVT VT = Store->getValue().getValueType();
26716 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26717
26718 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
26719  SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
26720
26721 return DAG.getMaskedStore(
26722 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
26723 Mask, Store->getMemoryVT(), Store->getMemOperand(),
26724 Store->getAddressingMode(), Store->isTruncatingStore());
26725}
26726
26727SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
26728 SDValue Op, SelectionDAG &DAG) const {
26729 SDLoc dl(Op);
26730 EVT VT = Op.getValueType();
26731 EVT EltVT = VT.getVectorElementType();
26732
26733 bool Signed = Op.getOpcode() == ISD::SDIV;
26734 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
26735
26736 bool Negated;
26737 uint64_t SplatVal;
26738 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
26739 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26740 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
26741 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
26742
26743 SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
26744 SDValue Res =
26745 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
26746 if (Negated)
26747 Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
26748 DAG.getConstant(0, dl, ContainerVT), Res);
26749
26750 return convertFromScalableVector(DAG, VT, Res);
26751 }
26752
26753 // Scalable vector i32/i64 DIV is supported.
26754 if (EltVT == MVT::i32 || EltVT == MVT::i64)
26755 return LowerToPredicatedOp(Op, DAG, PredOpcode);
26756
26757 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
26758 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
26759 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
26760 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
26761
26762 // If the wider type is legal: extend, op, and truncate.
26763 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
26764 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
26765 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
26766 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
26767 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
26768 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
26769 }
26770
26771 auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
26772 &ExtendOpcode](SDValue Op) {
26773 SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
26774 SDValue IdxHalf =
26775 DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
26776 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
26777 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
26778 return std::pair<SDValue, SDValue>(
26779 {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
26780 DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
26781 };
26782
26783 // If wider type is not legal: split, extend, op, trunc and concat.
26784 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
26785 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
26786 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
26787 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
26788 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
26789 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
26790 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
26791}
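// Worked example (sketch): a fixed-length v16i8 sdiv has no direct SVE
// instruction, so the operands are split into halves and extended to wider
// elements, the divide is retried on the wider type (recursively, until the
// supported i32/i64 DIV is reached), and the results are truncated and
// concatenated back into the original v16i8 result.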
26792
26793SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
26794 SDValue Op, SelectionDAG &DAG) const {
26795 EVT VT = Op.getValueType();
26796 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26797
26798 SDLoc DL(Op);
26799 SDValue Val = Op.getOperand(0);
26800 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
26801 Val = convertToScalableVector(DAG, ContainerVT, Val);
26802
26803 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
26804 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
26805
26806 // Repeatedly unpack Val until the result is of the desired element type.
26807 switch (ContainerVT.getSimpleVT().SimpleTy) {
26808 default:
26809 llvm_unreachable("unimplemented container type");
26810 case MVT::nxv16i8:
26811 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
26812 if (VT.getVectorElementType() == MVT::i16)
26813 break;
26814 [[fallthrough]];
26815 case MVT::nxv8i16:
26816 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
26817 if (VT.getVectorElementType() == MVT::i32)
26818 break;
26819 [[fallthrough]];
26820 case MVT::nxv4i32:
26821 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
26822 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
26823 break;
26824 }
26825
26826 return convertFromScalableVector(DAG, VT, Val);
26827}
26828
26829SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
26830 SDValue Op, SelectionDAG &DAG) const {
26831 EVT VT = Op.getValueType();
26832 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26833
26834 SDLoc DL(Op);
26835 SDValue Val = Op.getOperand(0);
26836 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
26837 Val = convertToScalableVector(DAG, ContainerVT, Val);
26838
26839 // Repeatedly truncate Val until the result is of the desired element type.
26840 switch (ContainerVT.getSimpleVT().SimpleTy) {
26841 default:
26842 llvm_unreachable("unimplemented container type");
26843 case MVT::nxv2i64:
26844 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
26845 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
26846 if (VT.getVectorElementType() == MVT::i32)
26847 break;
26848 [[fallthrough]];
26849 case MVT::nxv4i32:
26850 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
26851 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
26852 if (VT.getVectorElementType() == MVT::i16)
26853 break;
26854 [[fallthrough]];
26855 case MVT::nxv8i16:
26856 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
26857 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
26858 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
26859 break;
26860 }
26861
26862 return convertFromScalableVector(DAG, VT, Val);
26863}
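// Illustrative walk-through: truncating an nxv2i64 container down to i8
// elements performs three UZP1 steps (i64->i32, i32->i16, i16->i8), each time
// keeping the even-numbered subelements, which on little-endian are exactly
// the low halves of the original elements.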
26864
26865SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
26866 SDValue Op, SelectionDAG &DAG) const {
26867 EVT VT = Op.getValueType();
26868 EVT InVT = Op.getOperand(0).getValueType();
26869 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
26870
26871 SDLoc DL(Op);
26872 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26873 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
26874
26875 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
26876}
26877
26878SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
26879 SDValue Op, SelectionDAG &DAG) const {
26880 EVT VT = Op.getValueType();
26881 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26882
26883 SDLoc DL(Op);
26884 EVT InVT = Op.getOperand(0).getValueType();
26885 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26886 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
26887
26888 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
26889 Op.getOperand(1), Op.getOperand(2));
26890
26891 return convertFromScalableVector(DAG, VT, ScalableRes);
26892}
26893
26894// Convert vector operation 'Op' to an equivalent predicated operation whereby
26895// the original operation's type is used to construct a suitable predicate.
26896// NOTE: The results for inactive lanes are undefined.
26897SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
26898 SelectionDAG &DAG,
26899 unsigned NewOp) const {
26900 EVT VT = Op.getValueType();
26901 SDLoc DL(Op);
26902 auto Pg = getPredicateForVector(DAG, DL, VT);
26903
26904 if (VT.isFixedLengthVector()) {
26905 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
26906 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26907
26908 // Create list of operands by converting existing ones to scalable types.
26909    SmallVector<SDValue, 4> Operands;
26910    for (const SDValue &V : Op->op_values()) {
26911 if (isa<CondCodeSDNode>(V)) {
26912 Operands.push_back(V);
26913 continue;
26914 }
26915
26916 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
26917 EVT VTArg = VTNode->getVT().getVectorElementType();
26918 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
26919 Operands.push_back(DAG.getValueType(NewVTArg));
26920 continue;
26921 }
26922
26923 assert(isTypeLegal(V.getValueType()) &&
26924 "Expected only legal fixed-width types");
26925 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
26926 }
26927
26928 if (isMergePassthruOpcode(NewOp))
26929 Operands.push_back(DAG.getUNDEF(ContainerVT));
26930
26931 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
26932 return convertFromScalableVector(DAG, VT, ScalableRes);
26933 }
26934
26935 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
26936
26937  SmallVector<SDValue, 4> Operands;
26938  for (const SDValue &V : Op->op_values()) {
26939 assert((!V.getValueType().isVector() ||
26940 V.getValueType().isScalableVector()) &&
26941 "Only scalable vectors are supported!");
26942 Operands.push_back(V);
26943 }
26944
26945 if (isMergePassthruOpcode(NewOp))
26946 Operands.push_back(DAG.getUNDEF(VT));
26947
26948 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
26949}
26950
26951// If a fixed length vector operation has no side effects when applied to
26952// undefined elements, we can safely use scalable vectors to perform the same
26953// operation without needing to worry about predication.
26954SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
26955 SelectionDAG &DAG) const {
26956 EVT VT = Op.getValueType();
26958 "Only expected to lower fixed length vector operation!");
26959 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26960
26961 // Create list of operands by converting existing ones to scalable types.
26962  SmallVector<SDValue, 4> Ops;
26963  for (const SDValue &V : Op->op_values()) {
26964 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
26965
26966 // Pass through non-vector operands.
26967 if (!V.getValueType().isVector()) {
26968 Ops.push_back(V);
26969 continue;
26970 }
26971
26972 // "cast" fixed length vector to a scalable vector.
26973 assert(V.getValueType().isFixedLengthVector() &&
26974 isTypeLegal(V.getValueType()) &&
26975 "Only fixed length vectors are supported!");
26976 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
26977 }
26978
26979 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
26980 return convertFromScalableVector(DAG, VT, ScalableRes);
26981}
26982
26983SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
26984 SelectionDAG &DAG) const {
26985 SDLoc DL(ScalarOp);
26986 SDValue AccOp = ScalarOp.getOperand(0);
26987 SDValue VecOp = ScalarOp.getOperand(1);
26988 EVT SrcVT = VecOp.getValueType();
26989 EVT ResVT = SrcVT.getVectorElementType();
26990
26991 EVT ContainerVT = SrcVT;
26992 if (SrcVT.isFixedLengthVector()) {
26993 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
26994 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
26995 }
26996
26997 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
26998 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26999
27000 // Convert operands to Scalable.
27001 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
27002 DAG.getUNDEF(ContainerVT), AccOp, Zero);
27003
27004 // Perform reduction.
27005 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
27006 Pg, AccOp, VecOp);
27007
27008 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
27009}
27010
27011SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
27012 SelectionDAG &DAG) const {
27013 SDLoc DL(ReduceOp);
27014 SDValue Op = ReduceOp.getOperand(0);
27015 EVT OpVT = Op.getValueType();
27016 EVT VT = ReduceOp.getValueType();
27017
27018 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
27019 return SDValue();
27020
27021 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
27022
27023 switch (ReduceOp.getOpcode()) {
27024 default:
27025 return SDValue();
27026 case ISD::VECREDUCE_OR:
27027 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
27028 // The predicate can be 'Op' because
27029 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
27030 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
27031 else
27032 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
27033 case ISD::VECREDUCE_AND: {
27034 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
27035 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
27036 }
27037 case ISD::VECREDUCE_XOR: {
27038 SDValue ID =
27039 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
27040 if (OpVT == MVT::nxv1i1) {
27041 // Emulate a CNTP on .Q using .D and a different governing predicate.
27042 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
27043 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
27044 }
27045 SDValue Cntp =
27046 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
27047 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
27048 }
27049 }
27050
27051 return SDValue();
27052}
27053
27054SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
27055 SDValue ScalarOp,
27056 SelectionDAG &DAG) const {
27057 SDLoc DL(ScalarOp);
27058 SDValue VecOp = ScalarOp.getOperand(0);
27059 EVT SrcVT = VecOp.getValueType();
27060
27061  if (useSVEForFixedLengthVectorVT(
27062          SrcVT,
27063 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
27064 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
27065 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
27066 }
27067
27068 // UADDV always returns an i64 result.
27069 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
27070 SrcVT.getVectorElementType();
27071 EVT RdxVT = SrcVT;
27072 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
27073 RdxVT = getPackedSVEVectorVT(ResVT);
27074
27075 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
27076 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
27077 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
27078 Rdx, DAG.getConstant(0, DL, MVT::i64));
27079
27080 // The VEC_REDUCE nodes expect an element size result.
27081 if (ResVT != ScalarOp.getValueType())
27082 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
27083
27084 return Res;
27085}
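// Example (sketch): vecreduce_add on a fixed-length v4i32 becomes a predicated
// UADDV over the nxv4i32 container, which leaves an i64 result in lane 0 of an
// nxv2i64; the extracted scalar is then truncated back to the i32 that the
// VECREDUCE node expects.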
27086
27087SDValue
27088AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
27089 SelectionDAG &DAG) const {
27090 EVT VT = Op.getValueType();
27091 SDLoc DL(Op);
27092
27093 EVT InVT = Op.getOperand(1).getValueType();
27094 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27095 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
27096 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
27097
27098  // Convert the mask to a predicate (NOTE: We don't need to worry about
27099 // inactive lanes since VSELECT is safe when given undefined elements).
27100 EVT MaskVT = Op.getOperand(0).getValueType();
27101 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
27102 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
27103  Mask = DAG.getNode(ISD::TRUNCATE, DL,
27104                     MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
27105
27106 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
27107 Mask, Op1, Op2);
27108
27109 return convertFromScalableVector(DAG, VT, ScalableRes);
27110}
27111
27112SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
27113 SDValue Op, SelectionDAG &DAG) const {
27114 SDLoc DL(Op);
27115 EVT InVT = Op.getOperand(0).getValueType();
27116 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27117
27118 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
27119 "Only expected to lower fixed length vector operation!");
27120 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
27121 "Expected integer result of the same bit length as the inputs!");
27122
27123 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
27124 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
27125 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
27126
27127 EVT CmpVT = Pg.getValueType();
27128 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
27129 {Pg, Op1, Op2, Op.getOperand(2)});
27130
27131 EVT PromoteVT = ContainerVT.changeTypeToInteger();
27132 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
27133 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
27134}
27135
27136SDValue
27137AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
27138 SelectionDAG &DAG) const {
27139 SDLoc DL(Op);
27140 auto SrcOp = Op.getOperand(0);
27141 EVT VT = Op.getValueType();
27142 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27143 EVT ContainerSrcVT =
27144 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
27145
27146 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
27147 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
27148 return convertFromScalableVector(DAG, VT, Op);
27149}
27150
27151SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
27152 SDValue Op, SelectionDAG &DAG) const {
27153 SDLoc DL(Op);
27154 unsigned NumOperands = Op->getNumOperands();
27155
27156 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
27157 "Unexpected number of operands in CONCAT_VECTORS");
27158
27159 auto SrcOp1 = Op.getOperand(0);
27160 auto SrcOp2 = Op.getOperand(1);
27161 EVT VT = Op.getValueType();
27162 EVT SrcVT = SrcOp1.getValueType();
27163
27164 if (NumOperands > 2) {
27165    SmallVector<SDValue, 4> Ops;
27166    EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
27167 for (unsigned I = 0; I < NumOperands; I += 2)
27168 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
27169 Op->getOperand(I), Op->getOperand(I + 1)));
27170
27171 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
27172 }
27173
27174 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27175
27176  SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27177  SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
27178 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
27179
27180 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
27181
27182 return convertFromScalableVector(DAG, VT, Op);
27183}
27184
27185SDValue
27186AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
27187 SelectionDAG &DAG) const {
27188 EVT VT = Op.getValueType();
27189 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27190
27191 SDLoc DL(Op);
27192 SDValue Val = Op.getOperand(0);
27193 SDValue Pg = getPredicateForVector(DAG, DL, VT);
27194 EVT SrcVT = Val.getValueType();
27195 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27196 EVT ExtendVT = ContainerVT.changeVectorElementType(
27197 SrcVT.getVectorElementType());
27198
27199 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27200 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
27201
27202 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
27203 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
27204 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
27205 Pg, Val, DAG.getUNDEF(ContainerVT));
27206
27207 return convertFromScalableVector(DAG, VT, Val);
27208}
27209
27210SDValue
27211AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
27212 SelectionDAG &DAG) const {
27213 EVT VT = Op.getValueType();
27214 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27215
27216 SDLoc DL(Op);
27217 SDValue Val = Op.getOperand(0);
27218 EVT SrcVT = Val.getValueType();
27219 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27220 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
27221      VT.getVectorElementType());
27222  SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
27223
27224 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27225 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
27226 Op.getOperand(1), DAG.getUNDEF(RoundVT));
27227 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
27228 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27229
27230 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27231 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27232}
27233
27234SDValue
27235AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
27236 SelectionDAG &DAG) const {
27237 EVT VT = Op.getValueType();
27238 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27239
27240 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
27241 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
27242                             : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
27243
27244 SDLoc DL(Op);
27245 SDValue Val = Op.getOperand(0);
27246 EVT SrcVT = Val.getValueType();
27247 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27248 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27249
27250 if (VT.bitsGE(SrcVT)) {
27251    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27252
27253 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
27254 VT.changeTypeToInteger(), Val);
27255
27256 // Safe to use a larger than specified operand because by promoting the
27257 // value nothing has changed from an arithmetic point of view.
27258 Val =
27259 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
27260 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27261 DAG.getUNDEF(ContainerDstVT));
27262 return convertFromScalableVector(DAG, VT, Val);
27263 } else {
27264 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
27265 ContainerDstVT.getVectorElementType());
27266    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
27267
27268 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27269 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27270 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
27271 Val = convertFromScalableVector(DAG, SrcVT, Val);
27272
27273 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27274 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27275 }
27276}
27277
27278SDValue
27279AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
27280 SelectionDAG &DAG) const {
27281 SDLoc DL(Op);
27282 EVT OpVT = Op.getValueType();
27283 assert(OpVT.isScalableVector() &&
27284 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
27285 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
27286 Op.getOperand(1));
27287 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
27288 Op.getOperand(1));
27289 return DAG.getMergeValues({Even, Odd}, DL);
27290}
27291
27292SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
27293 SelectionDAG &DAG) const {
27294 SDLoc DL(Op);
27295 EVT OpVT = Op.getValueType();
27296 assert(OpVT.isScalableVector() &&
27297 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
27298
27299 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
27300 Op.getOperand(1));
27301 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
27302 Op.getOperand(1));
27303 return DAG.getMergeValues({Lo, Hi}, DL);
27304}
27305
27306SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
27307 SelectionDAG &DAG) const {
27308 // FIXME: Maybe share some code with LowerMGather/Scatter?
27309 MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
27310 SDLoc DL(HG);
27311 SDValue Chain = HG->getChain();
27312 SDValue Inc = HG->getInc();
27313 SDValue Mask = HG->getMask();
27314 SDValue Ptr = HG->getBasePtr();
27315 SDValue Index = HG->getIndex();
27316 SDValue Scale = HG->getScale();
27317 SDValue IntID = HG->getIntID();
27318
27319 // The Intrinsic ID determines the type of update operation.
27320 [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
27321 // Right now, we only support 'add' as an update.
27322 assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
27323 "Unexpected histogram update operation");
27324
27325 EVT IncVT = Inc.getValueType();
27326 EVT IndexVT = Index.getValueType();
27327 EVT MemVT = EVT::getVectorVT(*DAG.getContext(), IncVT,
27328 IndexVT.getVectorElementCount());
27329 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
27330 SDValue PassThru = DAG.getSplatVector(MemVT, DL, Zero);
27331 SDValue IncSplat = DAG.getSplatVector(MemVT, DL, Inc);
27332 SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
27333
27334 MachineMemOperand *MMO = HG->getMemOperand();
27335 // Create an MMO for the gather, without load|store flags.
27336  MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
27337      MMO->getPointerInfo(), MachineMemOperand::MOLoad, MMO->getSize(),
27338      MMO->getAlign(), MMO->getAAInfo());
27339 ISD::MemIndexType IndexType = HG->getIndexType();
27340 SDValue Gather =
27341 DAG.getMaskedGather(DAG.getVTList(MemVT, MVT::Other), MemVT, DL, Ops,
27342 GMMO, IndexType, ISD::NON_EXTLOAD);
27343
27344 SDValue GChain = Gather.getValue(1);
27345
27346 // Perform the histcnt, multiply by inc, add to bucket data.
27347 SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncVT);
27348 SDValue HistCnt =
27349 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
27350 SDValue Mul = DAG.getNode(ISD::MUL, DL, MemVT, HistCnt, IncSplat);
27351 SDValue Add = DAG.getNode(ISD::ADD, DL, MemVT, Gather, Mul);
27352
27353 // Create an MMO for the scatter, without load|store flags.
27354  MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
27355      MMO->getPointerInfo(), MachineMemOperand::MOStore, MMO->getSize(),
27356      MMO->getAlign(), MMO->getAAInfo());
27357
27358 SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
27359 SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
27360 ScatterOps, SMMO, IndexType, false);
27361 return Scatter;
27362}
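// Roughly, the sequence produced above is (a sketch, not literal DAG syntax):
//   buckets = masked.gather(ptr + index)      ; current bucket contents
//   counts  = sve.histcnt(index, index)       ; matches per lane
//   update  = counts * splat(inc)
//   masked.scatter(buckets + update, ptr + index)
// so repeated indices in one vector still end up adding the right total.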
27363
27364SDValue
27365AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
27366 SelectionDAG &DAG) const {
27367 EVT VT = Op.getValueType();
27368 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27369
27370 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
27371 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
27372                             : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
27373
27374 SDLoc DL(Op);
27375 SDValue Val = Op.getOperand(0);
27376 EVT SrcVT = Val.getValueType();
27377 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27378 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27379
27380 if (VT.bitsGT(SrcVT)) {
27381 EVT CvtVT = ContainerDstVT.changeVectorElementType(
27382 ContainerSrcVT.getVectorElementType());
27383    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27384
27385 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27386 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
27387
27388 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
27389 Val = getSVESafeBitCast(CvtVT, Val, DAG);
27390 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27391 DAG.getUNDEF(ContainerDstVT));
27392 return convertFromScalableVector(DAG, VT, Val);
27393 } else {
27394 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
27395    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
27396
27397 // Safe to use a larger than specified result since an fp_to_int where the
27398 // result doesn't fit into the destination is undefined.
27399 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27400 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27401 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27402
27403 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
27404 }
27405}
27406
27407static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
27408                                         ArrayRef<int> ShuffleMask, EVT VT,
27409 EVT ContainerVT, SelectionDAG &DAG) {
27410 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
27411 SDLoc DL(Op);
27412 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
27413 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
27414 bool IsSingleOp =
27415 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
27416
27417 if (!Subtarget.isNeonAvailable() && !MinSVESize)
27418 MinSVESize = 128;
27419
27420  // Bail out on two-operand shuffles if SVE2 is unavailable or not all index
27421  // numbers can be represented.
27422 if (!IsSingleOp && !Subtarget.hasSVE2())
27423 return SDValue();
27424
27425 EVT VTOp1 = Op.getOperand(0).getValueType();
27426 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
27427 unsigned IndexLen = MinSVESize / BitsPerElt;
27428 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
27429 uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
27430 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
27431 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
27432 bool MinMaxEqual = (MinSVESize == MaxSVESize);
27433 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
27434 "Incorrectly legalised shuffle operation");
27435
27437 // If MinSVESize is not equal to MaxSVESize then we need to know which
27438 // TBL mask element needs adjustment.
27439 SmallVector<SDValue, 8> AddRuntimeVLMask;
27440
27441 // Bail out for 8-bits element types, because with 2048-bit SVE register
27442 // size 8 bits is only sufficient to index into the first source vector.
27443 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
27444 return SDValue();
27445
27446 for (int Index : ShuffleMask) {
27447 // Handling poison index value.
27448 if (Index < 0)
27449 Index = 0;
27450 // If the mask refers to elements in the second operand, then we have to
27451    // offset the index by the number of elements in a vector. If this number
27452    // is not known at compile-time, we need to maintain a mask with 'VL' values
27453 // to add at runtime.
27454 if ((unsigned)Index >= ElementsPerVectorReg) {
27455 if (MinMaxEqual) {
27456 Index += IndexLen - ElementsPerVectorReg;
27457 } else {
27458 Index = Index - ElementsPerVectorReg;
27459 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
27460 }
27461 } else if (!MinMaxEqual)
27462 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27463    // For 8-bit elements with a 1024-bit SVE register, MaxOffset equals 255,
27464    // which might point to the last element in the second operand of the
27465    // shufflevector, thus we reject this transform.
27466 if ((unsigned)Index >= MaxOffset)
27467 return SDValue();
27468 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
27469 }
27470
27471  // Choosing an out-of-range index leads to the lane being zeroed vs a zero
27472  // value where it would perform first-lane duplication for out-of-range
27473  // elements. For i8 elements an out-of-range index could still be valid
27474  // for a 2048-bit vector register size.
27475 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
27476 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
27477 if (!MinMaxEqual)
27478 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27479 }
27480
27481 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
27482 SDValue VecMask =
27483 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27484 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
27485
27486 SDValue Shuffle;
27487 if (IsSingleOp)
27488 Shuffle =
27489 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27490 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
27491 Op1, SVEMask);
27492 else if (Subtarget.hasSVE2()) {
27493 if (!MinMaxEqual) {
27494 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
27495 SDValue VScale = (BitsPerElt == 64)
27496 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
27497 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
27498 SDValue VecMask =
27499 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27500 SDValue MulByMask = DAG.getNode(
27501 ISD::MUL, DL, MaskType,
27502 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
27503 DAG.getBuildVector(MaskType, DL,
27504 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
27505 SDValue UpdatedVecMask =
27506 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
27507 SVEMask = convertToScalableVector(
27508 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
27509 }
27510 Shuffle =
27511 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27512 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
27513 Op1, Op2, SVEMask);
27514 }
27515 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
27516 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
27517}
27518
27519SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
27520 SDValue Op, SelectionDAG &DAG) const {
27521 EVT VT = Op.getValueType();
27522 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27523
27524 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
27525 auto ShuffleMask = SVN->getMask();
27526
27527 SDLoc DL(Op);
27528 SDValue Op1 = Op.getOperand(0);
27529 SDValue Op2 = Op.getOperand(1);
27530
27531 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27532 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
27533 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
27534
27535 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
27536 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
27537 return MVT::i32;
27538 return ScalarTy;
27539 };
27540
27541 if (SVN->isSplat()) {
27542 unsigned Lane = std::max(0, SVN->getSplatIndex());
27543 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27544 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27545 DAG.getConstant(Lane, DL, MVT::i64));
27546 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
27547 return convertFromScalableVector(DAG, VT, Op);
27548 }
27549
27550 bool ReverseEXT = false;
27551 unsigned Imm;
27552 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
27553 Imm == VT.getVectorNumElements() - 1) {
27554 if (ReverseEXT)
27555 std::swap(Op1, Op2);
27556 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27557 SDValue Scalar = DAG.getNode(
27558 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27559 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
27560 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
27561 return convertFromScalableVector(DAG, VT, Op);
27562 }
27563
27564 unsigned EltSize = VT.getScalarSizeInBits();
27565 for (unsigned LaneSize : {64U, 32U, 16U}) {
27566 if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), LaneSize)) {
27567      EVT NewVT =
27568          getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
27569      unsigned RevOp;
27570      if (EltSize == 8)
27571        RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
27572      else if (EltSize == 16)
27573        RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
27574      else
27575        RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
27576
27577 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27578 Op = LowerToPredicatedOp(Op, DAG, RevOp);
27579 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27580 return convertFromScalableVector(DAG, VT, Op);
27581 }
27582 }
27583
27584 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
27585 isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
27586 if (!VT.isFloatingPoint())
27587 return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27588
27589    EVT NewVT = getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), 64));
27590    Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27591 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27592 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27593 return convertFromScalableVector(DAG, VT, Op);
27594 }
27595
27596 unsigned WhichResult;
27597 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
27598 WhichResult == 0)
27599    return convertFromScalableVector(
27600        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
27601
27602 if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
27603 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27604    return convertFromScalableVector(
27605        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
27606 }
27607
27608 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
27609    return convertFromScalableVector(
27610        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
27611
27612 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
27613 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27614    return convertFromScalableVector(
27615        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
27616 }
27617
27618 // Functions like isZIPMask return true when a ISD::VECTOR_SHUFFLE's mask
27619 // represents the same logical operation as performed by a ZIP instruction. In
27620 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
27621 // equivalent to an AArch64 instruction. There's the extra component of
27622 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
27623 // only operated on 64/128bit vector types that have a direct mapping to a
27624 // target register and so an exact mapping is implied.
27625 // However, when using SVE for fixed length vectors, most legal vector types
27626 // are actually sub-vectors of a larger SVE register. When mapping
27627 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
27628 // how the mask's indices translate. Specifically, when the mapping requires
27629 // an exact meaning for a specific vector index (e.g. Index X is the last
27630 // vector element in the register) then such mappings are often only safe when
27631  // the exact SVE register size is known. The main exception to this is when
27632 // indices are logically relative to the first element of either
27633 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
27634 // when converting from fixed-length to scalable vector types (i.e. the start
27635 // of a fixed length vector is always the start of a scalable vector).
27636 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
27637 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
27638 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
27639 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
27640 Op2.isUndef()) {
27641 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
27642 return convertFromScalableVector(DAG, VT, Op);
27643 }
27644
27645 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
27646 WhichResult != 0)
27647 return convertFromScalableVector(
27648 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
27649
27650 if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
27651 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27652 return convertFromScalableVector(
27653 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
27654 }
27655
27656 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
27657 return convertFromScalableVector(
27658 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
27659
27660 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
27661 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27662 return convertFromScalableVector(
27663 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
27664 }
27665 }
27666
27667 // Avoid producing a TBL instruction if we don't know the minimal SVE register
27668 // size, unless NEON is not available and we can assume the minimal SVE
27669 // register size is 128 bits.
27670 if (MinSVESize || !Subtarget->isNeonAvailable())
27671 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
27672 DAG);
27673
27674 return SDValue();
27675}
27676
27677SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
27678 SelectionDAG &DAG) const {
27679 SDLoc DL(Op);
27680 EVT InVT = Op.getValueType();
27681
27682 assert(VT.isScalableVector() && isTypeLegal(VT) &&
27683 InVT.isScalableVector() && isTypeLegal(InVT) &&
27684 "Only expect to cast between legal scalable vector types!");
27685 assert(VT.getVectorElementType() != MVT::i1 &&
27686 InVT.getVectorElementType() != MVT::i1 &&
27687 "For predicate bitcasts, use getSVEPredicateBitCast");
27688
27689 if (InVT == VT)
27690 return Op;
27691
27692 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
27693 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
27694
27695 // Safe bitcasting between unpacked vector types of different element counts
27696 // is currently unsupported because the following is missing the necessary
27697 // work to ensure the result's elements live where they're supposed to within
27698 // an SVE register.
27699 // 01234567
27700 // e.g. nxv2i32 = XX??XX??
27701 // nxv4f16 = X?X?X?X?
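// Both layouts above occupy containers of the same total size, but their
// elements sit in different lanes, so a plain reinterpret between them would
// not keep the data in the expected positions.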
27702 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
27703 VT == PackedVT || InVT == PackedInVT) &&
27704 "Unexpected bitcast!");
27705
27706 // Pack input if required.
27707 if (InVT != PackedInVT)
27708 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
27709
27710 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
27711
27712 // Unpack result if required.
27713 if (VT != PackedVT)
27714 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
27715
27716 return Op;
27717}
27718
27719 bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
27720 SDValue N) const {
27721 return ::isAllActivePredicate(DAG, N);
27722}
27723
27724 EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
27725 return ::getPromotedVTForPredicate(VT);
27726}
27727
27728bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
27729 SDValue Op, const APInt &OriginalDemandedBits,
27730 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
27731 unsigned Depth) const {
27732
27733 unsigned Opc = Op.getOpcode();
27734 switch (Opc) {
27735 case AArch64ISD::VSHL: {
27736 // Match (VSHL (VLSHR Val X) X)
27737 SDValue ShiftL = Op;
27738 SDValue ShiftR = Op->getOperand(0);
27739 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
27740 return false;
27741
27742 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
27743 return false;
27744
27745 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
27746 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
27747
27748 // Other cases can be handled as well, but this is not
27749 // implemented.
27750 if (ShiftRBits != ShiftLBits)
27751 return false;
27752
27753 unsigned ScalarSize = Op.getScalarValueSizeInBits();
27754 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
27755
27756 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
27757 APInt UnusedBits = ~OriginalDemandedBits;
27758
27759 if ((ZeroBits & UnusedBits) != ZeroBits)
27760 return false;
27761
27762 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
27763 // used - simplify to just Val.
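// e.g. with 32-bit lanes and X == 8 the pair only zeroes bits [7:0]; if the
// caller never demands those bits, Val itself is equivalent.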
27764 return TLO.CombineTo(Op, ShiftR->getOperand(0));
27765 }
27766 case AArch64ISD::BICi: {
27767 // Fold away the BICi if all the bits it would clear are already known to be zero.
27768 SDValue Op0 = Op.getOperand(0);
27769 KnownBits KnownOp0 =
27770 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
27771 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
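// e.g. BICi(x, 0xff, 8) clears bits [15:8]; if KnownOp0 already proves those
// bits of x are zero, the BICi contributes nothing and x can be used directly.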
27772 uint64_t BitsToClear = Op->getConstantOperandVal(1)
27773 << Op->getConstantOperandVal(2);
27774 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
27775 if (APInt(Known.getBitWidth(), BitsToClear)
27776 .isSubsetOf(AlreadyZeroedBitsToClear))
27777 return TLO.CombineTo(Op, Op0);
27778
27779 Known = KnownOp0 &
27780 KnownBits::makeConstant(APInt(Known.getBitWidth(), ~BitsToClear));
27781
27782 return false;
27783 }
27784 case ISD::INTRINSIC_WO_CHAIN: {
27785 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
27786 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
27787 if (!MaxSVEVectorSizeInBits)
27788 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
27789 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
27790 // The SVE count intrinsics don't support the multiplier immediate so we
27791 // don't have to account for that here. The value returned may be slightly
27792 // over the true required bits, as this is based on the "ALL" pattern. The
27793 // other patterns are also exposed by these intrinsics, but they all
27794 // return a value that's strictly less than "ALL".
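// e.g. with the default 2048-bit maximum vector length, CNTD can return at
// most 32, so only the low 6 bits of the result can ever be set.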
27795 unsigned RequiredBits = llvm::bit_width(MaxElements);
27796 unsigned BitWidth = Known.Zero.getBitWidth();
27797 if (RequiredBits < BitWidth)
27798 Known.Zero.setHighBits(BitWidth - RequiredBits);
27799 return false;
27800 }
27801 }
27802 }
27803
27804 return TargetLowering::SimplifyDemandedBitsForTargetNode(
27805 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
27806}
27807
27808bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
27809 return Op.getOpcode() == AArch64ISD::DUP ||
27810 Op.getOpcode() == AArch64ISD::MOVI ||
27811 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
27812 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
27813 TargetLowering::isTargetCanonicalConstantNode(Op);
27814}
27815
27816 bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
27817 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
27818 Subtarget->hasComplxNum();
27819}
27820
27821 bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
27822 ComplexDeinterleavingOperation Operation, Type *Ty) const {
27823 auto *VTy = dyn_cast<VectorType>(Ty);
27824 if (!VTy)
27825 return false;
27826
27827 // If the vector is scalable, SVE is enabled, implying support for complex
27828 // numbers. Otherwise, we need to ensure complex number support is available
27829 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
27830 return false;
27831
27832 auto *ScalarTy = VTy->getScalarType();
27833 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
27834
27835 // We can only process vectors that are at least 128 bits wide (or exactly 64
27836 // bits for a fixed-length NEON vector). Additionally, these vectors must have
27837 // a power-of-2 size, as we later split them into the smallest supported size
27838 // and merge them back together after applying the complex operation.
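// e.g. a 256-bit <8 x float> passes this check and is later split into two
// 128-bit halves, while a 96-bit <3 x float> is rejected.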
27839 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
27840 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
27841 !llvm::isPowerOf2_32(VTyWidth))
27842 return false;
27843
27844 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
27845 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
27846 return 8 <= ScalarWidth && ScalarWidth <= 64;
27847 }
27848
27849 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
27850 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
27851}
27852
27853 Value *AArch64TargetLowering::createComplexDeinterleavingIR(
27854 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
27855 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
27856 Value *Accumulator) const {
27857 VectorType *Ty = cast<VectorType>(InputA->getType());
27858 bool IsScalable = Ty->isScalableTy();
27859 bool IsInt = Ty->getElementType()->isIntegerTy();
27860
27861 unsigned TyWidth =
27862 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
27863
27864 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
27865 "Vector type must be either 64 or a power of 2 that is at least 128");
27866
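// Vectors wider than 128 bits are handled by splitting the operands in half,
// emitting the operation recursively on each half, and reassembling the two
// results, so only the base 64/128-bit cases are emitted directly below.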
27867 if (TyWidth > 128) {
27868 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
27869 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
27870 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
27871 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
27872 auto *UpperSplitA =
27873 B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
27874 auto *UpperSplitB =
27875 B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
27876 Value *LowerSplitAcc = nullptr;
27877 Value *UpperSplitAcc = nullptr;
27878 if (Accumulator) {
27879 LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
27880 UpperSplitAcc =
27881 B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
27882 }
27883 auto *LowerSplitInt = createComplexDeinterleavingIR(
27884 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
27885 auto *UpperSplitInt = createComplexDeinterleavingIR(
27886 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
27887
27888 auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
27889 B.getInt64(0));
27890 return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
27891 }
27892
27893 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
27894 if (Accumulator == nullptr)
27895 Accumulator = Constant::getNullValue(Ty);
27896
27897 if (IsScalable) {
27898 if (IsInt)
27899 return B.CreateIntrinsic(
27900 Intrinsic::aarch64_sve_cmla_x, Ty,
27901 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27902
27903 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
27904 return B.CreateIntrinsic(
27905 Intrinsic::aarch64_sve_fcmla, Ty,
27906 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27907 }
27908
27909 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
27910 Intrinsic::aarch64_neon_vcmla_rot90,
27911 Intrinsic::aarch64_neon_vcmla_rot180,
27912 Intrinsic::aarch64_neon_vcmla_rot270};
27913
27914
27915 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
27916 {Accumulator, InputA, InputB});
27917 }
27918
27919 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
27920 if (IsScalable) {
27921 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
27922 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
27923 if (IsInt)
27924 return B.CreateIntrinsic(
27925 Intrinsic::aarch64_sve_cadd_x, Ty,
27926 {InputA, InputB, B.getInt32((int)Rotation * 90)});
27927
27928 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
27929 return B.CreateIntrinsic(
27930 Intrinsic::aarch64_sve_fcadd, Ty,
27931 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
27932 }
27933 return nullptr;
27934 }
27935
27936 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
27937 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
27938 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
27939 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
27940 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
27941
27942 if (IntId == Intrinsic::not_intrinsic)
27943 return nullptr;
27944
27945 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
27946 }
27947
27948 return nullptr;
27949}
27950
27951bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
27952 unsigned Opc = N->getOpcode();
27953 if (ISD::isExtOpcode(Opc)) {
27954 if (any_of(N->uses(),
27955 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
27956 return false;
27957 }
27958 return true;
27959}
27960
27961unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
27962 return Subtarget->getMinimumJumpTableEntries();
27963}
27964
27965 MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
27966 CallingConv::ID CC,
27967 EVT VT) const {
27968 bool NonUnitFixedLengthVector =
27969 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
27970 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27971 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
27972
27973 EVT VT1;
27974 MVT RegisterVT;
27975 unsigned NumIntermediates;
27976 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
27977 RegisterVT);
27978 return RegisterVT;
27979}
27980
27981 unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
27982 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
27983 bool NonUnitFixedLengthVector =
27984 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
27985 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27986 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
27987
27988 EVT VT1;
27989 MVT VT2;
27990 unsigned NumIntermediates;
27991 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
27992 NumIntermediates, VT2);
27993}
27994
27995 unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
27996 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
27997 unsigned &NumIntermediates, MVT &RegisterVT) const {
27998 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
27999 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
28000 if (!RegisterVT.isFixedLengthVector() ||
28001 RegisterVT.getFixedSizeInBits() <= 128)
28002 return NumRegs;
28003
28004 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
28005 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
28006 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
28007
28008 // A size mismatch here implies either type promotion or widening and would
28009 // have resulted in scalarisation if larger vectors had not been available.
28010 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
28011 EVT EltTy = VT.getVectorElementType();
28012 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
28013 if (!isTypeLegal(NewVT))
28014 NewVT = EltTy;
28015
28016 IntermediateVT = NewVT;
28017 NumIntermediates = VT.getVectorNumElements();
28018 RegisterVT = getRegisterType(Context, NewVT);
28019 return NumIntermediates;
28020 }
28021
28022 // SVE VLS support does not introduce a new ABI so we should use NEON sized
28023 // types for vector arguments and returns.
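// e.g. a fixed-length 256-bit v8i32 argument is broken into two v4i32 values
// and passed in two 128-bit NEON registers.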
28024
28025 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
28026 NumIntermediates *= NumSubRegs;
28027 NumRegs *= NumSubRegs;
28028
28029 switch (RegisterVT.getVectorElementType().SimpleTy) {
28030 default:
28031 llvm_unreachable("unexpected element type for vector");
28032 case MVT::i8:
28033 IntermediateVT = RegisterVT = MVT::v16i8;
28034 break;
28035 case MVT::i16:
28036 IntermediateVT = RegisterVT = MVT::v8i16;
28037 break;
28038 case MVT::i32:
28039 IntermediateVT = RegisterVT = MVT::v4i32;
28040 break;
28041 case MVT::i64:
28042 IntermediateVT = RegisterVT = MVT::v2i64;
28043 break;
28044 case MVT::f16:
28045 IntermediateVT = RegisterVT = MVT::v8f16;
28046 break;
28047 case MVT::f32:
28048 IntermediateVT = RegisterVT = MVT::v4f32;
28049 break;
28050 case MVT::f64:
28051 IntermediateVT = RegisterVT = MVT::v2f64;
28052 break;
28053 case MVT::bf16:
28054 IntermediateVT = RegisterVT = MVT::v8bf16;
28055 break;
28056 }
28057
28058 return NumRegs;
28059}
28060
28061 bool AArch64TargetLowering::hasInlineStackProbe(
28062 const MachineFunction &MF) const {
28063 return !Subtarget->isTargetWindows() &&
28064 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
28065}
28066
28067#ifndef NDEBUG
28068 void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
28069 switch (N->getOpcode()) {
28070 default:
28071 break;
28072 case AArch64ISD::SUNPKLO:
28073 case AArch64ISD::SUNPKHI:
28074 case AArch64ISD::UUNPKLO:
28075 case AArch64ISD::UUNPKHI: {
28076 assert(N->getNumValues() == 1 && "Expected one result!");
28077 assert(N->getNumOperands() == 1 && "Expected one operand!");
28078 EVT VT = N->getValueType(0);
28079 EVT OpVT = N->getOperand(0).getValueType();
28080 assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
28081 VT.isInteger() && "Expected integer vectors!");
28082 assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
28083 "Expected vectors of equal size!");
28084 // TODO: Enable assert once bogus creations have been fixed.
28085 // assert(OpVT.getVectorElementCount() == VT.getVectorElementCount()*2 &&
28086 // "Expected result vector with half the lanes of its input!");
28087 break;
28088 }
28089 case AArch64ISD::TRN1:
28090 case AArch64ISD::TRN2:
28091 case AArch64ISD::UZP1:
28092 case AArch64ISD::UZP2:
28093 case AArch64ISD::ZIP1:
28094 case AArch64ISD::ZIP2: {
28095 assert(N->getNumValues() == 1 && "Expected one result!");
28096 assert(N->getNumOperands() == 2 && "Expected two operands!");
28097 EVT VT = N->getValueType(0);
28098 EVT Op0VT = N->getOperand(0).getValueType();
28099 EVT Op1VT = N->getOperand(1).getValueType();
28100 assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
28101 "Expected vectors!");
28102 // TODO: Enable assert once bogus creations have been fixed.
28103 // assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
28104 break;
28105 }
28106 }
28107}
28108#endif
unsigned const MachineRegisterInfo * MRI
static MCRegister MatchRegisterName(StringRef Name)
static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, uint64_t &Imm)
static bool isIntImmediate(const SDNode *N, uint64_t &Imm)
isIntImmediate - This method tests to see if the node is a constant operand.
static void CustomNonLegalBITCASTResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, EVT ExtendVT, EVT CastVT)
static bool isConcatMask(ArrayRef< int > Mask, EVT VT, bool SplitLHS)
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG)
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG)
can be transformed to: not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) (and (not (setCA (cmp A))...
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert)
changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC usable with the vector...
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isSingletonEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG)
static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex)
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG)
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.
static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG)
static SDValue performLastTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue GenerateTBL(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performDUPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< PredicateConstraint > parsePredicateConstraint(StringRef Constraint)
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static void analyzeCallOperands(const AArch64TargetLowering &TLI, const AArch64Subtarget *Subtarget, const TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo)
static std::optional< unsigned > IsSVECntIntrinsic(SDValue S)
static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG)
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo)
Check whether or not Op is a SET_CC operation, either a generic or an AArch64 lowered one.
static bool isLegalArithImmed(uint64_t C)
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT)
static ScalableVectorType * getSVEContainerIRType(FixedVectorType *VTy)
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG)
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend)
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG)
static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG)
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static bool isZeroingInactiveLanes(SDValue Op)
static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG)
static SDValue tryCombineMULLWithUZP1(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG)
static bool isZerosVector(const SDNode *N)
isZerosVector - Check whether SDNode N is a zero-filled vector.
static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth=0)
static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG)
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG)
Get rid of unnecessary NVCASTs (that don't change the type).
static const TargetRegisterClass * getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT)
static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, bool Invert)
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, SDLoc DL, unsigned BitWidth)
static bool isPredicateCCSettingOp(SDValue N)
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG)
static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType)
static SDValue performSVEAndCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG)
static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, ArrayRef< int > ShuffleMask, EVT VT, EVT ContainerVT, SelectionDAG &DAG)
static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static MVT getSVEContainerType(EVT ContentTy)
static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG)
static bool isMergePassthruOpcode(unsigned Opc)
static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG, SDLoc DL, bool &IsMLA)
static SDValue performFADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Target-specific DAG combine function for NEON load/store intrinsics to merge base address updates.
static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG)
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget, const AArch64TargetLowering &TLI)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with the compare-mask instruct...
static bool isCheapToExtend(const SDValue &N)
static cl::opt< bool > EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true))
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG)
static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, unsigned ScalarSizeInBytes)
Check if the value of OffsetInBytes can be used as an immediate for the gather load/prefetch and scat...
static bool isUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of "vector_shuffle v,...
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
#define LCALLNAME4(A, B)
static unsigned getDUPLANEOp(EVT EltType)
static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM)
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT, SelectionDAG &DAG, unsigned &ShiftValue, SDValue &RShOperand)
static bool isExtendOrShiftOperand(SDValue N)
static bool isLanes1toNKnownZero(SDValue Op)
static bool setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI)
Set the IntrinsicInfo for the aarch64_sve_st<N> intrinsics.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG)
static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, SelectionDAG &DAG)
static EVT getPackedSVEVectorVT(EVT VT)
static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performFlagSettingCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned GenericOpcode)
static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performCSELCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static void ReplaceReductionResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp)
static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, int AddConstant, int CompConstant)
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static bool isCMP(SDValue Op)
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static Function * getStructuredLoadFunction(Module *M, unsigned Factor, bool Scalable, Type *LDVTy, Type *PtrTy)
static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG)
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, SelectionDAG &DAG, bool UnpredOp=false, bool SwapOperands=false)
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad)
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate)
Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain of CCMP/CFCMP ops.
static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated)
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian)
static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, unsigned Opcode, SelectionDAG &DAG)
static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy, bool IsLittleEndian)
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG)
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static bool isINSMask(ArrayRef< int > M, int NumInputElements, bool &DstIsLeft, int &Anomaly)
static const MCPhysReg GPRArgRegs[]
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits)
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG)
static SDValue performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPassedInFPR(EVT VT)
static unsigned getIntrinsicID(const SDNode *N)
static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert)
static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG)
static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG)
static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool canEmitConjunction(const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, unsigned Depth=0)
Returns true if Val is a tree of AND/OR/SETCC operations that can be expressed as a conjunction.
static bool isWideDUPMask(ArrayRef< int > M, EVT VT, unsigned BlockSize, unsigned &DupLaneOp)
Check if a vector shuffle corresponds to a DUP instructions with a larger element width than the vect...
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static cl::opt< bool > EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden, cl::desc("Combine ext and trunc to TBL"), cl::init(true))
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts)
static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue removeRedundantInsertVectorElt(SDNode *N)
static std::optional< AArch64CC::CondCode > getCSETCondCode(SDValue Op)
static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsEqual)
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG)
Legalize the gather prefetch (scalar + vector addressing mode) when the offset vector is an unpacked ...
static bool isNegatedInteger(SDValue Op)
static SDValue performFirstTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static bool isLoadOrMultipleLoads(SDValue B, SmallVector< LoadSDNode * > &Loads)
static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc)
static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16)
static Function * getStructuredStoreFunction(Module *M, unsigned Factor, bool Scalable, Type *STVTy, Type *PtrTy)
static SDValue performVectorShiftCombine(SDNode *N, const AArch64TargetLowering &TLI, TargetLowering::DAGCombinerInfo &DCI)
Optimize a vector shift instruction and its operand if shifted out bits are not used.
static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG)
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, unsigned ScalarSizeInBytes)
Combines a node carrying the intrinsic aarch64_sve_prf<T>_gather_scalar_offset into a node that uses ...
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of a scalar to a vector store by scalar stores of the scalar value.
unsigned getSignExtendedGatherOpcode(unsigned Opcode)
static bool isOrXorChain(SDValue N, unsigned &Num, SmallVector< std::pair< SDValue, SDValue >, 16 > &WorkList)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC)
changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 CC
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd)
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG)
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc)
static unsigned getCmpOperandFoldingProfit(SDValue Op)
Returns how profitable it is to fold a comparison's operand's shift and/or extension operations.
static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG)
#define MAKE_CASE(V)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG)
Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern into sext/zext(buildvecto...
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG)
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG)
static const MCPhysReg FPRArgRegs[]
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static void replaceBoolVectorBitcast(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG)
static SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern)
static bool isEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseEXT, unsigned &Imm)
static std::optional< ReducedGprConstraint > parseReducedGprConstraint(StringRef Constraint)
static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue performSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG)
Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) making use of the vector SExt/ZE...
static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point multiply by power of two into floating-point to fixed-point conversion.
static EVT calculatePreExtendType(SDValue Extend)
Calculates what the pre-extend type is, based on the extension operation node provided by Extend.
static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG)
static EVT getPromotedVTForPredicate(EVT VT)
static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
Convert a DAG fp condition code to an AArch64 CC.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG)
static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal)
static SDValue performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG)
static Value * UseTlsOffset(IRBuilderBase &IRB, unsigned Offset)
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG)
WidenVector - Given a value in the V64 register class, produce the equivalent value in the V128 regis...
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, AArch64CC::CondCode Cond)
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info)
cl::opt< bool > EnableAArch64ELFLocalDynamicTLSGeneration("aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false))
static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG)
static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG)
static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG)
static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG)
static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG)
static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode, AtomicOrdering Ordering)
static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp)
Target-specific DAG combine function for post-increment LD1 (lane) and post-increment LD1R.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
std::pair< SDValue, uint64_t > lookThroughSignExtension(SDValue Val)
bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL)
static SDValue performMSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG)
static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, SDLoc DL, SelectionDAG &DAG)
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG)
static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG)
static bool isSplatShuffle(Value *V)
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT)
static SDValue performAddCombineForShiftedOperands(SDNode *N, SelectionDAG &DAG)
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V)
static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, bool IsSigned)
static bool isPackedVectorType(EVT VT, SelectionDAG &DAG)
Returns true if VT's elements occupy the lowest bit positions of its associated register class withou...
static bool isTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of "vector_shuffle v,...
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static SDValue tryCombineWhileLo(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMaskedGatherScatterCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG)
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, bool IsSignaling)
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performBuildVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64TargetLowering &TLI)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG)
static unsigned getSMCondition(const SMEAttrs &CallerAttrs, const SMEAttrs &CalleeAttrs)
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static bool isCMN(SDValue Op, ISD::CondCode CC)
static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps)
static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG)
static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG)
Perform the scalar expression combine in the form of: CSEL(c, 1, cc) + b => CSINC(b+c,...
static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static std::optional< uint64_t > getConstantLaneNumOfExtractHalfOperand(SDValue &Op)
static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, SelectionDAG &DAG, unsigned &NumSubLoads)
static SDValue performLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, StoreSDNode *Store)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static cl::opt< unsigned > MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, cl::desc("Maximum of xors"))
#define LCALLNAME5(A, B)
static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performMULLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static const MVT MVT_CC
Value type used for condition codes.
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performReinterpretCastCombine(SDNode *N)
SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N)
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &dl)
static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Simplify Addr given that the top byte of it is ignored by HW during address translation.
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG)
static bool isAllInactivePredicate(SDValue N)
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDLoc DL, SelectionDAG &DAG)
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isWideTypeMask(ArrayRef< int > M, EVT VT, SmallVectorImpl< int > &NewMask)
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z, SelectionDAG &DAG)
static SDValue performANDSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static const TargetRegisterClass * getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, SDNode *AndNode, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex, unsigned CC)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
@ Generic
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static bool isConstant(const MachineInstr &MI)
static const LLT S1
amdgpu AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
@ Scaled
@ OP_VEXT3
@ OP_VTRNR
@ OP_VDUP1
@ OP_VZIPR
@ OP_VUZPR
@ OP_VREV
@ OP_VZIPL
@ OP_VTRNL
@ OP_COPY
@ OP_VEXT1
@ OP_VDUP0
@ OP_VEXT2
@ OP_VUZPL
@ OP_VDUP3
@ OP_VDUP2
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
Symbol * Sym
Definition: ELF_riscv.cpp:479
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static Function * getFunction(Constant *C)
Definition: Evaluator.cpp:236
static bool isSigned(unsigned int Opcode)
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
#define im(i)
Hexagon Common GEP
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
lazy value info
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
LLVMContext & Context
This file defines ARC utility functions which are used by various parts of the compiler.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
const char LLVMTargetMachineRef TM
static bool getVal(MDTuple *MD, const char *Key, uint64_t &Val)
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallSet class.
This file defines the SmallVector class.
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static const int BlockSize
Definition: TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
This defines the Use class.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static constexpr int Concat[]
Value * RHS
Value * LHS
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
void setVarArgsStackOffset(unsigned Offset)
void setTailCallReservedStack(unsigned bytes)
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setHasSwiftAsyncContext(bool HasContext)
void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym)
void setArgumentStackToRestore(unsigned bytes)
void setHasStreamingModeChanges(bool HasChanges)
bool isLegalAddressingMode(unsigned NumBytes, int64_t Offset, unsigned Scale) const
void UpdateCustomCalleeSavedRegs(MachineFunction &MF) const
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
const AArch64RegisterInfo * getRegisterInfo() const override
unsigned getMinimumJumpTableEntries() const
const AArch64InstrInfo * getInstrInfo() const override
const char * getSecurityCheckCookieName() const
unsigned getMaximumJumpTableSize() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned classifyGlobalFunctionReference(const GlobalValue *GV, const TargetMachine &TM) const
Align getPrefLoopAlignment() const
Align getPrefFunctionAlignment() const
unsigned getMaxBytesForLoopAlignment() const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
const Triple & getTargetTriple() const
bool isCallingConvWin64(CallingConv::ID CC) const
const char * getChkStkName() const
bool useSVEForFixedLengthVectors() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
bool isXRegisterReserved(size_t i) const
unsigned getMaxSVEVectorSizeInBits() const
unsigned getMinSVEVectorSizeInBits() const
bool hasCustomCallingConv() const
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const override
Return the prefered common base offset.
bool shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert a trailing fence without reducing the ordering f...
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
MachineBasicBlock * EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool shouldRemoveRedundantExtend(SDValue Op) const override
Return true (the default) if it is profitable to remove a sext_inreg(x) where the sext is redundant,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
bool optimizeExtendOrTruncateConversion(Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override
Try to optimize extending or truncating conversion instructions (like zext, trunc,...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool hasInlineStackProbe(const MachineFunction &MF) const override
True if stack clash protection is enabled for this function.
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is a legal icmp immediate, that is, the target has icmp instructi...
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: 'sub y, (xor x, -1)' and 'add (add x, 1), y'. The variant with two add's is IR...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
MachineBasicBlock * EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB, bool HasTile) const
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool isOpSuitableForLSE128(const Instruction *I) const
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower an interleaved load into a ldN intrinsic.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isLegalAddScalableImmediate(int64_t) const override
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
MachineBasicBlock * EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool isOpSuitableForRCPC3(const Instruction *I) const
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower an interleaved store into a stN intrinsic.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
MachineBasicBlock * EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const
MachineBasicBlock * EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
Control the following reassociation of operands: (op (op x, c1), y) -> (op (op x, y),...
void verifyTargetSDNode(const SDNode *N) const override
Check the given SDNode. Aborts if it is invalid.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI, LoadInst *LI) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Value * getIRStackGuard(IRBuilderBase &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached to a strict FP call.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II, StoreInst *SI) const override
Lower an interleave intrinsic to a target specific store intrinsic.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, SDValue PStateSM=SDValue()) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
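The overrides above are consulted from target-independent SelectionDAG code through the TargetLowering interface. As an illustration only (this helper is not part of AArch64ISelLowering.cpp and its name is invented), a minimal C++ sketch of querying one such hook:
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;
// Hypothetical helper: ask the target whether the constant RHS of an add
// can be encoded directly (see isLegalAddImmediate above).
static bool addImmIsLegal(SDNode *N, SelectionDAG &DAG) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
    return TLI.isLegalAddImmediate(C->getSExtValue());
  return false;
}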
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:427
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:207
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1498
static void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition: APInt.cpp:1860
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:349
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1446
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:187
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:307
APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1898
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1144
APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1905
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1596
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:197
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1710
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
bool isMask(unsigned numBits) const
Definition: APInt.h:466
bool isNonNegative() const
Determine if this APInt value is non-negative (>= 0).
Definition: APInt.h:312
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1235
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:319
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1215
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:367
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1520
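The APInt entries above are plain value-type queries. A minimal sketch of how a few of them compose (the function and the constants are illustrative, not taken from this file):
#include "llvm/ADT/APInt.h"
using namespace llvm;
// Illustrative only: a few of the APInt queries listed above.
static void apintSketch() {
  APInt A(32, 0x80);                      // 32-bit value 128
  bool Pow2 = A.isPowerOf2();             // true: exactly one bit set
  unsigned TZ = A.countr_zero();          // 7 trailing zero bits
  APInt Wide = A.zext(64);                // zero-extend to 64 bits
  APInt LowMask = APInt::getLowBitsSet(32, 8);               // 0x000000FF
  bool InHigh = A.isSubsetOf(APInt::getHighBitsSet(32, 25)); // true: bit 7 lies in the top 25 bits
  (void)Pow2; (void)TZ; (void)Wide; (void)LowMask; (void)InHigh;
}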
an instruction to allocate memory on the stack
Definition: Instructions.h:59
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:539
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ And
*p = old & v
Definition: Instructions.h:768
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
bool isFloatingPointOperation() const
Definition: Instructions.h:922
BinOp getOperation() const
Definition: Instructions.h:845
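A small hedged sketch of inspecting the atomicrmw operations listed above (the helper name is invented):
#include "llvm/IR/Instructions.h"
using namespace llvm;
// Illustrative only: classify an atomicrmw by its BinOp.
static bool isSignedMinMaxRMW(const AtomicRMWInst &RMW) {
  AtomicRMWInst::BinOp Op = RMW.getOperation();
  return Op == AtomicRMWInst::Min || Op == AtomicRMWInst::Max;
}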
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:93
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
A "pseudo-class" with methods for operating on BUILD_VECTORs.
ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
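A sketch of the BuildVectorSDNode API above, assuming Op is an SDValue supplied by surrounding lowering code (the helper itself is illustrative):
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;
// Illustrative only: report the splatted constant of a BUILD_VECTOR, if any.
static bool getConstantSplat(SDValue Op, APInt &SplatValue) {
  auto *BV = dyn_cast<BuildVectorSDNode>(Op.getNode());
  if (!BV)
    return false;
  APInt SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  return BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs);
}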
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
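CCState and CCValAssign are typically used together when analyzing calls and returns. A minimal sketch under the assumption that Outs and RetCC are provided by the caller (the helper is not from this file):
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Function.h"
using namespace llvm;
// Illustrative only: the usual CheckReturn pattern.
static bool canLowerReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
                           CCAssignFn *RetCC, MachineFunction &MF,
                           LLVMContext &Ctx) {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(MF.getFunction().getCallingConv(), /*IsVarArg=*/false, MF,
                 RVLocs, Ctx);
  return CCInfo.CheckReturn(Outs, RetCC);
}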
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
unsigned arg_size() const
Definition: InstrTypes.h:1685
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1871
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:205
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:145
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:238
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
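A brief sketch of the DataLayout queries above (M and Ty are assumed inputs; the helper is illustrative):
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
using namespace llvm;
// Illustrative only: typical DataLayout queries.
static void dataLayoutSketch(const Module &M, Type *Ty) {
  const DataLayout &DL = M.getDataLayout();
  bool LittleEndian = DL.isLittleEndian();   // layout endianness
  TypeSize Bytes = DL.getTypeAllocSize(Ty);  // size including alignment padding
  Align PrefAlign = DL.getPrefTypeAlign(Ty); // preferred alignment
  (void)LittleEndian; (void)Bytes; (void)PrefAlign;
}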
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:311
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:308
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:319
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:168
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:685
bool empty() const
Definition: Function.h:809
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:202
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:682
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:264
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1919
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:340
arg_iterator arg_end()
Definition: Function.h:827
arg_iterator arg_begin()
Definition: Function.h:818
size_t size() const
Definition: Function.h:808
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:356
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:675
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
Definition: GlobalValue.h:262
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:528
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:655
Type * getValueType() const
Definition: GlobalValue.h:295
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2137
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1037
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2472
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1881
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2523
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1045
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:539
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2170
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains the given value broadcast to NumElts elements.
Definition: IRBuilder.cpp:1193
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2516
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:466
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2067
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2122
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1437
ConstantInt * getInt8(uint8_t C)
Get a constant 8-bit value.
Definition: IRBuilder.h:476
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2081
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:491
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2127
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1416
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2021
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2494
LLVMContext & getContext() const
Definition: IRBuilder.h:176
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2117
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2007
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1497
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:569
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2412
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:516
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
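A minimal sketch of the IRBuilder helpers listed above, assuming ScalarI8 is an i8 value and Builder already has an insertion point (the function is illustrative):
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
// Illustrative only: splat an i8 to <4 x i8> and shift every lane right by one.
static Value *splatAndHalve(IRBuilderBase &Builder, Value *ScalarI8) {
  Value *Splat = Builder.CreateVectorSplat(4, ScalarI8, "splat");
  Value *Ones  = Builder.CreateVectorSplat(4, Builder.getInt8(1));
  return Builder.CreateLShr(Splat, Ones, "halved");
}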
This instruction inserts a single (scalar) element into a VectorType value.
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:83
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:87
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e. the number of whole bytes needed to represent the size in bits.
Definition: LowLevelType.h:203
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:184
Value * getPointerOperand()
Definition: Instructions.h:280
Type * getPointerOperandType() const
Definition: Instructions.h:283
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
@ INVALID_SIMPLE_VALUE_TYPE
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:585
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
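A short sketch of the MVT predicates above (the helper and the notion of a "NEON-sized" vector are illustrative):
#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;
// Illustrative only: NEON registers hold 64- or 128-bit fixed-length vectors.
static bool isNEONSizedVector(MVT VT) {
  return VT.isFixedLengthVector() &&
         (VT.is64BitVector() || VT.is128BitVector());
}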
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
int getStackProtectorIndex() const
Return the index for the stack protector object.
int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
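A minimal sketch of MachineFrameInfo use (the 16-byte size and alignment are arbitrary; the helper is illustrative):
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;
// Illustrative only: reserve a statically sized spill slot.
static int createSpillSlot(MachineFunction &MF) {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.CreateStackObject(/*Size=*/16, Align(16), /*isSpillSlot=*/true);
}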
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
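The MachineInstrBuilder methods above are normally chained off BuildMI. A sketch under the assumption that Opcode, DestReg, SrcReg and Imm are provided by surrounding custom-inserter code (the helper is illustrative):
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;
// Illustrative only: emit "DestReg = Opcode SrcReg, Imm" before I.
static void emitRegImmOp(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         const DebugLoc &DL, const TargetInstrInfo &TII,
                         unsigned Opcode, Register DestReg, Register SrcReg,
                         int64_t Imm) {
  BuildMI(MBB, I, DL, TII.get(Opcode), DestReg).addReg(SrcReg).addImm(Imm);
}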
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
MachineOperand class - Representation of each machine instruction operand.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition: MapVector.h:60
This class is used to represent an MGATHER node.
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getIndex() const
const SDValue & getScale() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
const SDValue & getInc() const
const SDValue & getScale() const
const SDValue & getMask() const
const SDValue & getIntID() const
const SDValue & getIndex() const
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition: Module.cpp:690
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
void dump() const
Definition: Pass.cpp:136
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1827
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
void dump() const
Dump this node, for debugging.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Return true if this node is an UNDEF node.
void setFlags(SDNodeFlags NewFlags)
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasZAState() const
Class to represent scalable SIMD vectors.
Definition: DerivedTypes.h:586
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:713
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:722
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:474
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
SDNode * isConstantIntBuildVectorOrConstantInt(SDValue N) const
Test whether the given value is a constant int or similar node.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:732
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:480
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:659
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSplatVector(EVT VT, const SDLoc &DL, SDValue Op)
Definition: SelectionDAG.h:862
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:773
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:845
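A hedged sketch (not from this file) of splatting an immediate with getSplatBuildVector as declared above; the v4i32 type and the helper name are illustrative choices.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Build a v4i32 BUILD_VECTOR whose four lanes all hold Imm.
static SDValue splatImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Imm) {
  SDValue Scalar = DAG.getConstant(Imm, DL, MVT::i32);
  return DAG.getSplatBuildVector(MVT::v4i32, DL, Scalar);
}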
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
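A small sketch (assumptions only, not from this file) of the ext-or-trunc helpers above used to normalise an integer operand to i64; getSExtOrTrunc and getAnyExtOrTrunc are used the same way when sign or any-extension is wanted.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Zero-extend or truncate Op so the result has type i64.
static SDValue normalizeToI64(SelectionDAG &DAG, const SDLoc &DL, SDValue Op) {
  return DAG.getZExtOrTrunc(Op, DL, MVT::i64);
}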
SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
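A sketch (the wrapper functions are hypothetical) of the known-bits queries above: MaskedValueIsZero answers a single mask query, while computeKnownBits returns the full zero/one analysis.
#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
// Return true when the low NumBits of Op are provably zero.
static bool lowBitsKnownZero(SelectionDAG &DAG, SDValue Op, unsigned NumBits) {
  unsigned BitWidth = Op.getScalarValueSizeInBits();
  APInt Mask = APInt::getLowBitsSet(BitWidth, NumBits);
  return DAG.MaskedValueIsZero(Op, Mask);
}
// The same fact read off the full analysis returned by computeKnownBits.
static bool lowBitsKnownZeroViaKnownBits(SelectionDAG &DAG, SDValue Op,
                                         unsigned NumBits) {
  return DAG.computeKnownBits(Op).countMinTrailingZeros() >= NumBits;
}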
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:739
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
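A sketch (with an assumed v4i32 type) of building an ISD::VECTOR_SHUFFLE with getVectorShuffle; the second input is UNDEF because only one source is needed for a lane reversal.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Reverse the four lanes of a v4i32 value with a single-source shuffle.
static SDValue reverseV4i32(SelectionDAG &DAG, const SDLoc &DL, SDValue V) {
  int Mask[] = {3, 2, 1, 0};
  return DAG.getVectorShuffle(MVT::v4i32, DL, V, DAG.getUNDEF(MVT::v4i32),
                              Mask);
}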
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void push_back(const T &Elt)
Definition: SmallVector.h:426
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
An instruction for storing to memory.
Definition: Instructions.h:317
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:462
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition: StringRef.h:676
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
StringRef save(const char *S)
Definition: StringSaver.h:30
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
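A brief sketch of the StringSwitch idiom above; the names and numeric ids are placeholders, not values used by this backend.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;
// Map a lower-cased name to a small id, falling back to 0 for unknown input.
static unsigned classifyName(StringRef Name) {
  return StringSwitch<unsigned>(Name)
      .Case("sp", 1)
      .Case("fp", 2)
      .Default(0);
}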
Class to represent struct types.
Definition: DerivedTypes.h:216
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
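An illustrative fragment (assumed to sit inside a TargetLowering subclass constructor, and not reflecting this backend's actual choices) of the setOperationAction pattern described above.
// Hypothetical legalization choices, for illustration only.
setOperationAction(ISD::CTPOP, MVT::i32, Custom);   // lower in target code
setOperationAction(ISD::SDIVREM, MVT::i64, Expand); // rewrite into other nodes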
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
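A companion fragment (again assumed to be inside a TargetLowering subclass constructor, with hypothetical choices) pairing setLoadExtAction with setTruncStoreAction as declared above.
// Hypothetical example: expand zero-extending i8->i32 loads and truncating
// i32->i8 stores rather than treating them as legal.
setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MVT::i8, Expand);
setTruncStoreAction(MVT::i32, MVT::i8, Expand);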
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
virtual bool useLoadStackGuardNode() const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:667
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition: Triple.h:634
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static Type * getHalfTy(LLVMContext &C)
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:252
static Type * getBFloatTy(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
@ FloatTyID
32-bit floating point type
Definition: Type.h:58
@ DoubleTyID
64-bit floating point type
Definition: Type.h:59
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
static IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isScalableTy() const
Return true if this is a type whose size is a known multiple of vscale.
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition: Type.h:377
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:182
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:507
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:454
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
Definition: DerivedTypes.h:472
Type * getElementType() const
Definition: DerivedTypes.h:436
This class represents zero extension of integer types.
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:199
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:251
self_iterator getIterator()
Definition: ilist_node.h:109
#define UINT64_MAX
Definition: DataTypes.h:77
#define INT64_MAX
Definition: DataTypes.h:71
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ NVCAST
Natural vector cast.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
const uint64_t ReservedFPControlBits
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
Definition: CallingConv.h:265
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
Definition: CallingConv.h:221
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
Definition: CallingConv.h:224
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition: CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:241
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:238
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
Definition: CallingConv.h:159
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
Definition: CallingConv.h:255
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
Definition: CallingConv.h:260
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:751
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:237
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1133
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1129
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:477
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
Definition: ISDOpcodes.h:1346
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1377
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:251
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:560
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:715
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1162
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1248
@ STRICT_FCEIL
Definition: ISDOpcodes.h:427
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:240
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1038
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1028
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:784
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:484
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:437
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:791
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:544
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1362
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:391
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
Definition: ISDOpcodes.h:1366
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:689
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
Definition: ISDOpcodes.h:1032
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:256
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1376
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:478
@ STRICT_FLOG2
Definition: ISDOpcodes.h:422
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1274
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:904
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:230
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1275
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:940
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:412
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1412
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:886
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:775
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:663
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:451
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:621
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:101
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1359
@ WRITE_REGISTER
Definition: ISDOpcodes.h:119
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1228
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1363
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:995
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:759
@ STRICT_LROUND
Definition: ISDOpcodes.h:432
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:931
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1084
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:328
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1059
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1063
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the same...
Definition: ISDOpcodes.h:587
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition: ISDOpcodes.h:647
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:350
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:728
@ STRICT_FPOWI
Definition: ISDOpcodes.h:414
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1244
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:212
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1378
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:628
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1158
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:324
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:431
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1371
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:881
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:652
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1023
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:706
@ ATOMIC_LOAD_CLR
Definition: ISDOpcodes.h:1273
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:601
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1272
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:574
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:436
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:425
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:118
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:536
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:781
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1218
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:857
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:426
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:743
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
Definition: ISDOpcodes.h:1336
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1255
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:972
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1222
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:332
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1048
@ STRICT_LRINT
Definition: ISDOpcodes.h:434
@ ConstantPool
Definition: ISDOpcodes.h:82
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:799
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:675
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition: ISDOpcodes.h:592
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:889
@ STRICT_FROUND
Definition: ISDOpcodes.h:429
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:737
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:304
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:450
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1379
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:428
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:430
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:94
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1270
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:444
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:466
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:443
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:991
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1271
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:837
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1189
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:158
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:471
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:681
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1215
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:658
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:401
@ STRICT_FLOG10
Definition: ISDOpcodes.h:421
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:525
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_LLRINT
Definition: ISDOpcodes.h:435
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition: ISDOpcodes.h:613
@ STRICT_FEXP2
Definition: ISDOpcodes.h:419
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1269
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:870
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:106
@ STRICT_LLROUND
Definition: ISDOpcodes.h:433
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:832
@ EXPERIMENTAL_VECTOR_HISTOGRAM
Definition: ISDOpcodes.h:1408
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:424
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:856
@ VECREDUCE_FMINIMUM
Definition: ISDOpcodes.h:1367
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:787
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1153
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1077
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:764
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:494
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:341
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:423
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the sa...
Definition: ISDOpcodes.h:581
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:314
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:516
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1606
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
Definition: ISDOpcodes.h:1497
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1484
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1535
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1515
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1486
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1469
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
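A sketch of the PatternMatch entry points listed here, recognising a widening multiply at the IR level; the helper name and the exact pattern are illustrative assumptions.
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;
// Match "mul (zext X), (zext Y)" and capture the narrow operands.
static bool isWideningMul(Value *V, Value *&X, Value *&Y) {
  return match(V, m_Mul(m_ZExt(m_Value(X)), m_ZExt(m_Value(Y))));
}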
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:816
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:875
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
VScaleVal_match m_VScale()
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
@ Define
Register definition.
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition: ObjCARCUtil.h:43
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition: ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool isPackedVectorType(EVT SomeVT)
Definition: VECustomDAG.cpp:22
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Offset
Definition: DWP.cpp:456
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition: STLExtras.h:853
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:295
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:239
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:343
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either a integer or FP constant.
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResult)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> or <1,...
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:280
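A small sketch using the MathExtras helpers above; picking a shift amount for a power-of-two immediate is a typical use, though the wrapper itself is hypothetical.
#include "llvm/Support/MathExtras.h"
#include <cstdint>
#include <optional>
// Return log2(Imm) when Imm is a power of two, std::nullopt otherwise.
static std::optional<unsigned> shiftForPow2(uint64_t Imm) {
  if (!llvm::isPowerOf2_64(Imm))
    return std::nullopt;
  return llvm::Log2_64(Imm);
}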
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1521
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:330
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:269
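As a quick reference for the MathExtras/bit helpers indexed above, here are several of them evaluated on concrete constants (a standalone sketch, not code from this file).

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

static void bitMathSketch() {
  assert(llvm::isPowerOf2_64(64));          // 64 == 1 << 6
  assert(llvm::Log2_64(64) == 6);           // floor log2
  assert(llvm::Log2_64_Ceil(65) == 7);      // ceil log2
  assert(llvm::bit_width(5u) == 3);         // 0b101 needs three bits
  assert(llvm::countr_zero(0x8u) == 3);     // three trailing zeros
  assert(llvm::isShiftedMask_64(0x0F0));    // one run of ones, remainder zero
  assert(llvm::isUIntN(8, 255) && !llvm::isUIntN(8, 256));
}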
unsigned M1(unsigned Val)
Definition: VE.h:376
bool isReleaseOrStronger(AtomicOrdering AO)
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:257
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
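The shuffle-mask classifiers indexed above (isZIPMask, isUZPMask, isTRNMask, isREVMask) come from the AArch64 backend's shuffle utilities. The sketch below exercises two of them on the 8-element masks quoted in their briefs; it is illustrative only, and the backend-internal header that declares them is not spelled out here.

#include "llvm/ADT/ArrayRef.h"

static void shuffleMaskSketch() {
  unsigned WhichResult = 0;
  // zip1 of two 8-element vectors interleaves the low halves: <0,8,1,9,2,10,3,11>.
  int Zip1[] = {0, 8, 1, 9, 2, 10, 3, 11};
  bool IsZip = llvm::isZIPMask(Zip1, /*NumElts=*/8, WhichResult);
  // trn1 picks the even lanes of both inputs: <0,8,2,10,4,12,6,14>.
  int Trn1[] = {0, 8, 2, 10, 4, 12, 6, 14};
  bool IsTrn = llvm::isTRNMask(Trn1, /*NumElts=*/8, WhichResult);
  (void)IsZip;
  (void)IsTrn; // both classifiers accept these masks
}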
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:244
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
gep_type_iterator gep_type_begin(const User *GEP)
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2051
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
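Based only on the behaviour stated in the two briefs above, the round trip between element counts and SVE predicate patterns looks roughly like this. This is a hedged sketch; the AArch64-internal header that declares these helpers is omitted.

#include <cassert>
#include <optional>

static void svePredPatternSketch() {
  // Element counts with a VLn encoding map to a pattern and back again.
  std::optional<unsigned> Pat = llvm::getSVEPredPatternFromNumElements(16);
  assert(Pat && llvm::getNumElementsFromSVEPredPattern(*Pat) == 16);
  // Counts without a VLn encoding yield std::nullopt.
  assert(!llvm::getSVEPredPatternFromNumElements(17));
}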
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
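For the two alignment helpers above, concrete values make the contracts obvious (standalone sketch, not code from this file).

#include "llvm/Support/Alignment.h"
#include <cassert>

static void alignmentSketch() {
  // alignTo rounds the size up to the next multiple of the alignment.
  assert(llvm::alignTo(10, llvm::Align(8)) == 16);
  assert(llvm::alignTo(16, llvm::Align(8)) == 16);
  // commonAlignment: the largest alignment guaranteed both by the base
  // alignment and by the given byte offset from it.
  assert(llvm::commonAlignment(llvm::Align(16), 8) == llvm::Align(8));
  assert(llvm::commonAlignment(llvm::Align(16), 32) == llvm::Align(16));
}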
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2039
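The remaining container helpers indexed nearby (is_contained, all_equal, erase_if, find_if) behave as sketched below on an illustrative vector; none of this is code from this file.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

static void containerHelperSketch() {
  llvm::SmallVector<int, 8> V = {1, 2, 2, 3, 4};
  assert(llvm::is_contained(V, 3));
  assert(llvm::all_equal({2, 2, 2}));
  assert(llvm::find_if(V, [](int X) { return X > 3; }) != V.end());
  // erase_if removes matching elements in place.
  llvm::erase_if(V, [](int X) { return X % 2 == 0; });
  assert(V.size() == 2); // {1, 3}
}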
static const MachineMemOperand::Flags MOStridedAccess
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Default
The result values are uniform if and only if all operands are uniform.
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
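Per the description above, createSequentialMask produces a run of consecutive indices starting at Start, padded with NumUndefs undef (-1) sentinels; for example (standalone sketch):

#include "llvm/Analysis/VectorUtils.h"

static void sequentialMaskSketch() {
  // Expected contents: {2, 3, 4, 5, -1, -1}
  llvm::SmallVector<int, 16> Mask =
      llvm::createSequentialMask(/*Start=*/2, /*NumInts=*/4, /*NumUndefs=*/2);
  (void)Mask;
}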
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
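The SDValue constant predicates indexed here (isNullConstant, isOneConstant, isAllOnesConstant, and friends) are typically used to guard DAG folds. The fold and function name below are purely illustrative, not taken from this file.

#include "llvm/CodeGen/SelectionDAGNodes.h"

static llvm::SDValue foldTrivialSelect(llvm::SDValue N) {
  using namespace llvm;
  if (N.getOpcode() != ISD::SELECT)
    return SDValue();
  SDValue Cond = N.getOperand(0), T = N.getOperand(1), F = N.getOperand(2);
  if (isNullConstant(Cond))
    return F; // select 0, t, f --> f
  if (isOneConstant(Cond) || isAllOnesConstant(Cond))
    return T; // select 1/-1, t, f --> t
  return SDValue();
}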
static const unsigned PerfectShuffleTable[6561+1]
@ Enable
Enable colors.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
Helper structure to be able to read SetCC information.
static unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:292
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
uint64_t getScalarStoreSize() const
Definition: ValueTypes.h:387
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:274
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:340
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:112
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:349
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:415
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:203
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition: ValueTypes.h:429
bool isScalableVT() const
Return true if the type is a scalable type.
Definition: ValueTypes.h:183
bool isFixedLengthVector() const
Definition: ValueTypes.h:177
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:282
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:208
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:173
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:438
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:198
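Several of the EVT queries above, applied to one concrete 128-bit vector type (standalone sketch; the context object and type choice are illustrative).

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

static void evtSketch() {
  llvm::LLVMContext Ctx;
  llvm::EVT V4F32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4);
  assert(V4F32.isVector() && V4F32.isFloatingPoint());
  assert(V4F32.is128BitVector() && V4F32.getVectorNumElements() == 4);
  // Same shape, integer elements of the same width: v4i32.
  llvm::EVT V4I32 = V4F32.changeVectorElementTypeToInteger();
  assert(V4I32.isInteger() && V4I32.bitsEq(V4F32));
  // Half the element count: v2f32, a 64-bit vector.
  llvm::EVT V2F32 = V4F32.getHalfNumVectorElementsVT(Ctx);
  assert(V2F32.is64BitVector() && V2F32.getVectorNumElements() == 2);
}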
Describes a register that needs to be forwarded from the prologue to a musttail call.
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:297
static KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
Definition: KnownBits.cpp:434
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:157
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
static KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
Definition: KnownBits.cpp:376
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:292
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:307
static KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
Definition: KnownBits.cpp:291
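The KnownBits operations above, applied to fully known (constant) inputs so the results can be checked directly (standalone sketch, not code from this file).

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>

static void knownBitsSketch() {
  llvm::KnownBits LHS = llvm::KnownBits::makeConstant(llvm::APInt(8, 12));
  llvm::KnownBits Amt = llvm::KnownBits::makeConstant(llvm::APInt(8, 2));
  llvm::KnownBits Shl = llvm::KnownBits::shl(LHS, Amt);   // 12 << 2
  llvm::KnownBits Lshr = llvm::KnownBits::lshr(LHS, Amt); // 12 >> 2
  assert(Shl.isConstant() && Shl.getConstant() == 48);
  assert(Lshr.isConstant() && Lshr.getConstant() == 3);
  // intersectWith keeps only the facts common to both operands.
  llvm::KnownBits Common = LHS.intersectWith(Shl);
  assert(Common.getBitWidth() == 8);
}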
Structure used to represent pair of argument number after call lowering and register used to transfer...
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
Definition: PredicateInfo.h:74
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::OutputArg, 32 > Outs
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64